2021-04-01 23:25:03

by Kees Cook

[permalink] [raw]
Subject: [PATCH v10 0/6] Optionally randomize kernel stack offset each syscall

Hi!

This should be good to go now. :)

v10:
- switch from "m" to "o" constraint (will)
- switch to raw_cpu_*() (tglx)
- hooked LKDTM test up to kselftest
v9: https://lore.kernel.org/lkml/[email protected]/
v8: https://lore.kernel.org/lkml/[email protected]/
v7: https://lore.kernel.org/lkml/[email protected]/
v6: https://lore.kernel.org/lkml/[email protected]/
v5: https://lore.kernel.org/lkml/[email protected]/
v4: https://lore.kernel.org/lkml/[email protected]/
v3: https://lore.kernel.org/lkml/[email protected]/
v2: https://lore.kernel.org/lkml/[email protected]/
rfc: https://lore.kernel.org/kernel-hardening/[email protected]/

This is a continuation and refactoring of Elena's earlier effort to add
kernel stack base offset randomization. In the time since the earlier
discussions, two attacks[1][2] were made public that depended on stack
determinism, so we're no longer in the position of "this is a good idea
but we have no examples of attacks". :)

Earlier discussions also devolved into debates on entropy sources, which
is mostly a red herring, given the already low entropy available due
to stack size. Regardless, entropy can be changed/improved separately
from this series as needed.

Earlier discussions also got stuck debating how much syscall overhead
was too much, but this is also a red herring since the feature itself
needs to be selectable at boot with no cost for those that don't want it:
this is solved here with static branches.

So, here is the latest improved version, made as arch-agnostic as
possible, with usage added for x86 and arm64. It also includes some small
static branch clean ups, and addresses some surprise performance issues
due to the stack canary[3].

Thanks!

-Kees

[1] https://a13xp0p0v.github.io/2020/02/15/CVE-2019-18683.html
[2] https://repositorio-aberto.up.pt/bitstream/10216/125357/2/374717.pdf
[3] https://lore.kernel.org/lkml/202003281520.A9BFF461@keescook/

Kees Cook (6):
jump_label: Provide CONFIG-driven build state defaults
init_on_alloc: Optimize static branches
stack: Optionally randomize kernel stack offset each syscall
x86/entry: Enable random_kstack_offset support
arm64: entry: Enable random_kstack_offset support
lkdtm: Add REPORT_STACK for checking stack offsets

.../admin-guide/kernel-parameters.txt | 11 ++++
Makefile | 4 ++
arch/Kconfig | 23 ++++++++
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/Makefile | 5 ++
arch/arm64/kernel/syscall.c | 16 ++++++
arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 3 ++
arch/x86/include/asm/entry-common.h | 16 ++++++
drivers/misc/lkdtm/bugs.c | 17 ++++++
drivers/misc/lkdtm/core.c | 1 +
drivers/misc/lkdtm/lkdtm.h | 1 +
include/linux/jump_label.h | 19 +++++++
include/linux/mm.h | 10 ++--
include/linux/randomize_kstack.h | 54 +++++++++++++++++++
init/main.c | 23 ++++++++
mm/page_alloc.c | 4 +-
mm/slab.h | 6 ++-
tools/testing/selftests/lkdtm/.gitignore | 1 +
tools/testing/selftests/lkdtm/Makefile | 1 +
.../testing/selftests/lkdtm/stack-entropy.sh | 36 +++++++++++++
21 files changed, 245 insertions(+), 8 deletions(-)
create mode 100644 include/linux/randomize_kstack.h
create mode 100755 tools/testing/selftests/lkdtm/stack-entropy.sh

--
2.25.1


2021-04-01 23:26:12

by Kees Cook

[permalink] [raw]
Subject: [PATCH v10 6/6] lkdtm: Add REPORT_STACK for checking stack offsets

For validating the stack offset behavior, report the offset from a given
process's first seen stack address. Add script to calculate the results
to the LKDTM kselftests.

Signed-off-by: Kees Cook <[email protected]>
---
drivers/misc/lkdtm/bugs.c | 17 +++++++++
drivers/misc/lkdtm/core.c | 1 +
drivers/misc/lkdtm/lkdtm.h | 1 +
tools/testing/selftests/lkdtm/.gitignore | 1 +
tools/testing/selftests/lkdtm/Makefile | 1 +
.../testing/selftests/lkdtm/stack-entropy.sh | 36 +++++++++++++++++++
6 files changed, 57 insertions(+)
create mode 100755 tools/testing/selftests/lkdtm/stack-entropy.sh

diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index 110f5a8538e9..0e8254d0cf0b 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -134,6 +134,23 @@ noinline void lkdtm_CORRUPT_STACK_STRONG(void)
__lkdtm_CORRUPT_STACK((void *)&data);
}

+static pid_t stack_pid;
+static unsigned long stack_addr;
+
+void lkdtm_REPORT_STACK(void)
+{
+ volatile uintptr_t magic;
+ pid_t pid = task_pid_nr(current);
+
+ if (pid != stack_pid) {
+ pr_info("Starting stack offset tracking for pid %d\n", pid);
+ stack_pid = pid;
+ stack_addr = (uintptr_t)&magic;
+ }
+
+ pr_info("Stack offset: %d\n", (int)(stack_addr - (uintptr_t)&magic));
+}
+
void lkdtm_UNALIGNED_LOAD_STORE_WRITE(void)
{
static u8 data[5] __attribute__((aligned(4))) = {1, 2, 3, 4, 5};
diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index b2aff4d87c01..8024b6a5cc7f 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -110,6 +110,7 @@ static const struct crashtype crashtypes[] = {
CRASHTYPE(EXHAUST_STACK),
CRASHTYPE(CORRUPT_STACK),
CRASHTYPE(CORRUPT_STACK_STRONG),
+ CRASHTYPE(REPORT_STACK),
CRASHTYPE(CORRUPT_LIST_ADD),
CRASHTYPE(CORRUPT_LIST_DEL),
CRASHTYPE(STACK_GUARD_PAGE_LEADING),
diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
index 5ae48c64df24..99f90d3e5e9c 100644
--- a/drivers/misc/lkdtm/lkdtm.h
+++ b/drivers/misc/lkdtm/lkdtm.h
@@ -17,6 +17,7 @@ void lkdtm_LOOP(void);
void lkdtm_EXHAUST_STACK(void);
void lkdtm_CORRUPT_STACK(void);
void lkdtm_CORRUPT_STACK_STRONG(void);
+void lkdtm_REPORT_STACK(void);
void lkdtm_UNALIGNED_LOAD_STORE_WRITE(void);
void lkdtm_SOFTLOCKUP(void);
void lkdtm_HARDLOCKUP(void);
diff --git a/tools/testing/selftests/lkdtm/.gitignore b/tools/testing/selftests/lkdtm/.gitignore
index f26212605b6b..d4b0be857deb 100644
--- a/tools/testing/selftests/lkdtm/.gitignore
+++ b/tools/testing/selftests/lkdtm/.gitignore
@@ -1,2 +1,3 @@
*.sh
!run.sh
+!stack-entropy.sh
diff --git a/tools/testing/selftests/lkdtm/Makefile b/tools/testing/selftests/lkdtm/Makefile
index 1bcc9ee990eb..c71109ceeb2d 100644
--- a/tools/testing/selftests/lkdtm/Makefile
+++ b/tools/testing/selftests/lkdtm/Makefile
@@ -5,6 +5,7 @@ include ../lib.mk

# NOTE: $(OUTPUT) won't get default value if used before lib.mk
TEST_FILES := tests.txt
+TEST_PROGS := stack-entropy.sh
TEST_GEN_PROGS = $(patsubst %,$(OUTPUT)/%.sh,$(shell awk '{print $$1}' tests.txt | sed -e 's/\#//'))
all: $(TEST_GEN_PROGS)

diff --git a/tools/testing/selftests/lkdtm/stack-entropy.sh b/tools/testing/selftests/lkdtm/stack-entropy.sh
new file mode 100755
index 000000000000..b1b8a5097cbb
--- /dev/null
+++ b/tools/testing/selftests/lkdtm/stack-entropy.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Measure kernel stack entropy by sampling via LKDTM's REPORT_STACK test.
+set -e
+samples="${1:-1000}"
+
+# Capture dmesg continuously since it may fill up depending on sample size.
+log=$(mktemp -t stack-entropy-XXXXXX)
+dmesg --follow >"$log" & pid=$!
+report=-1
+for i in $(seq 1 $samples); do
+ echo "REPORT_STACK" >/sys/kernel/debug/provoke-crash/DIRECT
+ if [ -t 1 ]; then
+ percent=$(( 100 * $i / $samples ))
+ if [ "$percent" -ne "$report" ]; then
+ /bin/echo -en "$percent%\r"
+ report="$percent"
+ fi
+ fi
+done
+kill "$pid"
+
+# Count unique offsets since last run.
+seen=$(tac "$log" | grep -m1 -B"$samples"0 'Starting stack offset' | \
+ grep 'Stack offset' | awk '{print $NF}' | sort | uniq -c | wc -l)
+bits=$(echo "obase=2; $seen" | bc | wc -L)
+echo "Bits of stack entropy: $bits"
+rm -f "$log"
+
+# We would expect any functional stack randomization to be at least 5 bits.
+if [ "$bits" -lt 5 ]; then
+ exit 1
+else
+ exit 0
+fi
--
2.25.1

2021-04-01 23:27:11

by Kees Cook

[permalink] [raw]
Subject: [PATCH v10 3/6] stack: Optionally randomize kernel stack offset each syscall

This provides the ability for architectures to enable kernel stack base
address offset randomization. This feature is controlled by the boot
param "randomize_kstack_offset=on/off", with its default value set by
CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT.

This feature is based on the original idea from the last public release
of PaX's RANDKSTACK feature: https://pax.grsecurity.net/docs/randkstack.txt
All the credit for the original idea goes to the PaX team. Note that
the design and implementation of this upstream randomize_kstack_offset
feature differs greatly from the RANDKSTACK feature (see below).

Reasoning for the feature:

This feature aims to make harder the various stack-based attacks that
rely on deterministic stack structure. We have had many such attacks in
past (just to name few):

https://jon.oberheide.org/files/infiltrate12-thestackisback.pdf
https://jon.oberheide.org/files/stackjacking-infiltrate11.pdf
https://googleprojectzero.blogspot.com/2016/06/exploiting-recursion-in-linux-kernel_20.html

As Linux kernel stack protections have been constantly improving
(vmap-based stack allocation with guard pages, removal of thread_info,
STACKLEAK), attackers have had to find new ways for their exploits
to work. They have done so, continuing to rely on the kernel's stack
determinism, in situations where VMAP_STACK and THREAD_INFO_IN_TASK_STRUCT
were not relevant. For example, the following recent attacks would have
been hampered if the stack offset was non-deterministic between syscalls:

https://repositorio-aberto.up.pt/bitstream/10216/125357/2/374717.pdf
(page 70: targeting the pt_regs copy with linear stack overflow)

https://a13xp0p0v.github.io/2020/02/15/CVE-2019-18683.html
(leaked stack address from one syscall as a target during next syscall)

The main idea is that since the stack offset is randomized on each system
call, it is harder for an attack to reliably land in any particular place
on the thread stack, even with address exposures, as the stack base will
change on the next syscall. Also, since randomization is performed after
placing pt_regs, the ptrace-based approach[1] to discover the randomized
offset during a long-running syscall should not be possible.

Design description:

During most of the kernel's execution, it runs on the "thread stack",
which is pretty deterministic in its structure: it is fixed in size,
and on every entry from userspace to kernel on a syscall the thread
stack starts construction from an address fetched from the per-cpu
cpu_current_top_of_stack variable. The first element to be pushed to the
thread stack is the pt_regs struct that stores all required CPU registers
and syscall parameters. Finally the specific syscall function is called,
with the stack being used as the kernel executes the resulting request.

The goal of randomize_kstack_offset feature is to add a random offset
after the pt_regs has been pushed to the stack and before the rest of the
thread stack is used during the syscall processing, and to change it every
time a process issues a syscall. The source of randomness is currently
architecture-defined (but x86 is using the low byte of rdtsc()). Future
improvements for different entropy sources is possible, but out of scope
for this patch. Further more, to add more unpredictability, new offsets
are chosen at the end of syscalls (the timing of which should be less
easy to measure from userspace than at syscall entry time), and stored
in a per-CPU variable, so that the life of the value does not stay
explicitly tied to a single task.

As suggested by Andy Lutomirski, the offset is added using alloca()
and an empty asm() statement with an output constraint, since it avoids
changes to assembly syscall entry code, to the unwinder, and provides
correct stack alignment as defined by the compiler.

In order to make this available by default with zero performance impact
for those that don't want it, it is boot-time selectable with static
branches. This way, if the overhead is not wanted, it can just be
left turned off with no performance impact.

The generated assembly for x86_64 with GCC looks like this:

...
ffffffff81003977: 65 8b 05 02 ea 00 7f mov %gs:0x7f00ea02(%rip),%eax
# 12380 <kstack_offset>
ffffffff8100397e: 25 ff 03 00 00 and $0x3ff,%eax
ffffffff81003983: 48 83 c0 0f add $0xf,%rax
ffffffff81003987: 25 f8 07 00 00 and $0x7f8,%eax
ffffffff8100398c: 48 29 c4 sub %rax,%rsp
ffffffff8100398f: 48 8d 44 24 0f lea 0xf(%rsp),%rax
ffffffff81003994: 48 83 e0 f0 and $0xfffffffffffffff0,%rax
...

As a result of the above stack alignment, this patch introduces about
5 bits of randomness after pt_regs is spilled to the thread stack on
x86_64, and 6 bits on x86_32 (since its has 1 fewer bit required for
stack alignment). The amount of entropy could be adjusted based on how
much of the stack space we wish to trade for security.

My measure of syscall performance overhead (on x86_64):

lmbench: /usr/lib/lmbench/bin/x86_64-linux-gnu/lat_syscall -N 10000 null
randomize_kstack_offset=y Simple syscall: 0.7082 microseconds
randomize_kstack_offset=n Simple syscall: 0.7016 microseconds

So, roughly 0.9% overhead growth for a no-op syscall, which is very
manageable. And for people that don't want this, it's off by default.

There are two gotchas with using the alloca() trick. First,
compilers that have Stack Clash protection (-fstack-clash-protection)
enabled by default (e.g. Ubuntu[3]) add pagesize stack probes to
any dynamic stack allocations. While the randomization offset is
always less than a page, the resulting assembly would still contain
(unreachable!) probing routines, bloating the resulting assembly. To
avoid this, -fno-stack-clash-protection is unconditionally added to
the kernel Makefile since this is the only dynamic stack allocation in
the kernel (now that VLAs have been removed) and it is provably safe
from Stack Clash style attacks.

The second gotcha with alloca() is a negative interaction with
-fstack-protector*, in that it sees the alloca() as an array allocation,
which triggers the unconditional addition of the stack canary function
pre/post-amble which slows down syscalls regardless of the static
branch. In order to avoid adding this unneeded check and its associated
performance impact, architectures need to carefully remove uses of
-fstack-protector-strong (or -fstack-protector) in the compilation units
that use the add_random_kstack() macro and to audit the resulting stack
mitigation coverage (to make sure no desired coverage disappears). No
change is visible for this on x86 because the stack protector is already
unconditionally disabled for the compilation unit, but the change is
required on arm64. There is, unfortunately, no attribute that can be
used to disable stack protector for specific functions.

Comparison to PaX RANDKSTACK feature:

The RANDKSTACK feature randomizes the location of the stack start
(cpu_current_top_of_stack), i.e. including the location of pt_regs
structure itself on the stack. Initially this patch followed the same
approach, but during the recent discussions[2], it has been determined
to be of a little value since, if ptrace functionality is available for
an attacker, they can use PTRACE_PEEKUSR/PTRACE_POKEUSR to read/write
different offsets in the pt_regs struct, observe the cache behavior of
the pt_regs accesses, and figure out the random stack offset. Another
difference is that the random offset is stored in a per-cpu variable,
rather than having it be per-thread. As a result, these implementations
differ a fair bit in their implementation details and results, though
obviously the intent is similar.

[1] https://lore.kernel.org/kernel-hardening/2236FBA76BA1254E88B949DDB74E612BA4BC57C1@IRSMSX102.ger.corp.intel.com/
[2] https://lore.kernel.org/kernel-hardening/[email protected]/
[3] https://lists.ubuntu.com/archives/ubuntu-devel/2019-June/040741.html

Co-developed-by: Elena Reshetova <[email protected]>
Signed-off-by: Elena Reshetova <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Reviewed-by: Thomas Gleixner <[email protected]>
Link: https://lore.kernel.org/lkml/[email protected]/
Signed-off-by: Kees Cook <[email protected]>
---
.../admin-guide/kernel-parameters.txt | 11 ++++
Makefile | 4 ++
arch/Kconfig | 23 ++++++++
include/linux/randomize_kstack.h | 54 +++++++++++++++++++
init/main.c | 23 ++++++++
5 files changed, 115 insertions(+)
create mode 100644 include/linux/randomize_kstack.h

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 04545725f187..bee8644a192e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4061,6 +4061,17 @@
fully seed the kernel's CRNG. Default is controlled
by CONFIG_RANDOM_TRUST_CPU.

+ randomize_kstack_offset=
+ [KNL] Enable or disable kernel stack offset
+ randomization, which provides roughly 5 bits of
+ entropy, frustrating memory corruption attacks
+ that depend on stack address determinism or
+ cross-syscall address exposures. This is only
+ available on architectures that have defined
+ CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET.
+ Format: <bool> (1/Y/y=enable, 0/N/n=disable)
+ Default is CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT.
+
ras=option[,option,...] [KNL] RAS-specific options

cec_disable [X86]
diff --git a/Makefile b/Makefile
index 31dcdb3d61fa..8a959a264588 100644
--- a/Makefile
+++ b/Makefile
@@ -811,6 +811,10 @@ KBUILD_CFLAGS += -ftrivial-auto-var-init=zero
KBUILD_CFLAGS += -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang
endif

+# While VLAs have been removed, GCC produces unreachable stack probes
+# for the randomize_kstack_offset feature. Disable it for all compilers.
+KBUILD_CFLAGS += $(call cc-option, -fno-stack-clash-protection)
+
DEBUG_CFLAGS :=

# Workaround for GCC versions < 5.0
diff --git a/arch/Kconfig b/arch/Kconfig
index 2bb30673d8e6..4fe6b047fcbc 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1055,6 +1055,29 @@ config VMAP_STACK
backing virtual mappings with real shadow memory, and KASAN_VMALLOC
must be enabled.

+config HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+ def_bool n
+ help
+ An arch should select this symbol if it can support kernel stack
+ offset randomization with calls to add_random_kstack_offset()
+ during syscall entry and choose_random_kstack_offset() during
+ syscall exit. Careful removal of -fstack-protector-strong and
+ -fstack-protector should also be applied to the entry code and
+ closely examined, as the artificial stack bump looks like an array
+ to the compiler, so it will attempt to add canary checks regardless
+ of the static branch state.
+
+config RANDOMIZE_KSTACK_OFFSET_DEFAULT
+ bool "Randomize kernel stack offset on syscall entry"
+ depends on HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+ help
+ The kernel stack offset can be randomized (after pt_regs) by
+ roughly 5 bits of entropy, frustrating memory corruption
+ attacks that depend on stack address determinism or
+ cross-syscall address exposures. This feature is controlled
+ by kernel boot param "randomize_kstack_offset=on/off", and this
+ config chooses the default boot state.
+
config ARCH_OPTIONAL_KERNEL_RWX
def_bool n

diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h
new file mode 100644
index 000000000000..fd80fab663a9
--- /dev/null
+++ b/include/linux/randomize_kstack.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_RANDOMIZE_KSTACK_H
+#define _LINUX_RANDOMIZE_KSTACK_H
+
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+#include <linux/percpu-defs.h>
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
+ randomize_kstack_offset);
+DECLARE_PER_CPU(u32, kstack_offset);
+
+/*
+ * Do not use this anywhere else in the kernel. This is used here because
+ * it provides an arch-agnostic way to grow the stack with correct
+ * alignment. Also, since this use is being explicitly masked to a max of
+ * 10 bits, stack-clash style attacks are unlikely. For more details see
+ * "VLAs" in Documentation/process/deprecated.rst
+ */
+void *__builtin_alloca(size_t size);
+/*
+ * Use, at most, 10 bits of entropy. We explicitly cap this to keep the
+ * "VLA" from being unbounded (see above). 10 bits leaves enough room for
+ * per-arch offset masks to reduce entropy (by removing higher bits, since
+ * high entropy may overly constrain usable stack space), and for
+ * compiler/arch-specific stack alignment to remove the lower bits.
+ */
+#define KSTACK_OFFSET_MAX(x) ((x) & 0x3FF)
+
+/*
+ * These macros must be used during syscall entry when interrupts and
+ * preempt are disabled, and after user registers have been stored to
+ * the stack.
+ */
+#define add_random_kstack_offset() do { \
+ if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \
+ &randomize_kstack_offset)) { \
+ u32 offset = raw_cpu_read(kstack_offset); \
+ u8 *ptr = __builtin_alloca(KSTACK_OFFSET_MAX(offset)); \
+ /* Keep allocation even after "ptr" loses scope. */ \
+ asm volatile("" : "=o"(*ptr) :: "memory"); \
+ } \
+} while (0)
+
+#define choose_random_kstack_offset(rand) do { \
+ if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \
+ &randomize_kstack_offset)) { \
+ u32 offset = raw_cpu_read(kstack_offset); \
+ offset ^= (rand); \
+ raw_cpu_write(kstack_offset, offset); \
+ } \
+} while (0)
+
+#endif
diff --git a/init/main.c b/init/main.c
index 53b278845b88..f498aac26e8c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -844,6 +844,29 @@ static void __init mm_init(void)
pti_init();
}

+#ifdef CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
+ randomize_kstack_offset);
+DEFINE_PER_CPU(u32, kstack_offset);
+
+static int __init early_randomize_kstack_offset(char *buf)
+{
+ int ret;
+ bool bool_result;
+
+ ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
+
+ if (bool_result)
+ static_branch_enable(&randomize_kstack_offset);
+ else
+ static_branch_disable(&randomize_kstack_offset);
+ return 0;
+}
+early_param("randomize_kstack_offset", early_randomize_kstack_offset);
+#endif
+
void __init __weak arch_call_rest_init(void)
{
rest_init();
--
2.25.1

2021-04-01 23:27:45

by Kees Cook

[permalink] [raw]
Subject: [PATCH v10 5/6] arm64: entry: Enable random_kstack_offset support

Allow for a randomized stack offset on a per-syscall basis, with roughly
5 bits of entropy. (And include AAPCS rationale AAPCS thanks to Mark
Rutland.)

In order to avoid unconditional stack canaries on syscall entry (due to
the use of alloca()), also disable stack protector to avoid triggering
needless checks and slowing down the entry path. As there is no general
way to control stack protector coverage with a function attribute[1],
this must be disabled at the compilation unit level. This isn't a problem
here, though, since stack protector was not triggered before: examining
the resulting syscall.o, there are no changes in canary coverage (none
before, none now).

[1] a working __attribute__((no_stack_protector)) has been added to GCC
and Clang but has not been released in any version yet:
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=346b302d09c1e6db56d9fe69048acb32fbb97845
https://reviews.llvm.org/rG4fbf84c1732fca596ad1d6e96015e19760eb8a9b

Signed-off-by: Kees Cook <[email protected]>
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/Makefile | 5 +++++
arch/arm64/kernel/syscall.c | 16 ++++++++++++++++
3 files changed, 22 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1f212b47a48a..2d0e5f544429 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -146,6 +146,7 @@ config ARM64
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_PFN_VALID
select HAVE_ARCH_PREL32_RELOCATIONS
+ select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index ed65576ce710..6cc97730790e 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -9,6 +9,11 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE)

+# Remove stack protector to avoid triggering unneeded stack canary
+# checks due to randomize_kstack_offset.
+CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong
+CFLAGS_syscall.o += -fno-stack-protector
+
# Object file lists.
obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
entry-common.o entry-fpsimd.o process.o ptrace.o \
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index b9cf12b271d7..263d6c1a525f 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -5,6 +5,7 @@
#include <linux/errno.h>
#include <linux/nospec.h>
#include <linux/ptrace.h>
+#include <linux/randomize_kstack.h>
#include <linux/syscalls.h>

#include <asm/daifflags.h>
@@ -43,6 +44,8 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
{
long ret;

+ add_random_kstack_offset();
+
if (scno < sc_nr) {
syscall_fn_t syscall_fn;
syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
@@ -55,6 +58,19 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
ret = lower_32_bits(ret);

regs->regs[0] = ret;
+
+ /*
+ * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(),
+ * but not enough for arm64 stack utilization comfort. To keep
+ * reasonable stack head room, reduce the maximum offset to 9 bits.
+ *
+ * The actual entropy will be further reduced by the compiler when
+ * applying stack alignment constraints: the AAPCS mandates a
+ * 16-byte (i.e. 4-bit) aligned SP at function boundaries.
+ *
+ * The resulting 5 bits of entropy is seen in SP[8:4].
+ */
+ choose_random_kstack_offset(get_random_int() & 0x1FF);
}

static inline bool has_syscall_work(unsigned long flags)
--
2.25.1

2021-04-07 22:07:00

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH v10 3/6] stack: Optionally randomize kernel stack offset each syscall

On Thu, Apr 01, 2021 at 04:23:44PM -0700, Kees Cook wrote:
> This provides the ability for architectures to enable kernel stack base
> address offset randomization. This feature is controlled by the boot
> param "randomize_kstack_offset=on/off", with its default value set by
> CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT.
>
> This feature is based on the original idea from the last public release
> of PaX's RANDKSTACK feature: https://pax.grsecurity.net/docs/randkstack.txt
> All the credit for the original idea goes to the PaX team. Note that
> the design and implementation of this upstream randomize_kstack_offset
> feature differs greatly from the RANDKSTACK feature (see below).

[...]

> diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h
> new file mode 100644
> index 000000000000..fd80fab663a9
> --- /dev/null
> +++ b/include/linux/randomize_kstack.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef _LINUX_RANDOMIZE_KSTACK_H
> +#define _LINUX_RANDOMIZE_KSTACK_H
> +
> +#include <linux/kernel.h>
> +#include <linux/jump_label.h>
> +#include <linux/percpu-defs.h>
> +
> +DECLARE_STATIC_KEY_MAYBE(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
> + randomize_kstack_offset);
> +DECLARE_PER_CPU(u32, kstack_offset);
> +
> +/*
> + * Do not use this anywhere else in the kernel. This is used here because
> + * it provides an arch-agnostic way to grow the stack with correct
> + * alignment. Also, since this use is being explicitly masked to a max of
> + * 10 bits, stack-clash style attacks are unlikely. For more details see
> + * "VLAs" in Documentation/process/deprecated.rst
> + */
> +void *__builtin_alloca(size_t size);
> +/*
> + * Use, at most, 10 bits of entropy. We explicitly cap this to keep the
> + * "VLA" from being unbounded (see above). 10 bits leaves enough room for
> + * per-arch offset masks to reduce entropy (by removing higher bits, since
> + * high entropy may overly constrain usable stack space), and for
> + * compiler/arch-specific stack alignment to remove the lower bits.
> + */
> +#define KSTACK_OFFSET_MAX(x) ((x) & 0x3FF)
> +
> +/*
> + * These macros must be used during syscall entry when interrupts and
> + * preempt are disabled, and after user registers have been stored to
> + * the stack.
> + */

This comment is out of date, as this is called from preemptible context on
arm64. Does that matter in terms of offset randomness?

Will

2021-04-07 22:07:31

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH v10 5/6] arm64: entry: Enable random_kstack_offset support

On Thu, Apr 01, 2021 at 04:23:46PM -0700, Kees Cook wrote:
> Allow for a randomized stack offset on a per-syscall basis, with roughly
> 5 bits of entropy. (And include AAPCS rationale AAPCS thanks to Mark
> Rutland.)
>
> In order to avoid unconditional stack canaries on syscall entry (due to
> the use of alloca()), also disable stack protector to avoid triggering
> needless checks and slowing down the entry path. As there is no general
> way to control stack protector coverage with a function attribute[1],
> this must be disabled at the compilation unit level. This isn't a problem
> here, though, since stack protector was not triggered before: examining
> the resulting syscall.o, there are no changes in canary coverage (none
> before, none now).
>
> [1] a working __attribute__((no_stack_protector)) has been added to GCC
> and Clang but has not been released in any version yet:
> https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=346b302d09c1e6db56d9fe69048acb32fbb97845
> https://reviews.llvm.org/rG4fbf84c1732fca596ad1d6e96015e19760eb8a9b
>
> Signed-off-by: Kees Cook <[email protected]>
> ---
> arch/arm64/Kconfig | 1 +
> arch/arm64/kernel/Makefile | 5 +++++
> arch/arm64/kernel/syscall.c | 16 ++++++++++++++++
> 3 files changed, 22 insertions(+)

Acked-by: Will Deacon <[email protected]>

Will

2021-04-08 12:15:01

by tip-bot2 for Haifeng Xu

[permalink] [raw]
Subject: [tip: x86/entry] lkdtm: Add REPORT_STACK for checking stack offsets

The following commit has been merged into the x86/entry branch of tip:

Commit-ID: 68ef8735d253f3d840082b78f996bf2d89ee6e5f
Gitweb: https://git.kernel.org/tip/68ef8735d253f3d840082b78f996bf2d89ee6e5f
Author: Kees Cook <[email protected]>
AuthorDate: Thu, 01 Apr 2021 16:23:47 -07:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Thu, 08 Apr 2021 14:05:20 +02:00

lkdtm: Add REPORT_STACK for checking stack offsets

For validating the stack offset behavior, report the offset from a given
process's first seen stack address. Add s script to calculate the results
to the LKDTM kselftests.

Signed-off-by: Kees Cook <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
drivers/misc/lkdtm/bugs.c | 17 ++++++++-
drivers/misc/lkdtm/core.c | 1 +-
drivers/misc/lkdtm/lkdtm.h | 1 +-
tools/testing/selftests/lkdtm/.gitignore | 1 +-
tools/testing/selftests/lkdtm/Makefile | 1 +-
tools/testing/selftests/lkdtm/stack-entropy.sh | 36 +++++++++++++++++-
6 files changed, 57 insertions(+)
create mode 100755 tools/testing/selftests/lkdtm/stack-entropy.sh

diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index 110f5a8..0e8254d 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -134,6 +134,23 @@ noinline void lkdtm_CORRUPT_STACK_STRONG(void)
__lkdtm_CORRUPT_STACK((void *)&data);
}

+static pid_t stack_pid;
+static unsigned long stack_addr;
+
+void lkdtm_REPORT_STACK(void)
+{
+ volatile uintptr_t magic;
+ pid_t pid = task_pid_nr(current);
+
+ if (pid != stack_pid) {
+ pr_info("Starting stack offset tracking for pid %d\n", pid);
+ stack_pid = pid;
+ stack_addr = (uintptr_t)&magic;
+ }
+
+ pr_info("Stack offset: %d\n", (int)(stack_addr - (uintptr_t)&magic));
+}
+
void lkdtm_UNALIGNED_LOAD_STORE_WRITE(void)
{
static u8 data[5] __attribute__((aligned(4))) = {1, 2, 3, 4, 5};
diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index b2aff4d..8024b6a 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -110,6 +110,7 @@ static const struct crashtype crashtypes[] = {
CRASHTYPE(EXHAUST_STACK),
CRASHTYPE(CORRUPT_STACK),
CRASHTYPE(CORRUPT_STACK_STRONG),
+ CRASHTYPE(REPORT_STACK),
CRASHTYPE(CORRUPT_LIST_ADD),
CRASHTYPE(CORRUPT_LIST_DEL),
CRASHTYPE(STACK_GUARD_PAGE_LEADING),
diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
index 5ae48c6..99f90d3 100644
--- a/drivers/misc/lkdtm/lkdtm.h
+++ b/drivers/misc/lkdtm/lkdtm.h
@@ -17,6 +17,7 @@ void lkdtm_LOOP(void);
void lkdtm_EXHAUST_STACK(void);
void lkdtm_CORRUPT_STACK(void);
void lkdtm_CORRUPT_STACK_STRONG(void);
+void lkdtm_REPORT_STACK(void);
void lkdtm_UNALIGNED_LOAD_STORE_WRITE(void);
void lkdtm_SOFTLOCKUP(void);
void lkdtm_HARDLOCKUP(void);
diff --git a/tools/testing/selftests/lkdtm/.gitignore b/tools/testing/selftests/lkdtm/.gitignore
index f262126..d4b0be8 100644
--- a/tools/testing/selftests/lkdtm/.gitignore
+++ b/tools/testing/selftests/lkdtm/.gitignore
@@ -1,2 +1,3 @@
*.sh
!run.sh
+!stack-entropy.sh
diff --git a/tools/testing/selftests/lkdtm/Makefile b/tools/testing/selftests/lkdtm/Makefile
index 1bcc9ee..c71109c 100644
--- a/tools/testing/selftests/lkdtm/Makefile
+++ b/tools/testing/selftests/lkdtm/Makefile
@@ -5,6 +5,7 @@ include ../lib.mk

# NOTE: $(OUTPUT) won't get default value if used before lib.mk
TEST_FILES := tests.txt
+TEST_PROGS := stack-entropy.sh
TEST_GEN_PROGS = $(patsubst %,$(OUTPUT)/%.sh,$(shell awk '{print $$1}' tests.txt | sed -e 's/\#//'))
all: $(TEST_GEN_PROGS)

diff --git a/tools/testing/selftests/lkdtm/stack-entropy.sh b/tools/testing/selftests/lkdtm/stack-entropy.sh
new file mode 100755
index 0000000..b1b8a50
--- /dev/null
+++ b/tools/testing/selftests/lkdtm/stack-entropy.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Measure kernel stack entropy by sampling via LKDTM's REPORT_STACK test.
+set -e
+samples="${1:-1000}"
+
+# Capture dmesg continuously since it may fill up depending on sample size.
+log=$(mktemp -t stack-entropy-XXXXXX)
+dmesg --follow >"$log" & pid=$!
+report=-1
+for i in $(seq 1 $samples); do
+ echo "REPORT_STACK" >/sys/kernel/debug/provoke-crash/DIRECT
+ if [ -t 1 ]; then
+ percent=$(( 100 * $i / $samples ))
+ if [ "$percent" -ne "$report" ]; then
+ /bin/echo -en "$percent%\r"
+ report="$percent"
+ fi
+ fi
+done
+kill "$pid"
+
+# Count unique offsets since last run.
+seen=$(tac "$log" | grep -m1 -B"$samples"0 'Starting stack offset' | \
+ grep 'Stack offset' | awk '{print $NF}' | sort | uniq -c | wc -l)
+bits=$(echo "obase=2; $seen" | bc | wc -L)
+echo "Bits of stack entropy: $bits"
+rm -f "$log"
+
+# We would expect any functional stack randomization to be at least 5 bits.
+if [ "$bits" -lt 5 ]; then
+ exit 1
+else
+ exit 0
+fi

2021-04-08 12:16:10

by tip-bot2 for Haifeng Xu

[permalink] [raw]
Subject: [tip: x86/entry] stack: Optionally randomize kernel stack offset each syscall

The following commit has been merged into the x86/entry branch of tip:

Commit-ID: 39218ff4c625dbf2e68224024fe0acaa60bcd51a
Gitweb: https://git.kernel.org/tip/39218ff4c625dbf2e68224024fe0acaa60bcd51a
Author: Kees Cook <[email protected]>
AuthorDate: Thu, 01 Apr 2021 16:23:44 -07:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Thu, 08 Apr 2021 14:05:19 +02:00

stack: Optionally randomize kernel stack offset each syscall

This provides the ability for architectures to enable kernel stack base
address offset randomization. This feature is controlled by the boot
param "randomize_kstack_offset=on/off", with its default value set by
CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT.

This feature is based on the original idea from the last public release
of PaX's RANDKSTACK feature: https://pax.grsecurity.net/docs/randkstack.txt
All the credit for the original idea goes to the PaX team. Note that
the design and implementation of this upstream randomize_kstack_offset
feature differs greatly from the RANDKSTACK feature (see below).

Reasoning for the feature:

This feature aims to make harder the various stack-based attacks that
rely on deterministic stack structure. We have had many such attacks in
past (just to name few):

https://jon.oberheide.org/files/infiltrate12-thestackisback.pdf
https://jon.oberheide.org/files/stackjacking-infiltrate11.pdf
https://googleprojectzero.blogspot.com/2016/06/exploiting-recursion-in-linux-kernel_20.html

As Linux kernel stack protections have been constantly improving
(vmap-based stack allocation with guard pages, removal of thread_info,
STACKLEAK), attackers have had to find new ways for their exploits
to work. They have done so, continuing to rely on the kernel's stack
determinism, in situations where VMAP_STACK and THREAD_INFO_IN_TASK_STRUCT
were not relevant. For example, the following recent attacks would have
been hampered if the stack offset was non-deterministic between syscalls:

https://repositorio-aberto.up.pt/bitstream/10216/125357/2/374717.pdf
(page 70: targeting the pt_regs copy with linear stack overflow)

https://a13xp0p0v.github.io/2020/02/15/CVE-2019-18683.html
(leaked stack address from one syscall as a target during next syscall)

The main idea is that since the stack offset is randomized on each system
call, it is harder for an attack to reliably land in any particular place
on the thread stack, even with address exposures, as the stack base will
change on the next syscall. Also, since randomization is performed after
placing pt_regs, the ptrace-based approach[1] to discover the randomized
offset during a long-running syscall should not be possible.

Design description:

During most of the kernel's execution, it runs on the "thread stack",
which is pretty deterministic in its structure: it is fixed in size,
and on every entry from userspace to kernel on a syscall the thread
stack starts construction from an address fetched from the per-cpu
cpu_current_top_of_stack variable. The first element to be pushed to the
thread stack is the pt_regs struct that stores all required CPU registers
and syscall parameters. Finally the specific syscall function is called,
with the stack being used as the kernel executes the resulting request.

The goal of randomize_kstack_offset feature is to add a random offset
after the pt_regs has been pushed to the stack and before the rest of the
thread stack is used during the syscall processing, and to change it every
time a process issues a syscall. The source of randomness is currently
architecture-defined (but x86 is using the low byte of rdtsc()). Future
improvements for different entropy sources is possible, but out of scope
for this patch. Further more, to add more unpredictability, new offsets
are chosen at the end of syscalls (the timing of which should be less
easy to measure from userspace than at syscall entry time), and stored
in a per-CPU variable, so that the life of the value does not stay
explicitly tied to a single task.

As suggested by Andy Lutomirski, the offset is added using alloca()
and an empty asm() statement with an output constraint, since it avoids
changes to assembly syscall entry code, to the unwinder, and provides
correct stack alignment as defined by the compiler.

In order to make this available by default with zero performance impact
for those that don't want it, it is boot-time selectable with static
branches. This way, if the overhead is not wanted, it can just be
left turned off with no performance impact.

The generated assembly for x86_64 with GCC looks like this:

...
ffffffff81003977: 65 8b 05 02 ea 00 7f mov %gs:0x7f00ea02(%rip),%eax
# 12380 <kstack_offset>
ffffffff8100397e: 25 ff 03 00 00 and $0x3ff,%eax
ffffffff81003983: 48 83 c0 0f add $0xf,%rax
ffffffff81003987: 25 f8 07 00 00 and $0x7f8,%eax
ffffffff8100398c: 48 29 c4 sub %rax,%rsp
ffffffff8100398f: 48 8d 44 24 0f lea 0xf(%rsp),%rax
ffffffff81003994: 48 83 e0 f0 and $0xfffffffffffffff0,%rax
...

As a result of the above stack alignment, this patch introduces about
5 bits of randomness after pt_regs is spilled to the thread stack on
x86_64, and 6 bits on x86_32 (since its has 1 fewer bit required for
stack alignment). The amount of entropy could be adjusted based on how
much of the stack space we wish to trade for security.

My measure of syscall performance overhead (on x86_64):

lmbench: /usr/lib/lmbench/bin/x86_64-linux-gnu/lat_syscall -N 10000 null
randomize_kstack_offset=y Simple syscall: 0.7082 microseconds
randomize_kstack_offset=n Simple syscall: 0.7016 microseconds

So, roughly 0.9% overhead growth for a no-op syscall, which is very
manageable. And for people that don't want this, it's off by default.

There are two gotchas with using the alloca() trick. First,
compilers that have Stack Clash protection (-fstack-clash-protection)
enabled by default (e.g. Ubuntu[3]) add pagesize stack probes to
any dynamic stack allocations. While the randomization offset is
always less than a page, the resulting assembly would still contain
(unreachable!) probing routines, bloating the resulting assembly. To
avoid this, -fno-stack-clash-protection is unconditionally added to
the kernel Makefile since this is the only dynamic stack allocation in
the kernel (now that VLAs have been removed) and it is provably safe
from Stack Clash style attacks.

The second gotcha with alloca() is a negative interaction with
-fstack-protector*, in that it sees the alloca() as an array allocation,
which triggers the unconditional addition of the stack canary function
pre/post-amble which slows down syscalls regardless of the static
branch. In order to avoid adding this unneeded check and its associated
performance impact, architectures need to carefully remove uses of
-fstack-protector-strong (or -fstack-protector) in the compilation units
that use the add_random_kstack() macro and to audit the resulting stack
mitigation coverage (to make sure no desired coverage disappears). No
change is visible for this on x86 because the stack protector is already
unconditionally disabled for the compilation unit, but the change is
required on arm64. There is, unfortunately, no attribute that can be
used to disable stack protector for specific functions.

Comparison to PaX RANDKSTACK feature:

The RANDKSTACK feature randomizes the location of the stack start
(cpu_current_top_of_stack), i.e. including the location of pt_regs
structure itself on the stack. Initially this patch followed the same
approach, but during the recent discussions[2], it has been determined
to be of a little value since, if ptrace functionality is available for
an attacker, they can use PTRACE_PEEKUSR/PTRACE_POKEUSR to read/write
different offsets in the pt_regs struct, observe the cache behavior of
the pt_regs accesses, and figure out the random stack offset. Another
difference is that the random offset is stored in a per-cpu variable,
rather than having it be per-thread. As a result, these implementations
differ a fair bit in their implementation details and results, though
obviously the intent is similar.

[1] https://lore.kernel.org/kernel-hardening/2236FBA76BA1254E88B949DDB74E612BA4BC57C1@IRSMSX102.ger.corp.intel.com/
[2] https://lore.kernel.org/kernel-hardening/[email protected]/
[3] https://lists.ubuntu.com/archives/ubuntu-devel/2019-June/040741.html

Co-developed-by: Elena Reshetova <[email protected]>
Signed-off-by: Elena Reshetova <[email protected]>
Signed-off-by: Kees Cook <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Thomas Gleixner <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
Documentation/admin-guide/kernel-parameters.txt | 11 +++-
Makefile | 4 +-
arch/Kconfig | 23 +++++++-
include/linux/randomize_kstack.h | 54 ++++++++++++++++-
init/main.c | 23 +++++++-
5 files changed, 115 insertions(+)
create mode 100644 include/linux/randomize_kstack.h

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0454572..bee8644 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4061,6 +4061,17 @@
fully seed the kernel's CRNG. Default is controlled
by CONFIG_RANDOM_TRUST_CPU.

+ randomize_kstack_offset=
+ [KNL] Enable or disable kernel stack offset
+ randomization, which provides roughly 5 bits of
+ entropy, frustrating memory corruption attacks
+ that depend on stack address determinism or
+ cross-syscall address exposures. This is only
+ available on architectures that have defined
+ CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET.
+ Format: <bool> (1/Y/y=enable, 0/N/n=disable)
+ Default is CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT.
+
ras=option[,option,...] [KNL] RAS-specific options

cec_disable [X86]
diff --git a/Makefile b/Makefile
index cc77fd4..d3bf503 100644
--- a/Makefile
+++ b/Makefile
@@ -813,6 +813,10 @@ KBUILD_CFLAGS += -ftrivial-auto-var-init=zero
KBUILD_CFLAGS += -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang
endif

+# While VLAs have been removed, GCC produces unreachable stack probes
+# for the randomize_kstack_offset feature. Disable it for all compilers.
+KBUILD_CFLAGS += $(call cc-option, -fno-stack-clash-protection)
+
DEBUG_CFLAGS :=

# Workaround for GCC versions < 5.0
diff --git a/arch/Kconfig b/arch/Kconfig
index ecfd352..6b11c82 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1054,6 +1054,29 @@ config VMAP_STACK
backing virtual mappings with real shadow memory, and KASAN_VMALLOC
must be enabled.

+config HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+ def_bool n
+ help
+ An arch should select this symbol if it can support kernel stack
+ offset randomization with calls to add_random_kstack_offset()
+ during syscall entry and choose_random_kstack_offset() during
+ syscall exit. Careful removal of -fstack-protector-strong and
+ -fstack-protector should also be applied to the entry code and
+ closely examined, as the artificial stack bump looks like an array
+ to the compiler, so it will attempt to add canary checks regardless
+ of the static branch state.
+
+config RANDOMIZE_KSTACK_OFFSET_DEFAULT
+ bool "Randomize kernel stack offset on syscall entry"
+ depends on HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+ help
+ The kernel stack offset can be randomized (after pt_regs) by
+ roughly 5 bits of entropy, frustrating memory corruption
+ attacks that depend on stack address determinism or
+ cross-syscall address exposures. This feature is controlled
+ by kernel boot param "randomize_kstack_offset=on/off", and this
+ config chooses the default boot state.
+
config ARCH_OPTIONAL_KERNEL_RWX
def_bool n

diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h
new file mode 100644
index 0000000..fd80fab
--- /dev/null
+++ b/include/linux/randomize_kstack.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_RANDOMIZE_KSTACK_H
+#define _LINUX_RANDOMIZE_KSTACK_H
+
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+#include <linux/percpu-defs.h>
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
+ randomize_kstack_offset);
+DECLARE_PER_CPU(u32, kstack_offset);
+
+/*
+ * Do not use this anywhere else in the kernel. This is used here because
+ * it provides an arch-agnostic way to grow the stack with correct
+ * alignment. Also, since this use is being explicitly masked to a max of
+ * 10 bits, stack-clash style attacks are unlikely. For more details see
+ * "VLAs" in Documentation/process/deprecated.rst
+ */
+void *__builtin_alloca(size_t size);
+/*
+ * Use, at most, 10 bits of entropy. We explicitly cap this to keep the
+ * "VLA" from being unbounded (see above). 10 bits leaves enough room for
+ * per-arch offset masks to reduce entropy (by removing higher bits, since
+ * high entropy may overly constrain usable stack space), and for
+ * compiler/arch-specific stack alignment to remove the lower bits.
+ */
+#define KSTACK_OFFSET_MAX(x) ((x) & 0x3FF)
+
+/*
+ * These macros must be used during syscall entry when interrupts and
+ * preempt are disabled, and after user registers have been stored to
+ * the stack.
+ */
+#define add_random_kstack_offset() do { \
+ if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \
+ &randomize_kstack_offset)) { \
+ u32 offset = raw_cpu_read(kstack_offset); \
+ u8 *ptr = __builtin_alloca(KSTACK_OFFSET_MAX(offset)); \
+ /* Keep allocation even after "ptr" loses scope. */ \
+ asm volatile("" : "=o"(*ptr) :: "memory"); \
+ } \
+} while (0)
+
+#define choose_random_kstack_offset(rand) do { \
+ if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \
+ &randomize_kstack_offset)) { \
+ u32 offset = raw_cpu_read(kstack_offset); \
+ offset ^= (rand); \
+ raw_cpu_write(kstack_offset, offset); \
+ } \
+} while (0)
+
+#endif
diff --git a/init/main.c b/init/main.c
index 53b2788..f498aac 100644
--- a/init/main.c
+++ b/init/main.c
@@ -844,6 +844,29 @@ static void __init mm_init(void)
pti_init();
}

+#ifdef CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT,
+ randomize_kstack_offset);
+DEFINE_PER_CPU(u32, kstack_offset);
+
+static int __init early_randomize_kstack_offset(char *buf)
+{
+ int ret;
+ bool bool_result;
+
+ ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
+
+ if (bool_result)
+ static_branch_enable(&randomize_kstack_offset);
+ else
+ static_branch_disable(&randomize_kstack_offset);
+ return 0;
+}
+early_param("randomize_kstack_offset", early_randomize_kstack_offset);
+#endif
+
void __init __weak arch_call_rest_init(void)
{
rest_init();

2021-04-08 12:17:07

by tip-bot2 for Haifeng Xu

[permalink] [raw]
Subject: [tip: x86/entry] arm64: entry: Enable random_kstack_offset support

The following commit has been merged into the x86/entry branch of tip:

Commit-ID: 70918779aec9bd01d16f4e6e800ffe423d196021
Gitweb: https://git.kernel.org/tip/70918779aec9bd01d16f4e6e800ffe423d196021
Author: Kees Cook <[email protected]>
AuthorDate: Thu, 01 Apr 2021 16:23:46 -07:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Thu, 08 Apr 2021 14:12:19 +02:00

arm64: entry: Enable random_kstack_offset support

Allow for a randomized stack offset on a per-syscall basis, with roughly
5 bits of entropy. (And include AAPCS rationale AAPCS thanks to Mark
Rutland.)

In order to avoid unconditional stack canaries on syscall entry (due to
the use of alloca()), also disable stack protector to avoid triggering
needless checks and slowing down the entry path. As there is no general
way to control stack protector coverage with a function attribute[1],
this must be disabled at the compilation unit level. This isn't a problem
here, though, since stack protector was not triggered before: examining
the resulting syscall.o, there are no changes in canary coverage (none
before, none now).

[1] a working __attribute__((no_stack_protector)) has been added to GCC
and Clang but has not been released in any version yet:
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=346b302d09c1e6db56d9fe69048acb32fbb97845
https://reviews.llvm.org/rG4fbf84c1732fca596ad1d6e96015e19760eb8a9b

Signed-off-by: Kees Cook <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Will Deacon <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/Makefile | 5 +++++
arch/arm64/kernel/syscall.c | 16 ++++++++++++++++
3 files changed, 22 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e4e1b65..4640d25 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -146,6 +146,7 @@ config ARM64
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_PFN_VALID
select HAVE_ARCH_PREL32_RELOCATIONS
+ select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index ed65576..6cc9773 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -9,6 +9,11 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE)

+# Remove stack protector to avoid triggering unneeded stack canary
+# checks due to randomize_kstack_offset.
+CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong
+CFLAGS_syscall.o += -fno-stack-protector
+
# Object file lists.
obj-y := debug-monitors.o entry.o irq.o fpsimd.o \
entry-common.o entry-fpsimd.o process.o ptrace.o \
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index b9cf12b..263d6c1 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -5,6 +5,7 @@
#include <linux/errno.h>
#include <linux/nospec.h>
#include <linux/ptrace.h>
+#include <linux/randomize_kstack.h>
#include <linux/syscalls.h>

#include <asm/daifflags.h>
@@ -43,6 +44,8 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
{
long ret;

+ add_random_kstack_offset();
+
if (scno < sc_nr) {
syscall_fn_t syscall_fn;
syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
@@ -55,6 +58,19 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
ret = lower_32_bits(ret);

regs->regs[0] = ret;
+
+ /*
+ * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(),
+ * but not enough for arm64 stack utilization comfort. To keep
+ * reasonable stack head room, reduce the maximum offset to 9 bits.
+ *
+ * The actual entropy will be further reduced by the compiler when
+ * applying stack alignment constraints: the AAPCS mandates a
+ * 16-byte (i.e. 4-bit) aligned SP at function boundaries.
+ *
+ * The resulting 5 bits of entropy is seen in SP[8:4].
+ */
+ choose_random_kstack_offset(get_random_int() & 0x1FF);
}

static inline bool has_syscall_work(unsigned long flags)