This patch ensures a syscall does not return to user-mode with a kernel
address limit. If that happened, a process could corrupt kernel-mode
memory and elevate privileges.

For example, it would mitigate this bug:

- https://bugs.chromium.org/p/project-zero/issues/detail?id=990

If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
state will result in a BUG_ON.

The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
added so that each architecture can opt out of the generic check and
implement an optimized version of it.
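
To illustrate the bug class being mitigated, consider a set_fs() pair
with an error path that skips the restore (a minimal sketch; frob_ioctl
and do_internal_copy are hypothetical names):

static long frob_ioctl(void)
{
	mm_segment_t old_fs = get_fs();
	long ret;

	set_fs(KERNEL_DS);		/* widen the address limit */
	ret = do_internal_copy();	/* hypothetical helper */
	if (ret)
		return ret;		/* bug: returns with KERNEL_DS set */
	set_fs(old_fs);
	return 0;
}

Without the check, the syscall returns to user-mode with a kernel
address limit, and later copy_to_user()/copy_from_user() calls in the
same task can access kernel memory.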
Signed-off-by: Thomas Garnier <[email protected]>
---
Based on next-20170322
---
arch/s390/Kconfig | 1 +
include/linux/syscalls.h | 18 +++++++++++++++++-
init/Kconfig | 7 +++++++
kernel/sys.c | 8 ++++++++
4 files changed, 33 insertions(+), 1 deletion(-)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a2dcef0aacc7..b73f5b87bc99 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -103,6 +103,7 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SAVE_PAGE_KEYS if HIBERNATION
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_NUMA_BALANCING
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..e659076adf6c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,19 @@ extern struct trace_event_functions exit_syscall_print_funcs;
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+asmlinkage void verify_pre_usermode_state(void);
+
+#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+#define __CHECK_USER_CALLER() \
+ bool user_caller = segment_eq(get_fs(), USER_DS)
+#define __VERIFY_PRE_USERMODE_STATE() \
+ if (user_caller) verify_pre_usermode_state()
+#else
+#define __CHECK_USER_CALLER()
+#define __VERIFY_PRE_USERMODE_STATE()
+#endif
+
+
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
@@ -199,7 +212,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
- long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
+ long ret; \
+ __CHECK_USER_CALLER(); \
+ ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
+ __VERIFY_PRE_USERMODE_STATE(); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
diff --git a/init/Kconfig b/init/Kconfig
index c859c993c26f..c4efc3a95e4a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1929,6 +1929,13 @@ config PROFILING
config TRACEPOINTS
bool
+#
+# Set by each architecture that wants to optimize how verify_pre_usermode_state
+# is called.
+#
+config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+ bool
+
source "arch/Kconfig"
endmenu # General setup
diff --git a/kernel/sys.c b/kernel/sys.c
index 196c7134bee6..411163ac9dc3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2459,3 +2459,11 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
return 0;
}
#endif /* CONFIG_COMPAT */
+
+/* Called before coming back to user-mode */
+asmlinkage void verify_pre_usermode_state(void)
+{
+ if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
+ "incorrect get_fs() on user-mode return"))
+ set_fs(USER_DS);
+}
--
2.12.1.500.gab5fba24ee-goog
Implement the arm64-specific use of verify_pre_usermode_state for
user-mode returns.
---
Based on next-20170322
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/entry.S | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index f2b0b528037d..0e86d87259f4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -24,6 +24,7 @@ config ARM64
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
select ARCH_WANT_FRAME_POINTERS
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARM_AMBA
select ARM_ARCH_TIMER
select ARM_GIC
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 43512d4d7df2..685b43771ac9 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -744,6 +744,10 @@ ENDPROC(cpu_switch_to)
ret_fast_syscall:
disable_irq // disable interrupts
str x0, [sp, #S_X0] // returned x0
+ ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
+ mov x1, #TASK_SIZE_64
+ cmp x2, x1
+ b.ne addr_limit_fail
ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
and x2, x1, #_TIF_SYSCALL_WORK
cbnz x2, ret_fast_syscall_trace
@@ -771,6 +775,11 @@ work_pending:
*/
ret_to_user:
disable_irq // disable interrupts
+ ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
+ mov x1, #TASK_SIZE_64
+ cmp x2, x1
+ b.ne addr_limit_fail
+
ldr x1, [tsk, #TSK_TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
@@ -779,6 +788,22 @@ finish_ret_to_user:
kernel_exit 0
ENDPROC(ret_to_user)
+addr_limit_fail:
+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
+ stp x0, lr, [sp,#-16]!
+ bl verify_pre_usermode_state
+ ldp x0, lr, [sp],#16
+#else
+ /*
+ * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
+ * warning.
+ */
+ mov x2, #TASK_SIZE_64
+ str x2, [tsk, #TSK_TI_ADDR_LIMIT]
+ ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO)
+#endif
+ ret lr
+
/*
* This is how we return from a fork.
*/
--
2.12.1.500.gab5fba24ee-goog
Implement the arm-specific use of verify_pre_usermode_state for
user-mode returns.
---
Based on next-20170322
---
arch/arm/Kconfig | 1 +
arch/arm/include/asm/domain.h | 3 +++
arch/arm/kernel/entry-common.S | 32 +++++++++++++++++++++++++++++++-
3 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index fcbc5ef1ec69..10c6dc3dfff9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -12,6 +12,7 @@ config ARM
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_MIGHT_HAVE_PC_PARPORT
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_SUPPORTS_ATOMIC_RMW
diff --git a/arch/arm/include/asm/domain.h b/arch/arm/include/asm/domain.h
index 99d9f630d6b6..fd1cd24ca9fe 100644
--- a/arch/arm/include/asm/domain.h
+++ b/arch/arm/include/asm/domain.h
@@ -58,6 +58,9 @@
#define domain_mask(dom) ((3) << (2 * (dom)))
#define domain_val(dom,type) ((type) << (2 * (dom)))
+#define DOMAIN_KERNEL_MASK domain_mask(DOMAIN_KERNEL)
+#define DOMAIN_CLIENT_VALUE domain_val(DOMAIN_KERNEL, DOMAIN_CLIENT)
+
#ifdef CONFIG_CPU_SW_DOMAIN_PAN
#define DACR_INIT \
(domain_val(DOMAIN_USER, DOMAIN_NOACCESS) | \
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..8d9bfe9ff313 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -12,6 +12,7 @@
#include <asm/unistd.h>
#include <asm/ftrace.h>
#include <asm/unwind.h>
+#include <asm/memory.h>
#ifdef CONFIG_AEABI
#include <asm/unistd-oabi.h>
#endif
@@ -27,7 +28,6 @@
#include "entry-header.S"
-
.align 5
#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING))
/*
@@ -40,9 +40,12 @@ ret_fast_syscall:
UNWIND(.fnstart )
UNWIND(.cantunwind )
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
bne fast_work_pending
+ cmp r2, #TASK_SIZE
+ blne addr_limit_fail
/* perform architecture specific actions before user return */
arch_ret_to_user r1, lr
@@ -66,6 +69,7 @@ ret_fast_syscall:
UNWIND(.cantunwind )
str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
beq no_work_pending
@@ -82,6 +86,7 @@ slow_work_pending:
mov r2, why @ 'syscall'
bl do_work_pending
cmp r0, #0
+ ldreq r2, [tsk, #TI_ADDR_LIMIT]
beq no_work_pending
movlt scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE)
ldmia sp, {r0 - r6} @ have to reload r0 - r6
@@ -99,9 +104,12 @@ ret_slow_syscall:
disable_irq_notrace @ disable interrupts
ENTRY(ret_to_user_from_irq)
ldr r1, [tsk, #TI_FLAGS]
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
tst r1, #_TIF_WORK_MASK
bne slow_work_pending
no_work_pending:
+ cmp r2, #TASK_SIZE
+ blne addr_limit_fail
asm_trace_hardirqs_on save = 0
/* perform architecture specific actions before user return */
@@ -125,6 +133,28 @@ ENTRY(ret_from_fork)
b ret_slow_syscall
ENDPROC(ret_from_fork)
+addr_limit_fail:
+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
+ stmfd sp!, {r0, lr}
+ bl verify_pre_usermode_state
+ ldmfd sp!, {r0, lr}
+#else
+ /*
+ * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
+ * warning.
+ */
+ mov r2, #TASK_SIZE
+ str r2, [tsk, #TI_ADDR_LIMIT]
+#ifdef CONFIG_CPU_USE_DOMAINS
+ /* Switch domain like modify_domain(DOMAIN_KERNEL, DOMAIN_CLIENT) */
+ mrc p15, 0, r2, c3, c0, 0 @ Get domain register
+ bic r2, r2, #DOMAIN_KERNEL_MASK @ Clear the domain part
+ orr r2, r2, #DOMAIN_CLIENT_VALUE @ Enforce DOMAIN_CLIENT
+ mcr p15, 0, r2, c3, c0, 0 @ Set domain register
+#endif
+#endif
+ ret lr
+
/*=============================================================================
* SWI handler
*-----------------------------------------------------------------------------
--
2.12.1.500.gab5fba24ee-goog
Implement the x86-specific use of verify_pre_usermode_state for
user-mode returns.
---
Based on next-20170322
---
arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 3 +++
arch/x86/entry/entry_64.S | 8 ++++++++
arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
arch/x86/include/asm/processor.h | 11 -----------
5 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0e1bdadc8222..f48c96b834b5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,7 @@ config X86
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index cdefcfdd9e63..76ef050255c9 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
+#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;
+ verify_pre_usermode_state();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..c079b010205c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -218,6 +218,14 @@ entry_SYSCALL_64_fastpath:
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
+ /*
+ * If the address limit is not the user-mode limit, jump to the slow
+ * path for additional security checks.
+ */
+ movq $TASK_SIZE_MAX, %rcx
+ cmp %rcx, TASK_addr_limit(%r11)
+ jnz 1f
+
LOCKDEP_SYS_EXIT
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 516593e66bd6..12fa851c7fa8 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -78,4 +78,15 @@ typedef struct { pteval_t pte; } pte_t;
#define EARLY_DYNAMIC_PAGE_TABLES 64
+/*
+ * User space process size. 47bits minus one guard page. The guard
+ * page is necessary on Intel CPUs: if a SYSCALL instruction is at
+ * the highest possible canonical userspace address, then that
+ * syscall will enter the kernel with a non-canonical return
+ * address, and SYSRET will explode dangerously. We avoid this
+ * particular problem by preventing anything from being mapped
+ * at the maximum canonical address.
+ */
+#define TASK_SIZE_MAX ((_AC(1, UL) << 47) - PAGE_SIZE)
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..e80822582d3e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -825,17 +825,6 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
- */
-#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
-
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
--
2.12.1.500.gab5fba24ee-goog
On Wed, Mar 22, 2017 at 1:38 PM, Thomas Garnier <[email protected]> wrote:
> This patch ensures a syscall does not return to user-mode with a kernel
> address limit. If that happened, a process could corrupt kernel-mode
> memory and elevate privileges.
>
> For example, it would mitigate this bug:
>
> - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>
> If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
> state will result in a BUG_ON.
I'm a bit confused about this choice of configurability. I can see
two sensible choices:

1. Enable this hardening feature: BUG if there's an exploitable bug.

2. Don't enable it at all.

While it's possible that silently papering over the bug is slightly
faster than BUGging, it will allow bugs to continue to exist
undetected.
--Andy
On Wed, Mar 22, 2017 at 1:44 PM, Andy Lutomirski <[email protected]> wrote:
> On Wed, Mar 22, 2017 at 1:38 PM, Thomas Garnier <[email protected]> wrote:
>> This patch ensures a syscall does not return to user-mode with a kernel
>> address limit. If that happened, a process could corrupt kernel-mode
>> memory and elevate privileges.
>>
>> For example, it would mitigate this bug:
>>
>> - https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>>
>> If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
>> state will result in a BUG_ON.
>
> I'm a bit confused about this choice of configurability. I can see
> two sensible choices:
>
> 1. Enable this hardening feature: BUG if there's an exploitable bug.
>
> 2. Don't enable it at all.
>
> While it's possible that silently papering over the bug is slightly
> faster than BUGging, it will allow bugs to continue to exist
> undetected.
We can default to BUGging. My approach here was to avoid an
unconditional BUG_ON so as not to break people.
>
> --Andy
--
Thomas
On 03/22/17 13:44, Andy Lutomirski wrote:
>
> While it's possible that silently papering over the bug is slightly
> faster than BUGging, it will allow bugs to continue to exist
> undetected.
>
It would also allow the test to be inlined (at least on architectures
with a single call site) and have only the failure case out of line,
with a __noreturn annotation. That lets the failure path be jumped to
rather than called; a conditional branch is usually available where a
conditional call often isn't.
That is...
extern void __noreturn __pre_usermode_state_invalid(void);

static inline void verify_pre_usermode_state(void)
{
	if (unlikely(!segment_eq(get_fs(), USER_DS)))
		__pre_usermode_state_invalid();
}
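
For completeness, the out-of-line failure path could then be as simple
as (a sketch only; BUG() is one possible policy here):

void __noreturn __pre_usermode_state_invalid(void)
{
	BUG();
}

That way the hot path costs only a compare plus a conditional branch.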
-hpa
On 03/22/17 13:49, Thomas Garnier wrote:
>
> We can default to BUGging. I think my approach was avoiding doing a
> BUG_ON just to avoid breaking people.
>
Breaking on a potentially-exploitable bug is a feature.
-hpa
Okay, then people are fine with a BUG_ON approach. I will tailor the
next iteration to that. I will also try to add the static inline
suggestion from Peter.
On Wed, Mar 22, 2017 at 1:54 PM, H. Peter Anvin <[email protected]> wrote:
> On 03/22/17 13:49, Thomas Garnier wrote:
>>
>> We can default to BUGging. I think my approach was avoiding doing a
>> BUG_ON just to avoid breaking people.
>>
>
> Breaking on a potentially-exploitable bug is a feature.
>
> -hpa
>
>
--
Thomas
On Thu, Mar 23, 2017 at 08:14:44AM -0700, Thomas Garnier wrote:
> Okay, then people are fine with a BUG_ON approach. I will tailor the
> next iteration to that. I will also try to add the static inline
> suggestion from Peter.
Would it be possible, please, to refrain from top-posting when replying
on the ML?
You sometimes reply inline or below the quoted text, and sometimes at the top.
This subthread has both variants and it is really annoying to people
like me who try to follow the discussion.
Thanks.
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Thu, Mar 23, 2017 at 8:32 AM, Borislav Petkov <[email protected]> wrote:
> On Thu, Mar 23, 2017 at 08:14:44AM -0700, Thomas Garnier wrote:
>> Okay, then people are fine with a BUG_ON approach. I will tailor the
>> next iteration to that. I will also try to add the static inline
>> suggestion from Peter.
>
> Would it be possible, please, to refrain from top-posting when replying
> on the ML?
>
> You sometimes reply inline and after the text and sometimes at the top.
> This subthread has both variants and it is really annoying to people
> like me who try to follow the discussion.
Sure, I will try to always reply inline. Sorry, bad habits.
--
Thomas