Ensure that a syscall does not return to user-mode with a kernel
address limit. If that happened, a process could corrupt kernel-mode
memory and elevate its privileges.
For example, it would mitigate bugs like this one:
- https://bugs.chromium.org/p/project-zero/issues/detail?id=990
The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
added so that each architecture can provide an optimized version of
this check.
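A typical instance of the bug class looks like this (illustrative
sketch only, not taken from the report above):

        mm_segment_t old_fs = get_fs();

        set_fs(KERNEL_DS);              /* lift the address limit */
        err = fill_kernel_buffer();     /* hypothetical helper */
        if (err)
                return err;  /* BUG: returns to user with KERNEL_DS */
        set_fs(old_fs);      /* the error path above skips this restore */

Once the syscall returns to user-mode with KERNEL_DS still set,
access_ok() accepts kernel addresses, so copy_from_user()/copy_to_user()
let the process read and write kernel memory directly.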
Signed-off-by: Thomas Garnier <[email protected]>
Tested-by: Kees Cook <[email protected]>
---
Based on next-20170404
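For review purposes, this is roughly what the modified __SYSCALL_DEFINEx
expands to for a one-argument syscall such as close(2) (sketch only;
the __SC_TEST and asmlinkage_protect pieces are omitted):

        asmlinkage long SyS_close(long fd)
        {
                long ret;
                /* __CHECK_USER_CALLER(): snapshot the limit at entry */
                bool user_caller = segment_eq(get_fs(), USER_DS);

                ret = SYSC_close((unsigned int)fd);
                /*
                 * __VERIFY_PRE_USERMODE_STATE(): BUG if a caller that
                 * entered with USER_DS would return with a kernel limit.
                 */
                if (user_caller)
                        verify_pre_usermode_state();
                return ret;
        }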
---
arch/s390/Kconfig | 1 +
include/linux/syscalls.h | 26 +++++++++++++++++++++++++-
init/Kconfig | 7 +++++++
kernel/sys.c | 7 +++++++
4 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index ca2fe764be2d..4ec7563f2746 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -103,6 +103,7 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SAVE_PAGE_KEYS if HIBERNATION
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..f9ff80fa92ff 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,27 @@ extern struct trace_event_functions exit_syscall_print_funcs;
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+
+/*
+ * Called before returning to user-mode. Returning with an address limit
+ * different from USER_DS can allow a process to overwrite kernel memory.
+ */
+static inline void verify_pre_usermode_state(void)
+{
+	BUG_ON(!segment_eq(get_fs(), USER_DS));
+}
+
+#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+#define __CHECK_USER_CALLER() \
+ bool user_caller = segment_eq(get_fs(), USER_DS)
+#define __VERIFY_PRE_USERMODE_STATE() \
+ if (user_caller) verify_pre_usermode_state()
+#else
+#define __CHECK_USER_CALLER()
+#define __VERIFY_PRE_USERMODE_STATE()
+asmlinkage void asm_verify_pre_usermode_state(void);
+#endif
+
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
@@ -199,7 +220,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
- long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
+ long ret; \
+ __CHECK_USER_CALLER(); \
+ ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
+ __VERIFY_PRE_USERMODE_STATE(); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
diff --git a/init/Kconfig b/init/Kconfig
index 7f7027817bce..2c6b73de9a26 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1958,6 +1958,13 @@ config PROFILING
config TRACEPOINTS
bool
+#
+# Set by each architecture that wants to optimize how verify_pre_usermode_state
+# is called.
+#
+config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+ bool
+
source "arch/Kconfig"
endmenu # General setup
diff --git a/kernel/sys.c b/kernel/sys.c
index 196c7134bee6..4ae278fcc290 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2459,3 +2459,10 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
return 0;
}
#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+asmlinkage void asm_verify_pre_usermode_state(void)
+{
+ verify_pre_usermode_state();
+}
+#endif
--
2.12.2.715.g7642488e1d-goog
Implement specific usage of verify_pre_usermode_state for user-mode
returns for x86.
Signed-off-by: Thomas Garnier <[email protected]>
---
Based on next-20170404
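Note on the TASK_SIZE_MAX move: the definition is relocated to
pgtable_64_types.h and rewritten with _AC() so that it can also be used
from assembly, since entry_64.S now references $TASK_SIZE_MAX. _AC()
comes from uapi/linux/const.h and drops the type suffix in assembly,
roughly:

        #ifdef __ASSEMBLY__
        #define _AC(X, Y)       X
        #else
        #define __AC(X, Y)      (X##Y)
        #define _AC(X, Y)       __AC(X, Y)
        #endif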
---
arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 3 +++
arch/x86/entry/entry_64.S | 8 ++++++++
arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
arch/x86/include/asm/processor.h | 11 -----------
5 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9a5af1e1cd61..3d7eb9298f2d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,7 @@ config X86
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index cdefcfdd9e63..76ef050255c9 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <linux/livepatch.h>
+#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;
+ verify_pre_usermode_state();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..c079b010205c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -218,6 +218,14 @@ entry_SYSCALL_64_fastpath:
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
+ /*
+ * If the address limit is not the user-mode limit, jump to the slow
+ * path for additional security checks.
+ */
+ movq $TASK_SIZE_MAX, %rcx
+ cmp %rcx, TASK_addr_limit(%r11)
+ jnz 1f
+
LOCKDEP_SYS_EXIT
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 516593e66bd6..12fa851c7fa8 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -78,4 +78,15 @@ typedef struct { pteval_t pte; } pte_t;
#define EARLY_DYNAMIC_PAGE_TABLES 64
+/*
+ * User space process size. 47bits minus one guard page. The guard
+ * page is necessary on Intel CPUs: if a SYSCALL instruction is at
+ * the highest possible canonical userspace address, then that
+ * syscall will enter the kernel with a non-canonical return
+ * address, and SYSRET will explode dangerously. We avoid this
+ * particular problem by preventing anything from being mapped
+ * at the maximum canonical address.
+ */
+#define TASK_SIZE_MAX ((_AC(1, UL) << 47) - PAGE_SIZE)
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..e80822582d3e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -825,17 +825,6 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
- */
-#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
-
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
--
2.12.2.715.g7642488e1d-goog
Implement specific usage of verify_pre_usermode_state for user-mode
returns for arm64.
Signed-off-by: Thomas Garnier <[email protected]>
---
Based on next-20170404
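In C terms, the check added on both return paths below is roughly
(sketch):

        if (current_thread_info()->addr_limit != TASK_SIZE_64)
                addr_limit_fail();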
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/entry.S | 15 +++++++++++++++
2 files changed, 16 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index f2b0b528037d..0e86d87259f4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -24,6 +24,7 @@ config ARM64
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
select ARCH_WANT_FRAME_POINTERS
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARM_AMBA
select ARM_ARCH_TIMER
select ARM_GIC
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 43512d4d7df2..6d598e7051c3 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -744,6 +744,10 @@ ENDPROC(cpu_switch_to)
ret_fast_syscall:
disable_irq // disable interrupts
str x0, [sp, #S_X0] // returned x0
+ ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
+ mov x1, #TASK_SIZE_64
+ cmp x2, x1
+ b.ne addr_limit_fail
ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
and x2, x1, #_TIF_SYSCALL_WORK
cbnz x2, ret_fast_syscall_trace
@@ -771,6 +775,11 @@ work_pending:
*/
ret_to_user:
disable_irq // disable interrupts
+ ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
+ mov x1, #TASK_SIZE_64
+ cmp x2, x1
+ b.ne addr_limit_fail
+
ldr x1, [tsk, #TSK_TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
@@ -779,6 +788,12 @@ finish_ret_to_user:
kernel_exit 0
ENDPROC(ret_to_user)
+addr_limit_fail:
+ stp x0, lr, [sp,#-16]!
+ bl asm_verify_pre_usermode_state
+ ldp x0, lr, [sp],#16
+ ret lr
+
/*
* This is how we return from a fork.
*/
--
2.12.2.715.g7642488e1d-goog
Implement specific usage of verify_pre_usermode_state for user-mode
returns for arm.
Signed-off-by: Thomas Garnier <[email protected]>
---
Based on next-20170404
---
arch/arm/Kconfig | 1 +
arch/arm/kernel/entry-common.S | 16 +++++++++++++++-
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2cec306c7390..e2ebc7d37742 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -12,6 +12,7 @@ config ARM
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_MIGHT_HAVE_PC_PARPORT
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_SUPPORTS_ATOMIC_RMW
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..88c72c4e44ad 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -12,6 +12,7 @@
#include <asm/unistd.h>
#include <asm/ftrace.h>
#include <asm/unwind.h>
+#include <asm/memory.h>
#ifdef CONFIG_AEABI
#include <asm/unistd-oabi.h>
#endif
@@ -27,7 +28,6 @@
#include "entry-header.S"
-
.align 5
#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING))
/*
@@ -40,9 +40,12 @@ ret_fast_syscall:
UNWIND(.fnstart )
UNWIND(.cantunwind )
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
bne fast_work_pending
+ cmp r2, #TASK_SIZE
+ blne addr_limit_fail
/* perform architecture specific actions before user return */
arch_ret_to_user r1, lr
@@ -66,6 +69,7 @@ ret_fast_syscall:
UNWIND(.cantunwind )
str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
beq no_work_pending
@@ -82,6 +86,7 @@ slow_work_pending:
mov r2, why @ 'syscall'
bl do_work_pending
cmp r0, #0
+ ldreq r2, [tsk, #TI_ADDR_LIMIT]
beq no_work_pending
movlt scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE)
ldmia sp, {r0 - r6} @ have to reload r0 - r6
@@ -99,9 +104,12 @@ ret_slow_syscall:
disable_irq_notrace @ disable interrupts
ENTRY(ret_to_user_from_irq)
ldr r1, [tsk, #TI_FLAGS]
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
tst r1, #_TIF_WORK_MASK
bne slow_work_pending
no_work_pending:
+ cmp r2, #TASK_SIZE
+ blne addr_limit_fail
asm_trace_hardirqs_on save = 0
/* perform architecture specific actions before user return */
@@ -125,6 +133,12 @@ ENTRY(ret_from_fork)
b ret_slow_syscall
ENDPROC(ret_from_fork)
+addr_limit_fail:
+ stmfd sp!, {r0, lr}
+ bl asm_verify_pre_usermode_state
+ ldmfd sp!, {r0, lr}
+ ret lr
+
/*=============================================================================
* SWI handler
*-----------------------------------------------------------------------------
--
2.12.2.715.g7642488e1d-goog
On 04/04/17 10:47, Thomas Garnier wrote:
> Implement specific usage of verify_pre_usermode_state for user-mode
> returns for x86.
>
> Signed-off-by: Thomas Garnier <[email protected]>
>
> + /*
> + * If the address limit is not the user-mode limit, jump to the slow
> + * path for additional security checks.
> + */
> + movq $TASK_SIZE_MAX, %rcx
> + cmp %rcx, TASK_addr_limit(%r11)
> + jnz 1f
> +
> LOCKDEP_SYS_EXIT
Nitpick: use jne not jnz when comparing for equality. Same instruction
but more readable.
-hpa
On 04/04/17 10:47, Thomas Garnier wrote:
> diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
> index 516593e66bd6..12fa851c7fa8 100644
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -78,4 +78,15 @@ typedef struct { pteval_t pte; } pte_t;
>
> #define EARLY_DYNAMIC_PAGE_TABLES 64
>
> +/*
> + * User space process size. 47bits minus one guard page. The guard
> + * page is necessary on Intel CPUs: if a SYSCALL instruction is at
> + * the highest possible canonical userspace address, then that
> + * syscall will enter the kernel with a non-canonical return
> + * address, and SYSRET will explode dangerously. We avoid this
> + * particular problem by preventing anything from being mapped
> + * at the maximum canonical address.
> + */
> +#define TASK_SIZE_MAX ((_AC(1, UL) << 47) - PAGE_SIZE)
> +
> #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 3cada998a402..e80822582d3e 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -825,17 +825,6 @@ static inline void spin_lock_prefetch(const void *x)
> #define KSTK_ESP(task) (task_pt_regs(task)->sp)
>
> #else
> -/*
> - * User space process size. 47bits minus one guard page. The guard
> - * page is necessary on Intel CPUs: if a SYSCALL instruction is at
> - * the highest possible canonical userspace address, then that
> - * syscall will enter the kernel with a non-canonical return
> - * address, and SYSRET will explode dangerously. We avoid this
> - * particular problem by preventing anything from being mapped
> - * at the maximum canonical address.
> - */
> -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
> -
> /* This decides where the kernel will search for a free chunk of vm
> * space during mmap's.
> */
>
This should be an entirely separate patch; if nothing else you need to
explain it in the comments.
Also, you say this is for "x86", but I still don't see any code for i386
whatsoever. Have you verified *all* the i386 and i386-compat paths to
make sure they go via prepare_exit_to_usermode()? [Cc: Andy]
Finally, I can't really believe I'm the only person for whom "Specific
usage of verify_pre_usermode_state" is completely opaque.
-hpa
On Tue, Apr 04, 2017 at 11:27:07AM -0700, H. Peter Anvin wrote:
> Finally, I can't really believe I'm the only person for whom "Specific
> usage of verify_pre_usermode_state" is completely opaque.
No, you're not. I'm missing the usual layout of the commit message
"Problem is A, we need to do B, because of C." And this particular one
needs to be pretty verbose as it is tricky lowlevel, userspace return
blabla code and I'd prefer not to have to rhyme up together myself
what's going on and what we're fixing here.
--
Regards/Gruss,
Boris.
Good mailing practices for 400: avoid top-posting and trim the reply.
On Tue, Apr 4, 2017 at 11:27 AM, H. Peter Anvin <[email protected]> wrote:
> On 04/04/17 10:47, Thomas Garnier wrote:
>> [...]
>
> This should be an entirely separate patch; if nothing else you need to
> explain it in the comments.
I will explain it in the commit message; it should be easier than a
separate patch.
>
> Also, you say this is for "x86", but I still don't see any code for i386
> whatsoever. Have you verified *all* the i386 and i386-compat paths to
> make sure they go via prepare_exit_to_usermode()? [Cc: Andy]
I did but I will do it again for the next iteration.
>
> Finally, I can't really believe I'm the only person for whom "Specific
> usage of verity_pre_usermode_state" is completely opaque.
I agree, I will improve it.
>
> -hpa
>
--
Thomas
<[email protected]>,Catalin Marinas <[email protected]>,Mark Rutland <[email protected]>,James Morse <[email protected]>,linux-s390 <[email protected]>,LKML <[email protected]>,Linux API <[email protected]>,the arch/x86 maintainers <[email protected]>,[email protected],Kernel Hardening <[email protected]>
From: [email protected]
Message-ID: <[email protected]>
On April 4, 2017 12:21:48 PM PDT, Thomas Garnier <[email protected]> wrote:
>On Tue, Apr 4, 2017 at 11:27 AM, H. Peter Anvin <[email protected]> wrote:
>> [...]
>>
>> This should be an entirely separate patch; if nothing else you need to
>> explain it in the comments.
>
>I will explain it in the commit message; it should be easier than a
>separate patch.
Easier for you, perhaps, but not for everyone else...
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
On Tue, Apr 04, 2017 at 10:47:27AM -0700, Thomas Garnier wrote:
> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
> index 43512d4d7df2..6d598e7051c3 100644
> --- a/arch/arm64/kernel/entry.S
> +++ b/arch/arm64/kernel/entry.S
> @@ -744,6 +744,10 @@ ENDPROC(cpu_switch_to)
> ret_fast_syscall:
> disable_irq // disable interrupts
> str x0, [sp, #S_X0] // returned x0
> + ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
> + mov x1, #TASK_SIZE_64
> + cmp x2, x1
> + b.ne addr_limit_fail
KERNEL_DS is set to the maximum address (-1UL), so it would be easier to
check against this here and avoid a "mov". Even simpler if you'd check
against bit 63 of the address for KERNEL_DS:
ldr x1, [tsk, TSK_TI_ADDR_LIMIT] // check addr limit change
tbnz x1, #63, addr_limit_fail // KERNEL_DS is -1UL
> ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
> and x2, x1, #_TIF_SYSCALL_WORK
> cbnz x2, ret_fast_syscall_trace
> @@ -771,6 +775,11 @@ work_pending:
> */
> ret_to_user:
> disable_irq // disable interrupts
> + ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
> + mov x1, #TASK_SIZE_64
> + cmp x2, x1
> + b.ne addr_limit_fail
Same here.
> +
> ldr x1, [tsk, #TSK_TI_FLAGS]
> and x2, x1, #_TIF_WORK_MASK
> cbnz x2, work_pending
> @@ -779,6 +788,12 @@ finish_ret_to_user:
> kernel_exit 0
> ENDPROC(ret_to_user)
>
> +addr_limit_fail:
> + stp x0, lr, [sp,#-16]!
> + bl asm_verify_pre_usermode_state
> + ldp x0, lr, [sp],#16
> + ret lr
Where is this supposed to return? What is the value of lr when branching
to addr_limit_fail?
--
Catalin
On Wed, Apr 5, 2017 at 7:22 AM, Catalin Marinas <[email protected]> wrote:
> On Tue, Apr 04, 2017 at 10:47:27AM -0700, Thomas Garnier wrote:
>> diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
>> index 43512d4d7df2..6d598e7051c3 100644
>> --- a/arch/arm64/kernel/entry.S
>> +++ b/arch/arm64/kernel/entry.S
>> @@ -744,6 +744,10 @@ ENDPROC(cpu_switch_to)
>> ret_fast_syscall:
>> disable_irq // disable interrupts
>> str x0, [sp, #S_X0] // returned x0
>> + ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
>> + mov x1, #TASK_SIZE_64
>> + cmp x2, x1
>> + b.ne addr_limit_fail
>
> KERNEL_DS is set to the maximum address (-1UL), so it would be easier to
> check against this here and avoid a "mov". Even simpler if you'd check
> against bit 63 of the address for KERNEL_DS:
We also want to catch corruption, so checking bit 63 makes sense. I
will make this change in the next iteration.
>
> ldr x1, [tsk, TSK_TI_ADDR_LIMIT] // check addr limit change
> tbnz x1, #63, addr_limit_fail // KERNEL_DS is -1UL
>
>> ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
>> and x2, x1, #_TIF_SYSCALL_WORK
>> cbnz x2, ret_fast_syscall_trace
>> @@ -771,6 +775,11 @@ work_pending:
>> */
>> ret_to_user:
>> disable_irq // disable interrupts
>> + ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
>> + mov x1, #TASK_SIZE_64
>> + cmp x2, x1
>> + b.ne addr_limit_fail
>
> Same here.
>
>> +
>> ldr x1, [tsk, #TSK_TI_FLAGS]
>> and x2, x1, #_TIF_WORK_MASK
>> cbnz x2, work_pending
>> @@ -779,6 +788,12 @@ finish_ret_to_user:
>> kernel_exit 0
>> ENDPROC(ret_to_user)
>>
>> +addr_limit_fail:
>> + stp x0, lr, [sp,#-16]!
>> + bl asm_verify_pre_usermode_state
>> + ldp x0, lr, [sp],#16
>> + ret lr
>
> Where is this supposed to return? What is the value of lr when branching
> to addr_limit_fail?
It is not supposed to return. Do you think I should remove stp, ldp,
ret and just add a brk 0x100 or jmp/call a break/bug function?
>
> --
> Catalin
--
Thomas
On Wed, Apr 05, 2017 at 07:36:17AM -0700, Thomas Garnier wrote:
> On Wed, Apr 5, 2017 at 7:22 AM, Catalin Marinas <[email protected]> wrote:
> > On Tue, Apr 04, 2017 at 10:47:27AM -0700, Thomas Garnier wrote:
> >> +
> >> ldr x1, [tsk, #TSK_TI_FLAGS]
> >> and x2, x1, #_TIF_WORK_MASK
> >> cbnz x2, work_pending
> >> @@ -779,6 +788,12 @@ finish_ret_to_user:
> >> kernel_exit 0
> >> ENDPROC(ret_to_user)
> >>
> >> +addr_limit_fail:
> >> + stp x0, lr, [sp,#-16]!
> >> + bl asm_verify_pre_usermode_state
> >> + ldp x0, lr, [sp],#16
> >> + ret lr
> >
> > Where is this supposed to return? What is the value of lr when branching
> > to addr_limit_fail?
>
> It is not supposed to return. Do you think I should remove stp, ldp,
> ret and just add a brk 0x100 or jmp/call a break/bug function?
Can you not just make addr_limit_fail a C function which never returns
(similar to what we do with bad_mode() on arm64)? Since addr_limit_fail
is only called when the segment is not the right one, I don't really see
why you need another call to asm_verify_pre_usermode_state() to do a
similar check again. Just panic in addr_limit_fail (unless I
misunderstood what you are trying to achieve).
--
Catalin
On Wed, Apr 5, 2017 at 10:49 AM, Catalin Marinas
<[email protected]> wrote:
> On Wed, Apr 05, 2017 at 07:36:17AM -0700, Thomas Garnier wrote:
>> On Wed, Apr 5, 2017 at 7:22 AM, Catalin Marinas <[email protected]> wrote:
>> > On Tue, Apr 04, 2017 at 10:47:27AM -0700, Thomas Garnier wrote:
>> >> [...]
>> >
>> > Where is this supposed to return? What is the value of lr when branching
>> > to addr_limit_fail?
>>
>> It is not supposed to return. Do you think I should remove stp, ldp,
>> ret and just add a brk 0x100 or jmp/call a break/bug function?
>
> Can you not just make addr_limit_fail a C function which never returns
> (similar to what we do with bad_mode() on arm64)? Since addr_limit_fail
> is only called when the segment is not the right one, I don't really see
> why you need another call to asm_verify_pre_usermode_state() to do a
> similar check again. Just panic in addr_limit_fail (unless I
> misunderstood what you are trying to achieve).
Calling asm_verify_pre_usermode_state has the advantage of producing a
clear BUG_ON for the error (versus a panic description).

What do you think about replacing asm_verify_pre_usermode_state with an
"address_limit_fail" function that still calls
verify_pre_usermode_state but panics afterwards (because it should
never return)?

The assembly code would be easier to understand, and in case of error
the BUG_ON is clear to the user.
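Roughly like this (sketch; exact naming and message to be decided):

        /* Called from entry assembly when a bad address limit is seen */
        asmlinkage void __noreturn address_limit_fail(void)
        {
                /* BUG_ON fires here with a precise file:line report */
                verify_pre_usermode_state();
                /* should not be reached; never return to the caller */
                panic("address limit corrupted on return to user-mode");
        }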
>
> --
> Catalin
--
Thomas
On Wed, Apr 05, 2017 at 11:14:34AM -0700, Thomas Garnier wrote:
> On Wed, Apr 5, 2017 at 10:49 AM, Catalin Marinas
> <[email protected]> wrote:
> > On Wed, Apr 05, 2017 at 07:36:17AM -0700, Thomas Garnier wrote:
> >> [...]
> >
> > Can you not just make addr_limit_fail a C function which never returns
> > (similar to what we do with bad_mode() on arm64)? Since addr_limit_fail
> > is only called when the segment is not the right one, I don't really see
> > why you need another call to asm_verify_pre_usermode_state() to do a
> > similar check again. Just panic in addr_limit_fail (unless I
> > misunderstood what you are trying to achieve).
>
> Calling asm_verify_pre_usermode_state has the advantage of producing a
> clear BUG_ON for the error (versus a panic description).
>
> What do you think about replacing asm_verify_pre_usermode_state with an
> "address_limit_fail" function that still calls
> verify_pre_usermode_state but panics afterwards (because it should
> never return)?
>
> The assembly code would be easier to understand, and in case of error
> the BUG_ON is clear to the user.
It looks fine to me, though I'd have to see the patch.
--
Catalin