From: Lai Jiangshan <[email protected]>
When X86_BUG_CPU_MELTDOWN is present and KPTI is enabled,
cpu_current_top_of_stack lives in the TSS, which is also mapped in the user
CR3, making it a coveted target. An attacker can fetch the kernel stack top
from it and base further attack steps on the kernel stack.
The address might not be very useful to an attacker, but it does not need
to live in the TSS either. It is only accessed after the kernel CR3 and the
kernel GS_BASE have been installed, which means it can be in any percpu
variable.
The major reason it is in the TSS is probably performance: the TSS is hot
in the cache and TLB because sp2 is already used as scratch space on the
syscall path.
So it can be moved to a percpu variable next to other hot percpu variables
such as current_task and __preempt_count, so that they end up in the same
cache line.
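For illustration, the C side keeps reading the value through the existing
helper in <asm/processor.h>; roughly the following sketch, where only the
location of the percpu symbol changes and the helper itself is not part of
this diff:

	static inline unsigned long current_top_of_stack(void)
	{
		/* Read the plain percpu variable through the kernel GS base. */
		return this_cpu_read_stable(cpu_current_top_of_stack);
	}

Assembly code referencing PER_CPU_VAR(cpu_current_top_of_stack) keeps
working as well: with the TSS_sp1 alias removed, the name resolves to the
real percpu variable instead of the TSS slot.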
tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
performance loss in the "getpid native" result. The result actually changes
from 93ns before the patch to 92ns after it when KPTI is disabled. The test
is very stable; although it doesn't offer a high degree of precision, it is
enough to show that the change causes no regression.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/include/asm/processor.h | 10 ----------
arch/x86/include/asm/switch_to.h | 7 +------
arch/x86/include/asm/thread_info.h | 6 ------
arch/x86/kernel/cpu/common.c | 3 +++
arch/x86/kernel/process.c | 8 ++------
arch/x86/mm/pti.c | 7 +++----
6 files changed, 9 insertions(+), 32 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..886d32da1318 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
- /*
- * We store cpu_current_top_of_stack in sp1 so it's always accessible.
- * Linux does not use ring 1, so sp1 is not otherwise needed.
- */
u64 sp1;
/*
@@ -428,12 +423,7 @@ struct irq_stack {
DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
#ifdef CONFIG_X86_64
struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..4f0bc8533a54 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
- /*
- * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
- * doesn't work on x86-32 because sp1 and
- * cpu_current_top_of_stack have different values (because of
- * the non-zero stack-padding on 32bit).
- */
+ /* XENPV keeps its entry stack to be kernel stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0d751d5da702..3dc93d8df425 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * const stack,
#endif
}
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
#endif
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..f3d7fd7e9684 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1745,6 +1745,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..7c4d0184a44a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,10 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
- /*
- * .sp1 is cpu_current_top_of_stack. The init task never
- * runs user code, but cpu_current_top_of_stack should still
- * be well defined before the first context switch.
- */
+#ifdef CONFIG_X86_32
+ /* .sp1 is used via TSS_entry2task_stack when switching stacks */
.sp1 = TOP_OF_INIT_STACK,
-#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
#endif
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 1aab92930569..e101cd87d038 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
for_each_possible_cpu(cpu) {
/*
- * The SYSCALL64 entry code needs to be able to find the
- * thread stack and needs one word of scratch space in which
- * to spill a register. All of this lives in the TSS, in
- * the sp1 and sp2 slots.
+ * The SYSCALL64 entry code needs one word of scratch space
+ * in which to spill a register. It lives in the sp2 slot
+ * of the CPU's TSS.
*
* This is done for all possible CPUs during boot to ensure
* that it's propagated to all mms.
--
2.19.1.6.gb485710b
On Fri, Jan 22, 2021 at 11:48 PM Lai Jiangshan <[email protected]> wrote:
>
> From: Lai Jiangshan <[email protected]>
>
> When X86_BUG_CPU_MELTDOWN is present and KPTI is enabled,
> cpu_current_top_of_stack lives in the TSS, which is also mapped in the user
> CR3, making it a coveted target. An attacker can fetch the kernel stack top
> from it and base further attack steps on the kernel stack.
>
> The address might not be very useful to an attacker, but it does not need
> to live in the TSS either. It is only accessed after the kernel CR3 and the
> kernel GS_BASE have been installed, which means it can be in any percpu
> variable.
>
> The major reason it is in the TSS is probably performance: the TSS is hot
> in the cache and TLB because sp2 is already used as scratch space on the
> syscall path.
>
> So it can be moved to a percpu variable next to other hot percpu variables
> such as current_task and __preempt_count, so that they end up in the same
> cache line.
>
> tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
> performance loss in the "getpid native" result. The result actually changes
> from 93ns before the patch to 92ns after it when KPTI is disabled. The test
> is very stable; although it doesn't offer a high degree of precision, it is
> enough to show that the change causes no regression.
I'm okay with this concept, and it's a decent cleanup. But the patch
is incomplete. There are a whole bunch of other sp1 users in
arch/x86, and we should get rid of all of them at once. Most notably,
the 32-bit code uses both the percpu variable and sp1 right now, and
that should be cleaned up. (It's also quite obfuscated in the current
code.)
See below for minor additional comments.
>
> Signed-off-by: Lai Jiangshan <[email protected]>
> ---
> arch/x86/include/asm/processor.h | 10 ----------
> arch/x86/include/asm/switch_to.h | 7 +------
> arch/x86/include/asm/thread_info.h | 6 ------
> arch/x86/kernel/cpu/common.c | 3 +++
> arch/x86/kernel/process.c | 8 ++------
> arch/x86/mm/pti.c | 7 +++----
> 6 files changed, 9 insertions(+), 32 deletions(-)
>
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index c20a52b5534b..886d32da1318 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -314,11 +314,6 @@ struct x86_hw_tss {
> struct x86_hw_tss {
> u32 reserved1;
> u64 sp0;
> -
> - /*
> - * We store cpu_current_top_of_stack in sp1 so it's always accessible.
> - * Linux does not use ring 1, so sp1 is not otherwise needed.
> - */
> u64 sp1;
>
> /*
> @@ -428,12 +423,7 @@ struct irq_stack {
>
> DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
>
> -#ifdef CONFIG_X86_32
> DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
> -#else
> -/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
> -#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
> -#endif
>
> #ifdef CONFIG_X86_64
> struct fixed_percpu_data {
> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
> index 9f69cc497f4b..4f0bc8533a54 100644
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
> else
> this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
> #else
> - /*
> - * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
> - * doesn't work on x86-32 because sp1 and
> - * cpu_current_top_of_stack have different values (because of
> - * the non-zero stack-padding on 32bit).
> - */
> + /* XENPV keeps its entry stack to be kernel stack. */
How about:
Xen PV enters the kernel on the thread stack.
> if (static_cpu_has(X86_FEATURE_XENPV))
> load_sp0(task_top_of_stack(task));
> #endif
> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
> index 0d751d5da702..3dc93d8df425 100644
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * const stack,
> #endif
> }
>
> -#else /* !__ASSEMBLY__ */
> -
> -#ifdef CONFIG_X86_64
> -# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
A complete patch would also remove TSS_sp1 from asm-offsets.
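(For reference, that is presumably this line in
arch/x86/kernel/asm-offsets.c:

	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);

which the V2 series below does remove.)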
From: Lai Jiangshan <[email protected]>
On x86_64, tss.sp1 is reused as cpu_current_top_of_stack. It can be replaced
with a plain percpu variable because CR3 and GS_BASE are already correct
whenever it is accessed.
On x86_32, tss.sp1 is reused as a copy of thread.sp0 in three places in the
entry code. The CR3 and %fs are already correct at two of them; the remaining
one is sysenter. This patchset makes %fs available earlier so that percpu
access also works in the sysenter path, and it adds a percpu variable
cpu_current_thread_sp0 carrying thread.sp0 to replace tss.sp1.
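As a rough sketch of the end state (names per the patches below): on x86_64
the C side reads the stack top with
this_cpu_read_stable(cpu_current_top_of_stack), and the x86_32 entry code
loads thread.sp0 with

	movl	PER_CPU_VAR(cpu_current_thread_sp0), %edi

instead of computing it via an offset from the entry-stack address.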
Changes since V1:
  Also fix sp1 for x86_32, as requested by Andy.
  Updated the comments in the x86_64 patch as Andy suggested.
Lai Jiangshan (6):
x86_64: move cpu_current_top_of_stack out of TSS
x86_32: use percpu instead of offset-calculation to get thread.sp0
when SWITCH_TO_KERNEL_STACK
x86_32/sysenter: switch to the task stack without emptying the entry
stack
x86_32/sysenter: restore %fs before switching stack
x86_32/sysenter: use percpu to get thread.sp0 when sysenter
x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1
arch/x86/entry/entry_32.S | 38 +++++++++++++++++-------------
arch/x86/include/asm/processor.h | 12 ++--------
arch/x86/include/asm/switch_to.h | 9 ++-----
arch/x86/include/asm/thread_info.h | 6 -----
arch/x86/kernel/asm-offsets.c | 1 -
arch/x86/kernel/asm-offsets_32.c | 10 --------
arch/x86/kernel/cpu/common.c | 12 +++++++++-
arch/x86/kernel/process.c | 7 ------
arch/x86/mm/pti.c | 7 +++---
9 files changed, 40 insertions(+), 62 deletions(-)
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
TSS_entry2task_stack is used to refer to tss.sp1, which holds the value of
thread.sp0.
At the point where TSS_entry2task_stack is used in SWITCH_TO_KERNEL_STACK,
the kernel CR3 is already installed and the kernel segments are loaded.
So the percpu variable can be read directly to get tss.sp1 (thread.sp0)
instead of going through the complex offset calculation.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/entry_32.S | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017e6161..3b4d1a63d1f0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -465,16 +465,11 @@
cmpl $SIZEOF_entry_stack, %ecx
jae .Lend_\@
- /* Load stack pointer into %esi and %edi */
+ /* Load stack pointer into %esi */
movl %esp, %esi
- movl %esi, %edi
-
- /* Move %edi to the top of the entry stack */
- andl $(MASK_entry_stack), %edi
- addl $(SIZEOF_entry_stack), %edi
/* Load top of task-stack into %edi */
- movl TSS_entry2task_stack(%edi), %edi
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
/* Special case - entry from kernel mode via entry stack */
#ifdef CONFIG_VM86
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
sp1 is not used by the hardware; it only carries a copy of thread.sp0. Use
the new percpu variable for that instead, and remove the now unneeded
TSS_sp1.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/entry_32.S | 6 +++---
arch/x86/include/asm/processor.h | 2 ++
arch/x86/include/asm/switch_to.h | 2 +-
arch/x86/kernel/asm-offsets.c | 1 -
arch/x86/kernel/cpu/common.c | 9 ++++++++-
arch/x86/kernel/process.c | 2 --
6 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3cb42efb3c04..22cd3d8fd23e 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -472,7 +472,7 @@
movl %esp, %esi
/* Load top of task-stack into %edi */
- movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
+ movl PER_CPU_VAR(cpu_current_thread_sp0), %edi
/* Special case - entry from kernel mode via entry stack */
#ifdef CONFIG_VM86
@@ -658,7 +658,7 @@
movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
/* Bytes on the task-stack to ecx */
- movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+ movl PER_CPU_VAR(cpu_current_thread_sp0), %ecx
subl %esi, %ecx
/* Allocate stack-frame on entry-stack */
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
/* Switch to task stack */
movl %esp, %eax
- movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
+ movl PER_CPU_VAR(cpu_current_thread_sp0), %esp
.Lsysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 886d32da1318..4265884c33e7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -774,6 +774,8 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+DECLARE_PER_CPU(unsigned long, cpu_current_thread_sp0);
+
#else
#define INIT_THREAD { }
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index b5f0d2ff47e4..e27eb7974797 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -69,7 +69,7 @@ static inline void update_task_stack(struct task_struct *task)
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task->thread.sp0);
else
- this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
+ this_cpu_write(cpu_current_thread_sp0, task->thread.sp0);
#else
/* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 60b9f42ce3c1..3b63b6062792 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -98,6 +98,5 @@ static void __used common(void)
/* Offset for fields in tss_struct */
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
- OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f3d7fd7e9684..b2c37d369137 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1789,12 +1789,19 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
/*
* On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
* the top of the kernel stack. Use an extra percpu variable to track the
- * top of the kernel stack directly.
+ * top of the kernel stack directly and another percpu variable to track
+ * thread.sp0 for use in the entry code. cpu_current_top_of_stack and
+ * cpu_current_thread_sp0 hold different values because of the non-zero
+ * stack padding on 32-bit. See the comments at TOP_OF_KERNEL_STACK_PADDING
+ * and vm86 for more details.
*/
DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
(unsigned long)&init_thread_union + THREAD_SIZE;
EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+DEFINE_PER_CPU(unsigned long, cpu_current_thread_sp0) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_thread_sp0);
+
#ifdef CONFIG_STACKPROTECTOR
DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 296de77da4b2..e6d4b5399a81 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -64,8 +64,6 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_32
- .sp1 = TOP_OF_INIT_STACK,
-
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
#endif
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
TSS_entry2task_stack is used to refer to tss.sp1, which holds the value of
thread.sp0.
At the point where TSS_entry2task_stack is used in the sysenter path, the
kernel CR3 is already installed and the kernel segments are loaded.
So the percpu variable can be used directly instead of the offset
calculation, and the now unused TSS_entry2task_stack can be removed.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/entry_32.S | 2 +-
arch/x86/kernel/asm-offsets_32.c | 10 ----------
2 files changed, 1 insertion(+), 11 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a8d2640394f9..3cb42efb3c04 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
/* Switch to task stack */
movl %esp, %eax
- movl (3*4+TSS_entry2task_stack)(%esp), %esp
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
.Lsysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6e043f295a60..6d4143cfbf03 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -43,16 +43,6 @@ void foo(void)
OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
BLANK();
- /*
- * Offset from the entry stack to task stack stored in TSS. Kernel entry
- * happens on the per-cpu entry-stack, and the asm code switches to the
- * task-stack pointer stored in x86_tss.sp1, which is a copy of
- * task->thread.sp0 where entry code can find it.
- */
- DEFINE(TSS_entry2task_stack,
- offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
- offsetofend(struct cpu_entry_area, entry_stack_page.stack));
-
#ifdef CONFIG_STACKPROTECTOR
BLANK();
OFFSET(stack_canary_offset, stack_canary, canary);
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
Like x86_64 does with the "old" stack, the entry stack pointer can be saved
in a register before switching to the task stack. That leaves room on the
"old" stack to save more data or scratch registers.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/entry_32.S | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3b4d1a63d1f0..4513702ba45d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -905,19 +905,18 @@ SYM_FUNC_START(entry_SYSENTER_32)
pushl %eax
BUG_IF_WRONG_CR3 no_user_check=1
SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
- popl %eax
- popfl
- /* Stack empty again, switch to task stack */
- movl TSS_entry2task_stack(%esp), %esp
+ /* Switch to task stack */
+ movl %esp, %eax
+ movl (2*4+TSS_entry2task_stack)(%esp), %esp
.Lsysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
pushl $0 /* pt_regs->sp (placeholder) */
- pushfl /* pt_regs->flags (except IF = 0) */
+ pushl 4(%eax) /* pt_regs->flags (except IF = 0) */
pushl $__USER_CS /* pt_regs->cs */
pushl $0 /* pt_regs->ip = 0 (placeholder) */
- pushl %eax /* pt_regs->orig_ax */
+ pushl (%eax) /* pt_regs->orig_ax */
SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */
/*
--
2.19.1.6.gb485710b
From: Lai Jiangshan <[email protected]>
When X86_BUG_CPU_MELTDOWN is present and KPTI is enabled,
cpu_current_top_of_stack lives in the TSS, which is also mapped in the user
CR3, making it a coveted target. An attacker can fetch the kernel stack top
from it and base further attack steps on the kernel stack.
The address might not be very useful to an attacker, but it does not need
to live in the TSS either. It is only accessed after the kernel CR3 and the
kernel GS_BASE have been installed, which means it can be in any percpu
variable.
The major reason it is in the TSS is probably performance: the TSS is hot
in the cache and TLB because sp2 is already used as scratch space on the
syscall path.
So it can be moved to a percpu variable next to other hot percpu variables
such as current_task and __preempt_count, so that they end up in the same
cache line.
tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
performance loss in the "getpid native" result. The result actually changes
from 93ns before the patch to 92ns after it when KPTI is disabled. The test
is very stable; although it doesn't offer a high degree of precision, it is
enough to show that the change causes no regression.
Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/include/asm/processor.h | 10 ----------
arch/x86/include/asm/switch_to.h | 7 +------
arch/x86/include/asm/thread_info.h | 6 ------
arch/x86/kernel/cpu/common.c | 3 +++
arch/x86/kernel/process.c | 7 +------
arch/x86/mm/pti.c | 7 +++----
6 files changed, 8 insertions(+), 32 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..886d32da1318 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
- /*
- * We store cpu_current_top_of_stack in sp1 so it's always accessible.
- * Linux does not use ring 1, so sp1 is not otherwise needed.
- */
u64 sp1;
/*
@@ -428,12 +423,7 @@ struct irq_stack {
DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
#ifdef CONFIG_X86_64
struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..b5f0d2ff47e4 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
- /*
- * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
- * doesn't work on x86-32 because sp1 and
- * cpu_current_top_of_stack have different values (because of
- * the non-zero stack-padding on 32bit).
- */
+ /* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0d751d5da702..3dc93d8df425 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * const stack,
#endif
}
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
#endif
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..f3d7fd7e9684 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1745,6 +1745,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..296de77da4b2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
- /*
- * .sp1 is cpu_current_top_of_stack. The init task never
- * runs user code, but cpu_current_top_of_stack should still
- * be well defined before the first context switch.
- */
+#ifdef CONFIG_X86_32
.sp1 = TOP_OF_INIT_STACK,
-#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
#endif
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 1aab92930569..e101cd87d038 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
for_each_possible_cpu(cpu) {
/*
- * The SYSCALL64 entry code needs to be able to find the
- * thread stack and needs one word of scratch space in which
- * to spill a register. All of this lives in the TSS, in
- * the sp1 and sp2 slots.
+ * The SYSCALL64 entry code needs one word of scratch space
+ * in which to spill a register. It lives in the sp2 slot
+ * of the CPU's TSS.
*
* This is done for all possible CPUs during boot to ensure
* that it's propagated to all mms.
--
2.19.1.6.gb485710b
On Mon, Jan 25, 2021 at 11:35 AM Lai Jiangshan <[email protected]> wrote:
>
> From: Lai Jiangshan <[email protected]>
>
> Like x86_64 does with the "old" stack, the entry stack pointer can be saved
> in a register before switching to the task stack. That leaves room on the
> "old" stack to save more data or scratch registers.
>
> Signed-off-by: Lai Jiangshan <[email protected]>
> ---
> arch/x86/entry/entry_32.S | 11 +++++------
> 1 file changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index 3b4d1a63d1f0..4513702ba45d 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -905,19 +905,18 @@ SYM_FUNC_START(entry_SYSENTER_32)
> pushl %eax
> BUG_IF_WRONG_CR3 no_user_check=1
> SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
> - popl %eax
> - popfl
>
> - /* Stack empty again, switch to task stack */
> - movl TSS_entry2task_stack(%esp), %esp
> + /* Switch to task stack */
> + movl %esp, %eax
> + movl (2*4+TSS_entry2task_stack)(%esp), %esp
>
> .Lsysenter_past_esp:
> pushl $__USER_DS /* pt_regs->ss */
> pushl $0 /* pt_regs->sp (placeholder) */
> - pushfl /* pt_regs->flags (except IF = 0) */
> + pushl 4(%eax) /* pt_regs->flags (except IF = 0) */
__KERNEL_DS isn't loaded at this point, so this needs an explicit %ss:
override. You probably didn't catch this because the default
__USER_DS was still loaded.
> pushl $__USER_CS /* pt_regs->cs */
> pushl $0 /* pt_regs->ip = 0 (placeholder) */
> - pushl %eax /* pt_regs->orig_ax */
> + pushl (%eax) /* pt_regs->orig_ax */
Add an %ss: override here too.
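In other words, the two pushes would presumably end up as something like:

	pushl	%ss:4(%eax)	/* pt_regs->flags (except IF = 0) */
	...
	pushl	%ss:(%eax)	/* pt_regs->orig_ax */

so the memory reads go through the flat kernel %ss that SYSENTER loads,
rather than whatever selector user space left in %ds.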
> SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */
>
> /*
> --
> 2.19.1.6.gb485710b
>
--
Brian Gerst
The following commit has been merged into the x86/cleanups branch of tip:
Commit-ID: 1591584e2e762edecefde403c44d9c26c9ff72c9
Gitweb: https://git.kernel.org/tip/1591584e2e762edecefde403c44d9c26c9ff72c9
Author: Lai Jiangshan <[email protected]>
AuthorDate: Tue, 26 Jan 2021 01:34:29 +08:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Sun, 28 Mar 2021 22:40:10 +02:00
x86/process/64: Move cpu_current_top_of_stack out of TSS
cpu_current_top_of_stack is currently stored in TSS.sp1. TSS is exposed
through the cpu_entry_area which is visible with user CR3 when PTI is
enabled and active.
This makes it a coveted target for attackers. An attacker can fetch the
kernel stack top from it and base further attack steps on the kernel stack.
But it does not actually need to be stored in the TSS. It is only accessed
after the entry code has switched to the kernel CR3 and kernel GS_BASE, which
means it can be in any regular percpu variable.
The reason why it is in the TSS is historical (pre PTI): the TSS is also used
as scratch space in SYSCALL_64 and is therefore cache hot.
A syscall also needs the per CPU variable current_task and eventually
__preempt_count, so placing cpu_current_top_of_stack next to them makes it
likely that they end up in the same cache line which should avoid
performance regressions. This is not enforced as the compiler is free to
place these variables, so these entry relevant variables should move into
a data structure to make this enforceable.
The seccomp_benchmark doesn't show any performance loss in the "getpid
native" test result. Actually, the result changes from 93ns before to 92ns
with this change when KPTI is disabled. The test is very stable and
although the test doesn't show a higher degree of precision it gives enough
confidence that moving cpu_current_top_of_stack does not cause a
regression.
[ tglx: Removed unneeded export. Massaged changelog ]
Signed-off-by: Lai Jiangshan <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/include/asm/processor.h | 10 ----------
arch/x86/include/asm/switch_to.h | 7 +------
arch/x86/include/asm/thread_info.h | 8 +-------
arch/x86/kernel/cpu/common.c | 2 ++
arch/x86/kernel/process.c | 7 +------
arch/x86/mm/pti.c | 7 +++----
6 files changed, 8 insertions(+), 33 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 8b3ed21..185142b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
- /*
- * We store cpu_current_top_of_stack in sp1 so it's always accessible.
- * Linux does not use ring 1, so sp1 is not otherwise needed.
- */
u64 sp1;
/*
@@ -426,12 +421,7 @@ struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
#ifdef CONFIG_X86_64
struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc4..b5f0d2f 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
- /*
- * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
- * doesn't work on x86-32 because sp1 and
- * cpu_current_top_of_stack have different values (because of
- * the non-zero stack-padding on 32bit).
- */
+ /* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 06b740b..de406d9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,13 +197,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#endif
}
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
-#endif
+#endif /* !__ASSEMBLY__ */
/*
* Thread-synchronous status.
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1aa5f0a..3401078 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1748,6 +1748,8 @@ DEFINE_PER_CPU(bool, hardirq_stack_inuse);
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cdfe5b4..43cbfc8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
- /*
- * .sp1 is cpu_current_top_of_stack. The init task never
- * runs user code, but cpu_current_top_of_stack should still
- * be well defined before the first context switch.
- */
+#ifdef CONFIG_X86_32
.sp1 = TOP_OF_INIT_STACK,
-#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
#endif
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index b377604..5d5c7bb 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
for_each_possible_cpu(cpu) {
/*
- * The SYSCALL64 entry code needs to be able to find the
- * thread stack and needs one word of scratch space in which
- * to spill a register. All of this lives in the TSS, in
- * the sp1 and sp2 slots.
+ * The SYSCALL64 entry code needs one word of scratch space
+ * in which to spill a register. It lives in the sp2 slot
+ * of the CPU's TSS.
*
* This is done for all possible CPUs during boot to ensure
* that it's propagated to all mms.