2017-09-06 21:37:06

by Andy Lutomirski

Subject: [RFC 00/17] Pile o' entry stack changes

Hi all-

Here's a pile of entry changes. In brief summary:

- Lots of people (Linus included) have asked to convert the entry
code to pop registers on exit instead of movqing them off the
stack. This makes a bunch of progress in that direction.

- Linux's sp0 handling has annoyed me for a while. We have
thread_struct::sp0, which never made much sense to me. This
series removes it on x86_64 and removes most references on
x86_32.

- Xen PV's cpuinit code did incomprehensible things with stack
pointers. This makes it comprehensible.

Juergen, this needs a bit of help on Xen -- see the NMI patch for details.

Reviews would be appreciated :)

Andy Lutomirski (17):
x86/asm/64: Remove the restore_c_regs_and_iret label
x86/asm/64: Split the iret-to-user and iret-to-kernel paths
x86/asm/64: Move SWAPGS into the common iret-to-usermode path
x86/asm/64: Simplify reg restore code in the standard IRET paths
x86/asm/64: Shrink paranoid_exit_restore and make labels local
x86/asm/64: Use pop instead of movq in syscall_return_via_sysret
x86/asm/64: Merge the fast and slow SYSRET paths
x86/asm/64: De-Xen-ify our NMI code
x86/asm/32: Pull MSR_IA32_SYSENTER_CS update code out of
native_load_sp0()
x86/asm/64: Pass sp0 directly to load_sp0()
x86/asm: Add task_top_of_stack() to find the top of a task's stack
x86/xen/64: Clean up SP code in cpu_initialize_context()
x86/boot/64: Stop initializing TSS.sp0 at boot
x86/asm/64: Remove all remaining direct thread_struct::sp0 reads
x86/boot/32: Fix cpu_current_top_of_stack initialization at boot
x86/asm/64: Remove thread_struct::sp0
x86/traps: Use a new on_thread_stack() helper to clean up an assertion

arch/x86/entry/calling.h | 9 +++
arch/x86/entry/entry_64.S | 133 ++++++++++++++++++----------------
arch/x86/entry/entry_64_compat.S | 3 +-
arch/x86/include/asm/compat.h | 1 +
arch/x86/include/asm/paravirt.h | 5 +-
arch/x86/include/asm/paravirt_types.h | 2 +-
arch/x86/include/asm/processor.h | 68 +++++++++--------
arch/x86/include/asm/switch_to.h | 23 ++++++
arch/x86/include/asm/thread_info.h | 11 ---
arch/x86/kernel/cpu/common.c | 12 ++-
arch/x86/kernel/head_64.S | 2 +-
arch/x86/kernel/process.c | 3 +-
arch/x86/kernel/process_32.c | 3 +-
arch/x86/kernel/process_64.c | 5 +-
arch/x86/kernel/smpboot.c | 3 +-
arch/x86/kernel/traps.c | 3 +-
arch/x86/kernel/vm86_32.c | 14 ++--
arch/x86/lguest/boot.c | 7 +-
arch/x86/xen/enlighten_pv.c | 7 +-
arch/x86/xen/smp_pv.c | 17 ++++-
20 files changed, 192 insertions(+), 139 deletions(-)

--
2.13.5


2017-09-06 21:37:19

by Andy Lutomirski

Subject: [RFC 16/17] x86/asm/64: Remove thread_struct::sp0

On x86_64, we can easily calculate sp0 when needed instead of
storing it in thread_struct.

On x86_32, a similar cleanup would be possible, but it would require
cleaning up the vm86 code first, and that can wait for a later
cleanup series.
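
For illustration, the calculation in question is just the task_pt_regs()
arithmetic used elsewhere in this series. Roughly (task_sp0() is a made-up
name for this sketch, not something the patch adds):

/* On x86_64, sp0 is fully determined by the task's stack page: */
static inline unsigned long task_sp0(struct task_struct *task)
{
	/* The address just above pt_regs, i.e. task_top_of_stack(task). */
	return (unsigned long)(task_pt_regs(task) + 1);
}

update_sp0() in the hunk below then loads exactly this value into the TSS
on x86_64.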

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/compat.h | 1 +
arch/x86/include/asm/processor.h | 33 +++++++++++++++------------------
arch/x86/include/asm/switch_to.h | 6 ++++++
arch/x86/kernel/process_64.c | 1 -
4 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 5343c19814b3..948b6d8ec46f 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -6,6 +6,7 @@
*/
#include <linux/types.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <asm/processor.h>
#include <asm/user32.h>
#include <asm/unistd.h>
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f83fbf1b6dd9..4c137472f530 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -423,7 +423,9 @@ typedef struct {
struct thread_struct {
/* Cached TLS descriptors: */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+#ifdef CONFIG_X86_32
unsigned long sp0;
+#endif
unsigned long sp;
#ifdef CONFIG_X86_32
unsigned long sysenter_cs;
@@ -790,6 +792,13 @@ static inline void spin_lock_prefetch(const void *x)

#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))

+#define task_pt_regs(task) \
+({ \
+ unsigned long __ptr = (unsigned long)task_stack_page(task); \
+ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
+ ((struct pt_regs *)__ptr) - 1; \
+})
+
#ifdef CONFIG_X86_32
/*
* User space process size: 3GB (default).
@@ -807,23 +816,6 @@ static inline void spin_lock_prefetch(const void *x)
.addr_limit = KERNEL_DS, \
}

-/*
- * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
- * This is necessary to guarantee that the entire "struct pt_regs"
- * is accessible even if the CPU haven't stored the SS/ESP registers
- * on the stack (interrupt gate does not save these registers
- * when switching to the same priv ring).
- * Therefore beware: accessing the ss/esp fields of the
- * "struct pt_regs" is possible, but they may contain the
- * completely wrong values.
- */
-#define task_pt_regs(task) \
-({ \
- unsigned long __ptr = (unsigned long)task_stack_page(task); \
- __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
- ((struct pt_regs *)__ptr) - 1; \
-})
-
#define KSTK_ESP(task) (task_pt_regs(task)->sp)

#else
@@ -852,12 +844,17 @@ static inline void spin_lock_prefetch(const void *x)
#define STACK_TOP TASK_SIZE
#define STACK_TOP_MAX TASK_SIZE_MAX

+#ifdef CONFIG_X86_32
#define INIT_THREAD { \
.sp0 = TOP_OF_INIT_STACK, \
.addr_limit = KERNEL_DS, \
}
+#else
+#define INIT_THREAD { \
+ .addr_limit = KERNEL_DS, \
+}
+#endif

-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
extern unsigned long KSTK_ESP(struct task_struct *task);

#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index d9bb491ba45c..c557b7526cc7 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_SWITCH_TO_H
#define _ASM_X86_SWITCH_TO_H

+#include <linux/sched/task_stack.h>
+
struct task_struct; /* one of the stranger aspects of C forward declarations */

struct task_struct *__switch_to_asm(struct task_struct *prev,
@@ -86,7 +88,11 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
/* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_sp0(struct task_struct *task)
{
+#ifdef CONFIG_X86_32
load_sp0(task->thread.sp0);
+#else
+ load_sp0(task_top_of_stack(task));
+#endif
}

#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c33f8ad297bb..6d9d11ee42ec 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -158,7 +158,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
struct inactive_task_frame *frame;
struct task_struct *me = current;

- p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;
--
2.13.5

2017-09-06 21:37:31

by Andy Lutomirski

Subject: [RFC 17/17] x86/traps: Use a new on_thread_stack() helper to clean up an assertion

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/processor.h | 17 +++++++++++++++++
arch/x86/include/asm/thread_info.h | 11 -----------
arch/x86/kernel/traps.c | 3 +--
3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4c137472f530..b6f8dc11c222 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -534,6 +534,23 @@ static inline unsigned long current_top_of_stack(void)
#endif
}

+static inline unsigned long current_stack_pointer(void)
+{
+ unsigned long sp;
+#ifdef CONFIG_X86_64
+ asm("mov %%rsp,%0" : "=g" (sp));
+#else
+ asm("mov %%esp,%0" : "=g" (sp));
+#endif
+ return sp;
+}
+
+static inline bool on_thread_stack(void)
+{
+ return (unsigned long)(current_top_of_stack() -
+ current_stack_pointer()) < THREAD_SIZE;
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e00e1bd6e7b3..90e1f9b84534 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -155,17 +155,6 @@ struct thread_info {
*/
#ifndef __ASSEMBLY__

-static inline unsigned long current_stack_pointer(void)
-{
- unsigned long sp;
-#ifdef CONFIG_X86_64
- asm("mov %%rsp,%0" : "=g" (sp));
-#else
- asm("mov %%esp,%0" : "=g" (sp));
-#endif
- return sp;
-}
-
/*
* Walks up the stack frames to make sure that the specified object is
* entirely contained by a single stack frame.
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf54309b85da..0347ed41c92d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -153,8 +153,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
- BUG_ON((unsigned long)(current_top_of_stack() -
- current_stack_pointer()) >= THREAD_SIZE);
+ BUG_ON(!on_thread_stack());

preempt_enable_no_resched();
}
--
2.13.5

2017-09-06 21:37:17

by Andy Lutomirski

Subject: [RFC 13/17] x86/boot/64: Stop initializing TSS.sp0 at boot

In my quest to get rid of thread_struct::sp0, I want to clean up or
remove all of its readers. Two of them are in cpu_init() (32-bit and
64-bit), and they aren't needed. This is because we never enter
userspace at all on the threads that CPUs are initialized in.

Poison the initial TSS.sp0 and stop initializing it on CPU init.
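
The value used below is "definitely invalid" in the sense that it is a
non-canonical, misaligned address, so any accidental use of it as a stack
pointer faults loudly instead of silently scribbling over memory. A minimal
sketch of that property (illustrative only; SP0_POISON is a made-up name for
the literal in the hunk below, and the check assumes 48-bit canonical
addressing):

#define SP0_POISON	0x8000000000000001UL	/* made-up name, see the hunk below */

/* Canonical 48-bit addresses sign-extend bit 47 into bits 63:48; this one doesn't. */
static inline bool sp0_poison_is_canonical(void)
{
	return ((long)(SP0_POISON << 16) >> 16) == (long)SP0_POISON;	/* always false */
}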

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/kernel/cpu/common.c | 12 ++++++++++--
arch/x86/kernel/process.c | 3 ++-
2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 40312b3ef9de..b3c621272e6b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1554,9 +1554,13 @@ void cpu_init(void)
BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);

- load_sp0(current->thread.sp0);
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
set_tss_desc(cpu, t);
load_TR_desc();
+
load_mm_ldt(&init_mm);

clear_all_debug_regs();
@@ -1608,9 +1612,13 @@ void cpu_init(void)
BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);

- load_sp0(thread->sp0);
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
set_tss_desc(cpu, t);
load_TR_desc();
+
load_mm_ldt(&init_mm);

t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3ca198080ea9..df478002b07d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -48,7 +48,8 @@
*/
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
.x86_tss = {
- .sp0 = TOP_OF_INIT_STACK,
+ /* Initialize sp0 to a value that is definitely invalid. */
+ .sp0 = 0x8000000000000001,
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
--
2.13.5

2017-09-06 21:38:11

by Andy Lutomirski

Subject: [RFC 15/17] x86/boot/32: Fix cpu_current_top_of_stack initialization at boot

cpu_current_top_of_stack's initialization forgot about
TOP_OF_KERNEL_STACK_PADDING. This bug didn't matter because the
idle threads never enter user mode.
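
Concretely, a sketch of the difference (illustrative only, not part of the
patch; task_top_of_stack() and task_pt_regs() as defined earlier in this
series):

#include <linux/sched/task_stack.h>	/* task_stack_page() */
#include <asm/processor.h>		/* task_top_of_stack() */

static inline void show_top_of_stack_fix(struct task_struct *idle)
{
	unsigned long old_top = (unsigned long)task_stack_page(idle) + THREAD_SIZE;
	unsigned long new_top = task_top_of_stack(idle);

	/*
	 * task_top_of_stack() goes through task_pt_regs(), which subtracts
	 * TOP_OF_KERNEL_STACK_PADDING (8 bytes on 32-bit, 0 on 64-bit), so
	 * the old open-coded expression was off by exactly the padding.
	 */
	WARN_ON(new_top != old_top - TOP_OF_KERNEL_STACK_PADDING);
}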

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/kernel/smpboot.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b474c8de7fba..e6544bac7c51 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -958,8 +958,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
- per_cpu(cpu_current_top_of_stack, cpu) =
- (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#else
initial_gs = per_cpu_offset(cpu);
#endif
--
2.13.5

2017-09-06 21:37:16

by Andy Lutomirski

Subject: [RFC 08/17] x86/asm/64: De-Xen-ify our NMI code

Xen PV is fundamentally incompatible with our fancy NMI code: it
doesn't use IST at all, and Xen entries clobber two stack slots
below the hardware frame.

Drop Xen PV support from our NMI code entirely.

XXX: Juergen: could you write and test the tiny patch needed to
make Xen PV have a xen_nmi entry that handles NMIs? I don't know
how to test it.

Cc: Juergen Gross <[email protected]>
Cc: Boris Ostrovsky <[email protected]>
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 41 +++++++++++++++++------------------------
1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a9e318f7cc9b..c81e05fb999e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1171,21 +1171,12 @@ ENTRY(error_exit)
jmp retint_user
END(error_exit)

-/* Runs on exception stack */
+/*
+ * Runs on exception stack. Xen PV does not go through this path at all,
+ * so we can use real assembly here.
+ */
ENTRY(nmi)
/*
- * Fix up the exception frame if we're on Xen.
- * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
- * one value to the stack on native, so it may clobber the rdx
- * scratch slot, but it won't clobber any of the important
- * slots past it.
- *
- * Xen is a different story, because the Xen frame itself overlaps
- * the "NMI executing" variable.
- */
- PARAVIRT_ADJUST_EXCEPTION_FRAME
-
- /*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
* This means that we can have nested NMIs where the next
@@ -1240,7 +1231,7 @@ ENTRY(nmi)
* stacks lest we corrupt the "NMI executing" variable.
*/

- SWAPGS_UNSAFE_STACK
+ swapgs
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1402,7 +1393,7 @@ nested_nmi_out:
popq %rdx

/* We are returning to kernel mode, so this cannot result in a fault. */
- INTERRUPT_RETURN
+ iretq

first_nmi:
/* Restore rdx. */
@@ -1432,7 +1423,7 @@ first_nmi:
pushfq /* RFLAGS */
pushq $__KERNEL_CS /* CS */
pushq $1f /* RIP */
- INTERRUPT_RETURN /* continues at repeat_nmi below */
+ iretq /* continues at repeat_nmi below */
1:
#endif

@@ -1502,20 +1493,22 @@ nmi_restore:
/*
* Clear "NMI executing". Set DF first so that we can easily
* distinguish the remaining code between here and IRET from
- * the SYSCALL entry and exit paths. On a native kernel, we
- * could just inspect RIP, but, on paravirt kernels,
- * INTERRUPT_RETURN can translate into a jump into a
- * hypercall page.
+ * the SYSCALL entry and exit paths.
+ *
+ * We arguably should just inspect RIP instead, but I (Andy) wrote
+ * this code when I had the misapprehension that Xen PV supported
+ * NMIs, and Xen PV would break that approach.
*/
std
movq $0, 5*8(%rsp) /* clear "NMI executing" */

/*
- * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
- * stack in a single instruction. We are returning to kernel
- * mode, so this cannot result in a fault.
+ * iretq reads the "iret" frame and exits the NMI stack in a
+ * single instruction. We are returning to kernel mode, so this
+ * cannot result in a fault. Similarly, we don't need to worry
+ * about espfix64 on the way back to kernel mode.
*/
- INTERRUPT_RETURN
+ iretq
END(nmi)

ENTRY(ignore_sysret)
--
2.13.5

2017-09-06 21:38:35

by Andy Lutomirski

Subject: [RFC 14/17] x86/asm/64: Remove all remaining direct thread_struct::sp0 reads

The only remaining readers in context switch code or vm86(), and
they all just want to update TSS.sp0 to match the current task.
Replace them all with a new helper update_sp0().

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/switch_to.h | 6 ++++++
arch/x86/kernel/process_32.c | 2 +-
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/vm86_32.c | 4 ++--
4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index f3fa19925ae1..d9bb491ba45c 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -83,4 +83,10 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
}
#endif

+/* This is used when switching tasks or entering/exiting vm86 mode. */
+static inline void update_sp0(struct task_struct *task)
+{
+ load_sp0(task->thread.sp0);
+}
+
#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index e06a98e3a772..aab973504ca5 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -286,7 +286,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Reload esp0 and cpu_current_top_of_stack. This changes
* current_thread_info().
*/
- load_sp0(next->sp0);
+ update_sp0(next);
refresh_sysenter_cs(next); /* in case prev or next is vm86 */
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 06c4393b19b3..c33f8ad297bb 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -437,7 +437,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);

/* Reload sp0. */
- load_sp0(next->sp0);
+ update_sp0(next_p);

/*
* Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 3b98b7771f15..33dc5d3b9a4a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -147,7 +147,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)

tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
- load_sp0(tsk->thread.sp0);
+ update_sp0(tsk);
refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
put_cpu();
@@ -371,7 +371,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
refresh_sysenter_cs(&tsk->thread);
}

- load_sp0(tsk->thread.sp0);
+ update_sp0(tsk);
put_cpu();

if (vm86->flags & VM86_SCREEN_BITMAP)
--
2.13.5

2017-09-06 21:37:14

by Andy Lutomirski

Subject: [RFC 09/17] x86/asm/32: Pull MSR_IA32_SYSENTER_CS update code out of native_load_sp0()

This causes the MSR_IA32_SYSENTER_CS write to move out of the
paravirt hook. This shouldn't affect Xen or lguest: Xen already
ignores MSR_IA32_SYSENTER_ESP writes and lguest doesn't support
SYSENTER at all. In any event, Xen doesn't support vm86()
in a useful way.
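
For reference, a sketch of what the hoisted helper is meant to do,
reconstructed from the code this patch removes from native_load_sp0()
(illustrative; the exact form in the hunk below may differ):

/* Keep TSS.ss1 and MSR_IA32_SYSENTER_CS in sync with the thread (32-bit only). */
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
	if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
		return;

	this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}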

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/processor.h | 7 -------
arch/x86/include/asm/switch_to.h | 11 +++++++++++
arch/x86/kernel/process_32.c | 1 +
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/vm86_32.c | 6 +++++-
5 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 028245e1c42b..ee37fb86900a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -513,13 +513,6 @@ static inline void
native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
{
tss->x86_tss.sp0 = thread->sp0;
-#ifdef CONFIG_X86_32
- /* Only happens when SEP is enabled, no need to test "SEP"arately: */
- if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
- tss->x86_tss.ss1 = thread->sysenter_cs;
- wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
- }
-#endif
}

static inline void native_swapgs(void)
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index fcc5cd387fd1..f3fa19925ae1 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -72,4 +72,15 @@ do { \
((last) = __switch_to_asm((prev), (next))); \
} while (0)

+#ifdef CONFIG_X86_32
+static inline void refresh_sysenter_cs(struct thread_struct *thread)
+{
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+ return;
+
+ this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+}
+#endif
+
#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c6d6dc5f8bb2..9f217b5ef438 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -287,6 +287,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* current_thread_info().
*/
load_sp0(tss, next);
+ refresh_sysenter_cs(); /* in case prev or next is vm86 */
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c3169be4c596..985e7569ab57 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -436,7 +436,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
this_cpu_write(current_task, next_p);

- /* Reload esp0 and ss1. This changes current_thread_info(). */
+ /* Reload sp0. */
load_sp0(tss, next);

/*
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 7924a5356c8a..9edda73dcc90 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -54,6 +54,7 @@
#include <asm/irq.h>
#include <asm/traps.h>
#include <asm/vm86.h>
+#include <asm/switch_to.h>

/*
* Known problems:
@@ -149,6 +150,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
load_sp0(tss, &tsk->thread);
+ refresh_sysenter_cs();
vm86->saved_sp0 = 0;
put_cpu();

@@ -368,8 +370,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
/* make room for real-mode segments */
tsk->thread.sp0 += 16;

- if (static_cpu_has(X86_FEATURE_SEP))
+ if (static_cpu_has(X86_FEATURE_SEP)) {
tsk->thread.sysenter_cs = 0;
+ refresh_sysenter_cs();
+ }

load_sp0(tss, &tsk->thread);
put_cpu();
--
2.13.5

2017-09-06 21:38:52

by Andy Lutomirski

Subject: [RFC 12/17] x86/xen/64: Clean up SP code in cpu_initialize_context()

I'm removing thread_struct::sp0, and Xen's usage of it is slightly
dubious and unnecessary. Use task_top_of_stack() instead.

While we're at it, reorder the code slightly to make it more obvious
what's going on.

Cc: Juergen Gross <[email protected]>
Cc: Boris Ostrovsky <[email protected]>
Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/xen/smp_pv.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 51471408fdd1..ad18988b9cd6 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -13,6 +13,7 @@
* single-threaded.
*/
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/smp.h>
@@ -293,12 +294,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
#endif
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));

+ /*
+ * Bring up the CPU in cpu_bringup_and_idle() with the stack
+ * pointing just below where pt_regs would be if it were a normal
+ * kernel entry.
+ */
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
ctxt->user_regs.ss = __KERNEL_DS;
+ ctxt->user_regs.cs = __KERNEL_CS;
+ ctxt->user_regs.esp = task_top_of_stack(idle) - sizeof(struct pt_regs);

xen_copy_trap_info(ctxt->trap_ctxt);

@@ -313,8 +321,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->gdt_frames[0] = gdt_mfn;
ctxt->gdt_ents = GDT_ENTRIES;

+ /*
+ * Set SS:SP that Xen will use when entering guest kernel mode
+ * from guest user mode. Subsequent calls to load_sp0() can
+ * change this value.
+ */
ctxt->kernel_ss = __KERNEL_DS;
- ctxt->kernel_sp = idle->thread.sp0;
+ ctxt->kernel_sp = task_top_of_stack(idle);

#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
@@ -326,10 +339,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
(unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip =
(unsigned long)xen_failsafe_callback;
- ctxt->user_regs.cs = __KERNEL_CS;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);

- ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
BUG();
--
2.13.5

2017-09-06 21:39:11

by Andy Lutomirski

Subject: [RFC 11/17] x86/asm: Add task_top_of_stack() to find the top of a task's stack

This will let us get rid of a few places that hardcode accesses to
thread.sp0.
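
The property that makes those conversions safe is that, on x86_64 at this
point in the series, thread.sp0 always equals the top of the task's stack
(copy_thread_tls() sets it to task_stack_page() + THREAD_SIZE, and
task_pt_regs() sits immediately below that). A rough illustrative check,
not part of the patch:

static inline void assert_sp0_is_top_of_stack(struct task_struct *task)
{
	/* Holds on x86_64; on x86_32, vm86 temporarily bumps sp0 by 16. */
	WARN_ON_ONCE(task->thread.sp0 != task_top_of_stack(task));
}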

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/processor.h | 2 ++
1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 85ddfc1a9bb5..f83fbf1b6dd9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -788,6 +788,8 @@ static inline void spin_lock_prefetch(const void *x)
#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
TOP_OF_KERNEL_STACK_PADDING)

+#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
+
#ifdef CONFIG_X86_32
/*
* User space process size: 3GB (default).
--
2.13.5

2017-09-06 21:37:10

by Andy Lutomirski

Subject: [RFC 05/17] x86/asm/64: Shrink paranoid_exit_restore and make labels local

paranoid_exit_restore was a copy of
restore_regs_and_return_to_kernel. Merge them and make the
paranoid_exit internal labels local.

Keeping .Lparanoid_exit makes the code a bit shorter because it
allows a 2-byte jnz instead of a 5-byte jnz.

Saves 96 bytes of text.

(This is still a bit suboptimal in a non-CONFIG_TRACE_IRQFLAGS
kernel, but fixing that would make the code rather messy.)

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 7f1a83b17b4a..741b7dc1758b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1057,17 +1057,14 @@ ENTRY(paranoid_exit)
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
- jnz paranoid_exit_no_swapgs
+ jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
-paranoid_exit_no_swapgs:
+ jmp .Lparanoid_exit_restore
+.Lparanoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
- INTERRUPT_RETURN
+.Lparanoid_exit_restore:
+ jmp restore_regs_and_return_to_kernel
END(paranoid_exit)

/*
--
2.13.5

2017-09-06 21:39:27

by Andy Lutomirski

Subject: [RFC 10/17] x86/asm/64: Pass sp0 directly to load_sp0()

load_sp0() had an odd signature:

void load_sp0(struct tss_struct *tss, struct thread_struct *thread);

Simplify it to:

void load_sp0(unsigned long sp0);

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/include/asm/paravirt.h | 5 ++---
arch/x86/include/asm/paravirt_types.h | 2 +-
arch/x86/include/asm/processor.h | 9 ++++-----
arch/x86/kernel/cpu/common.c | 4 ++--
arch/x86/kernel/process_32.c | 4 ++--
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/vm86_32.c | 12 ++++--------
arch/x86/lguest/boot.c | 7 +++----
arch/x86/xen/enlighten_pv.c | 7 +++----
9 files changed, 22 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9ccac1926587..209dada6207e 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -15,10 +15,9 @@
#include <linux/cpumask.h>
#include <asm/frame.h>

-static inline void load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
{
- PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
+ PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
}

/* The paravirtualized CPUID instruction. */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9ffc36bfe4cd..04a091615c08 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -136,7 +136,7 @@ struct pv_cpu_ops {
void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
void (*free_ldt)(struct desc_struct *ldt, unsigned entries);

- void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+ void (*load_sp0)(unsigned long sp0);

void (*set_iopl_mask)(unsigned mask);

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ee37fb86900a..85ddfc1a9bb5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -510,9 +510,9 @@ static inline void native_set_iopl_mask(unsigned mask)
}

static inline void
-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+native_load_sp0(unsigned long sp0)
{
- tss->x86_tss.sp0 = thread->sp0;
+ this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
}

static inline void native_swapgs(void)
@@ -537,10 +537,9 @@ static inline unsigned long current_top_of_stack(void)
#else
#define __cpuid native_cpuid

-static inline void load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
{
- native_load_sp0(tss, thread);
+ native_load_sp0(sp0);
}

#define set_iopl_mask native_set_iopl_mask
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c8b39870f33e..40312b3ef9de 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1554,7 +1554,7 @@ void cpu_init(void)
BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);

- load_sp0(t, &current->thread);
+ load_sp0(current->thread.sp0);
set_tss_desc(cpu, t);
load_TR_desc();
load_mm_ldt(&init_mm);
@@ -1608,7 +1608,7 @@ void cpu_init(void)
BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);

- load_sp0(t, thread);
+ load_sp0(thread->sp0);
set_tss_desc(cpu, t);
load_TR_desc();
load_mm_ldt(&init_mm);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9f217b5ef438..e06a98e3a772 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -286,8 +286,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Reload esp0 and cpu_current_top_of_stack. This changes
* current_thread_info().
*/
- load_sp0(tss, next);
- refresh_sysenter_cs(); /* in case prev or next is vm86 */
+ load_sp0(next->sp0);
+ refresh_sysenter_cs(next); /* in case prev or next is vm86 */
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 985e7569ab57..06c4393b19b3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -437,7 +437,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(current_task, next_p);

/* Reload sp0. */
- load_sp0(tss, next);
+ load_sp0(next->sp0);

/*
* Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9edda73dcc90..3b98b7771f15 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -94,7 +94,6 @@

void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86plus_struct __user *user;
struct vm86 *vm86 = current->thread.vm86;
@@ -146,11 +145,10 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
do_exit(SIGSEGV);
}

- tss = &per_cpu(cpu_tss, get_cpu());
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
- load_sp0(tss, &tsk->thread);
- refresh_sysenter_cs();
+ load_sp0(tsk->thread.sp0);
+ refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
put_cpu();

@@ -238,7 +236,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)

static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86 *vm86 = tsk->thread.vm86;
struct kernel_vm86_regs vm86regs;
@@ -366,16 +363,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86->saved_sp0 = tsk->thread.sp0;
lazy_save_gs(vm86->regs32.gs);

- tss = &per_cpu(cpu_tss, get_cpu());
/* make room for real-mode segments */
tsk->thread.sp0 += 16;

if (static_cpu_has(X86_FEATURE_SEP)) {
tsk->thread.sysenter_cs = 0;
- refresh_sysenter_cs();
+ refresh_sysenter_cs(&tsk->thread);
}

- load_sp0(tss, &tsk->thread);
+ load_sp0(tsk->thread.sp0);
put_cpu();

if (vm86->flags & VM86_SCREEN_BITMAP)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 99472698c931..07bb8e9a69ac 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1054,12 +1054,11 @@ static void lguest_time_init(void)
* will not tolerate us trying to use that), the stack pointer, and the number
* of pages in the stack.
*/
-static void lguest_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static void lguest_load_sp0(unsigned long sp0)
{
- lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
+ lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, sp0,
THREAD_SIZE / PAGE_SIZE);
- tss->x86_tss.sp0 = thread->sp0;
+ this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
}

/* Let's just say, I wouldn't do debugging under a Guest. */
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 811e4ddb3f37..6c803c3ee831 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -771,15 +771,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
}
}

-static void xen_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+static void xen_load_sp0(unsigned long sp0)
{
struct multicall_space mcs;

mcs = xen_mc_entry(0);
- MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
+ MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
- tss->x86_tss.sp0 = thread->sp0;
+ this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
}

void xen_set_iopl_mask(unsigned mask)
--
2.13.5

2017-09-06 21:39:45

by Andy Lutomirski

Subject: [RFC 07/17] x86/asm/64: Merge the fast and slow SYSRET paths

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 45d4cb8fd81b..a9e318f7cc9b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -216,9 +216,8 @@ entry_SYSCALL_64_fastpath:
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
- RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RSP(%rsp), %rsp
- USERGS_SYSRET64
+ addq $6*8, %rsp
+ jmp .Lpop_c_regs_and_sysret

1:
/*
@@ -309,6 +308,7 @@ return_from_SYSCALL_64:
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
POP_EXTRA_REGS
+.Lpop_c_regs_and_sysret:
popq %rsi /* skip r11 */
popq %r10
popq %r9
--
2.13.5

2017-09-06 21:37:08

by Andy Lutomirski

Subject: [RFC 02/17] x86/asm/64: Split the iret-to-user and iret-to-kernel paths

These code paths will diverge soon.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 20 +++++++++++---------
arch/x86/entry/entry_64_compat.S | 2 +-
arch/x86/kernel/head_64.S | 2 +-
3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a0ebeeb9d12f..34156d50c3d6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -315,7 +315,7 @@ syscall_return_via_sysret:

opportunistic_sysret_failed:
SWAPGS
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_usermode
END(entry_SYSCALL_64)

ENTRY(stub_ptregs_64)
@@ -412,7 +412,7 @@ ENTRY(ret_from_fork)
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWAPGS
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_usermode

1:
/* kernel thread */
@@ -524,7 +524,13 @@ GLOBAL(retint_user)
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
SWAPGS
- jmp restore_regs_and_iret
+
+GLOBAL(restore_regs_and_return_to_usermode)
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
+ INTERRUPT_RETURN
+

/* Returning to kernel space */
retint_kernel:
@@ -544,11 +550,7 @@ retint_kernel:
*/
TRACE_IRQS_IRETQ

-/*
- * At this label, code paths which return to kernel and to user,
- * which come from interrupts/exception and from syscalls, merge.
- */
-GLOBAL(restore_regs_and_iret)
+GLOBAL(restore_regs_and_return_to_kernel)
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1264,7 +1266,7 @@ ENTRY(nmi)
* work, because we don't want to enable interrupts.
*/
SWAPGS
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_usermode

.Lnmi_from_kernel:
/*
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 5314d7b8e5ad..e4ab8ffa94b1 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -338,7 +338,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
SWAPGS
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_usermode
END(entry_INT80_compat)

ALIGN
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6225550883df..63fa56ead56e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -310,7 +310,7 @@ early_idt_handler_common:

20:
decl early_recursion_flag(%rip)
- jmp restore_regs_and_iret
+ jmp restore_regs_and_return_to_kernel
ENDPROC(early_idt_handler_common)

__INITDATA
--
2.13.5

2017-09-06 21:40:03

by Andy Lutomirski

Subject: [RFC 06/17] x86/asm/64: Use pop instead of movq in syscall_return_via_sysret

Saves 64 bytes.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 741b7dc1758b..45d4cb8fd81b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -308,9 +308,17 @@ return_from_SYSCALL_64:
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RSP(%rsp), %rsp
+ POP_EXTRA_REGS
+ popq %rsi /* skip r11 */
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rsi /* skip rcx */
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ movq RSP-ORIG_RAX(%rsp), %rsp
USERGS_SYSRET64
END(entry_SYSCALL_64)

--
2.13.5

2017-09-06 21:40:19

by Andy Lutomirski

Subject: [RFC 04/17] x86/asm/64: Simplify reg restore code in the standard IRET paths

The old code restored all the registers with movq instead of pop.
In theory, this was done because some CPUs have higher movq
throughput, but any gain there would be tiny and is almost certainly
outweighed by the higher text size.

This saves 96 bytes of text.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/calling.h | 9 +++++++++
arch/x86/entry/entry_64.S | 28 ++++++++++++++++++++++------
2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 05ed3d393da7..0a2c73fe2cfc 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -147,6 +147,15 @@ For 32-bit we have the following conventions - kernel is built with
movq 5*8+\offset(%rsp), %rbx
.endm

+ .macro POP_EXTRA_REGS
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ .endm
+
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
.if \rstor_r11
movq 6*8(%rsp), %r11
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 2cd01ed9cd59..7f1a83b17b4a 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -521,9 +521,17 @@ GLOBAL(retint_user)

GLOBAL(swapgs_restore_regs_and_return_to_usermode)
SWAPGS
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
+ POP_EXTRA_REGS
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ addq $8, %rsp
INTERRUPT_RETURN


@@ -546,9 +554,17 @@ retint_kernel:
TRACE_IRQS_IRETQ

GLOBAL(restore_regs_and_return_to_kernel)
- RESTORE_EXTRA_REGS
- RESTORE_C_REGS
- REMOVE_PT_GPREGS_FROM_STACK 8
+ POP_EXTRA_REGS
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ addq $8, %rsp
INTERRUPT_RETURN

ENTRY(native_iret)
--
2.13.5

2017-09-06 21:40:39

by Andy Lutomirski

Subject: [RFC 03/17] x86/asm/64: Move SWAPGS into the common iret-to-usermode path

All of the code paths that ended up doing IRET to usermode did
SWAPGS immediately beforehand. Move the SWAPGS into the common
code.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 26 ++++++++++----------------
arch/x86/entry/entry_64_compat.S | 3 +--
2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 34156d50c3d6..2cd01ed9cd59 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -249,7 +249,7 @@ return_from_SYSCALL_64:
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
cmpq %rcx, %r11 /* RCX == RIP */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode

/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
@@ -267,14 +267,14 @@ return_from_SYSCALL_64:

/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode

cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode

movq R11(%rsp), %r11
cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode

/*
* SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
@@ -295,12 +295,12 @@ return_from_SYSCALL_64:
* would never get past 'stuck_here'.
*/
testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz opportunistic_sysret_failed
+ jnz swapgs_restore_regs_and_return_to_usermode

/* nothing to check for RSP */

cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
+ jne swapgs_restore_regs_and_return_to_usermode

/*
* We win! This label is here just for ease of understanding
@@ -312,10 +312,6 @@ syscall_return_via_sysret:
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
USERGS_SYSRET64
-
-opportunistic_sysret_failed:
- SWAPGS
- jmp restore_regs_and_return_to_usermode
END(entry_SYSCALL_64)

ENTRY(stub_ptregs_64)
@@ -411,8 +407,7 @@ ENTRY(ret_from_fork)
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
- SWAPGS
- jmp restore_regs_and_return_to_usermode
+ jmp swapgs_restore_regs_and_return_to_usermode

1:
/* kernel thread */
@@ -523,9 +518,9 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
- SWAPGS

-GLOBAL(restore_regs_and_return_to_usermode)
+GLOBAL(swapgs_restore_regs_and_return_to_usermode)
+ SWAPGS
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1265,8 +1260,7 @@ ENTRY(nmi)
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts.
*/
- SWAPGS
- jmp restore_regs_and_return_to_usermode
+ jmp swapgs_restore_regs_and_return_to_usermode

.Lnmi_from_kernel:
/*
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e4ab8ffa94b1..72e143497ece 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -337,8 +337,7 @@ ENTRY(entry_INT80_compat)

/* Go back to user mode. */
TRACE_IRQS_ON
- SWAPGS
- jmp restore_regs_and_return_to_usermode
+ jmp swapgs_restore_regs_and_return_to_usermode
END(entry_INT80_compat)

ALIGN
--
2.13.5

2017-09-06 21:41:05

by Andy Lutomirski

Subject: [RFC 01/17] x86/asm/64: Remove the restore_c_regs_and_iret label

The only user was the 64-bit opportunistic SYSRET failure path, and
that path didn't really need it. This change makes the
opportunistic SYSRET code a bit more straightforward and gets rid of
the label.

Signed-off-by: Andy Lutomirski <[email protected]>
---
arch/x86/entry/entry_64.S | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e995ea828789..a0ebeeb9d12f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -240,7 +240,6 @@ entry_SYSCALL64_slow_path:
call do_syscall_64 /* returns with IRQs disabled */

return_from_SYSCALL_64:
- RESTORE_EXTRA_REGS
TRACE_IRQS_IRETQ /* we're about to change IF */

/*
@@ -309,13 +308,14 @@ return_from_SYSCALL_64:
*/
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
+ RESTORE_EXTRA_REGS
RESTORE_C_REGS_EXCEPT_RCX_R11
movq RSP(%rsp), %rsp
USERGS_SYSRET64

opportunistic_sysret_failed:
SWAPGS
- jmp restore_c_regs_and_iret
+ jmp restore_regs_and_iret
END(entry_SYSCALL_64)

ENTRY(stub_ptregs_64)
@@ -550,7 +550,6 @@ retint_kernel:
*/
GLOBAL(restore_regs_and_iret)
RESTORE_EXTRA_REGS
-restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
--
2.13.5

2017-09-06 22:16:51

by Andi Kleen

Subject: Re: [RFC 00/17] Pile o' entry stack changes

Andy Lutomirski <[email protected]> writes:
>
> - Lots of people (Linus included) have asked to convert the entry
> code to pop registers on exit instead of movqing them off the
> stack. This makes a bunch of progress in that direction.

You should benchmark it on Atoms. Likely it's a regression there
because they don't have the special PUSH/POP acceleration.

-Andi

2017-09-07 00:01:47

by Andy Lutomirski

Subject: Re: [RFC 00/17] Pile o' entry stack changes

On Wed, Sep 6, 2017 at 3:16 PM, Andi Kleen <[email protected]> wrote:
> Andy Lutomirski <[email protected]> writes:
>>
>> - Lots of people (Linus included) have asked to convert the entry
>> code to pop registers on exit instead of movqing them off the
>> stack. This makes a bunch of progress in that direction.
>
> You should benchmark it on Atoms. Likely it's a regression there
> because they don't have the special PUSH/POP acceleration.

I'm not entirely sure this is a worthwhile reason. Atom will lose a
few cycles due to POP throughput, but there's a lot less decode
bandwidth needed and we save a cache line or two.

--Andy

2017-09-07 07:04:52

by Ingo Molnar

Subject: Re: [RFC 00/17] Pile o' entry stack changes


* Andy Lutomirski <[email protected]> wrote:

> On Wed, Sep 6, 2017 at 3:16 PM, Andi Kleen <[email protected]> wrote:
> > Andy Lutomirski <[email protected]> writes:
> >>
> >> - Lots of people (Linus included) have asked to convert the entry
> >> code to pop registers on exit instead of movqing them off the
> >> stack. This makes a bunch of progress in that direction.
> >
> > You should benchmark it on Atoms. Likely it's a regression there
> > because they don't have the special PUSH/POP acceleration.
>
> I'm not entirely sure this is a worthwhile reason. Atom will lose a
> few cycles due to POP throughput, but there's a lot less decode
> bandwidth needed and we save a cache line or two.

I think we can also safely assume that Atom will eventually either join the
21st century or die out - mild Atom micro-costs are not a good reason to
complicate the entry code...

Thanks,

Ingo

2017-09-07 09:34:48

by Jürgen Groß

Subject: Re: [RFC 08/17] x86/asm/64: De-Xen-ify our NMI code

On 06/09/17 23:36, Andy Lutomirski wrote:
> Xen PV is fundamentally incompatible with our fancy NMI code: it
> doesn't use IST at all, and Xen entries clobber two stack slots
> below the hardware frame.
>
> Drop Xen PV support from our NMI code entirely.
>
> XXX: Juergen: could you write and test the tiny patch needed to
> make Xen PV have a xen_nmi entry that handles NMIs? I don't know
> how to test it.

You mean something like the attached one?

Seems to work at least for the "normal" case of an NMI coming in at
a random point in time.

Regarding testing: in case you have a Xen setup you can easily send
an NMI to a domain from dom0:

xl trigger <domain> nmi


Juergen

>
> Cc: Juergen Gross <[email protected]>
> Cc: Boris Ostrovsky <[email protected]>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/entry/entry_64.S | 41 +++++++++++++++++------------------------
> 1 file changed, 17 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index a9e318f7cc9b..c81e05fb999e 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -1171,21 +1171,12 @@ ENTRY(error_exit)
> jmp retint_user
> END(error_exit)
>
> -/* Runs on exception stack */
> +/*
> + * Runs on exception stack. Xen PV does not go through this path at all,
> + * so we can use real assembly here.
> + */
> ENTRY(nmi)
> /*
> - * Fix up the exception frame if we're on Xen.
> - * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
> - * one value to the stack on native, so it may clobber the rdx
> - * scratch slot, but it won't clobber any of the important
> - * slots past it.
> - *
> - * Xen is a different story, because the Xen frame itself overlaps
> - * the "NMI executing" variable.
> - */
> - PARAVIRT_ADJUST_EXCEPTION_FRAME
> -
> - /*
> * We allow breakpoints in NMIs. If a breakpoint occurs, then
> * the iretq it performs will take us out of NMI context.
> * This means that we can have nested NMIs where the next
> @@ -1240,7 +1231,7 @@ ENTRY(nmi)
> * stacks lest we corrupt the "NMI executing" variable.
> */
>
> - SWAPGS_UNSAFE_STACK
> + swapgs
> cld
> movq %rsp, %rdx
> movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
> @@ -1402,7 +1393,7 @@ nested_nmi_out:
> popq %rdx
>
> /* We are returning to kernel mode, so this cannot result in a fault. */
> - INTERRUPT_RETURN
> + iretq
>
> first_nmi:
> /* Restore rdx. */
> @@ -1432,7 +1423,7 @@ first_nmi:
> pushfq /* RFLAGS */
> pushq $__KERNEL_CS /* CS */
> pushq $1f /* RIP */
> - INTERRUPT_RETURN /* continues at repeat_nmi below */
> + iretq /* continues at repeat_nmi below */
> 1:
> #endif
>
> @@ -1502,20 +1493,22 @@ nmi_restore:
> /*
> * Clear "NMI executing". Set DF first so that we can easily
> * distinguish the remaining code between here and IRET from
> - * the SYSCALL entry and exit paths. On a native kernel, we
> - * could just inspect RIP, but, on paravirt kernels,
> - * INTERRUPT_RETURN can translate into a jump into a
> - * hypercall page.
> + * the SYSCALL entry and exit paths.
> + *
> + * We arguably should just inspect RIP instead, but I (Andy) wrote
> + * this code when I had the misapprehension that Xen PV supported
> + * NMIs, and Xen PV would break that approach.
> */
> std
> movq $0, 5*8(%rsp) /* clear "NMI executing" */
>
> /*
> - * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
> - * stack in a single instruction. We are returning to kernel
> - * mode, so this cannot result in a fault.
> + * iretq reads the "iret" frame and exits the NMI stack in a
> + * single instruction. We are returning to kernel mode, so this
> + * cannot result in a fault. Similarly, we don't need to worry
> + * about espfix64 on the way back to kernel mode.
> */
> - INTERRUPT_RETURN
> + iretq
> END(nmi)
>
> ENTRY(ignore_sysret)
>


Attachments:
0001-xen-add-xen-nmi-trap-entry.patch (3.09 kB)

2017-09-07 09:40:24

by Borislav Petkov

Subject: Re: [RFC 01/17] x86/asm/64: Remove the restore_c_regs_and_iret label

On Wed, Sep 06, 2017 at 02:36:46PM -0700, Andy Lutomirski wrote:
> The only user was the 64-bit opportunistic SYSRET failure path, and
> that path didn't really need it. This change makes the
> opportunistic SYSRET code a bit more straightforward and gets rid of
> the label.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/entry/entry_64.S | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)

Btw, you need to refresh your stuff because of those UNWIND_HINT_EMPTY
things. I get:

checking file arch/x86/entry/entry_64.S
Hunk #1 succeeded at 245 (offset 5 lines).
Hunk #2 FAILED at 308.
Hunk #3 succeeded at 637 (offset 88 lines).
1 out of 3 hunks FAILED

Otherwise

Reviewed-by: Borislav Petkov <[email protected]>

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2017-09-07 09:46:45

by Ingo Molnar

Subject: Re: [RFC 01/17] x86/asm/64: Remove the restore_c_regs_and_iret label


* Borislav Petkov <[email protected]> wrote:

> On Wed, Sep 06, 2017 at 02:36:46PM -0700, Andy Lutomirski wrote:
> > The only user was the 64-bit opportunistic SYSRET failure path, and
> > that path didn't really need it. This change makes the
> > opportunistic SYSRET code a bit more straightforward and gets rid of
> > the label.
> >
> > Signed-off-by: Andy Lutomirski <[email protected]>
> > ---
> > arch/x86/entry/entry_64.S | 5 ++---
> > 1 file changed, 2 insertions(+), 3 deletions(-)
>
> Btw, you need to refresh your stuff because of those UNWIND_HINT_EMPTY
> things. I get:
>
> checking file arch/x86/entry/entry_64.S
> Hunk #1 succeeded at 245 (offset 5 lines).
> Hunk #2 FAILED at 308.
> Hunk #3 succeeded at 637 (offset 88 lines).
> 1 out of 3 hunks FAILED
>
> Otherwise
>
> Reviewed-by: Borislav Petkov <[email protected]>

I'd suggest tip:master or upstream 24e700e291d5 as a post-merge-window base for
x86 bits.

Thanks,

Ingo

2017-09-07 09:49:21

by Ingo Molnar

Subject: Re: [RFC 01/17] x86/asm/64: Remove the restore_c_regs_and_iret label


* Ingo Molnar <[email protected]> wrote:

>
> * Borislav Petkov <[email protected]> wrote:
>
> > On Wed, Sep 06, 2017 at 02:36:46PM -0700, Andy Lutomirski wrote:
> > > The only user was the 64-bit opportunistic SYSRET failure path, and
> > > that path didn't really need it. This change makes the
> > > opportunistic SYSRET code a bit more straightforward and gets rid of
> > > the label.
> > >
> > > Signed-off-by: Andy Lutomirski <[email protected]>
> > > ---
> > > arch/x86/entry/entry_64.S | 5 ++---
> > > 1 file changed, 2 insertions(+), 3 deletions(-)
> >
> > Btw, you need to refresh your stuff because of those UNWIND_HINT_EMPTY
> > things. I get:
> >
> > checking file arch/x86/entry/entry_64.S
> > Hunk #1 succeeded at 245 (offset 5 lines).
> > Hunk #2 FAILED at 308.
> > Hunk #3 succeeded at 637 (offset 88 lines).
> > 1 out of 3 hunks FAILED
> >
> > Otherwise
> >
> > Reviewed-by: Borislav Petkov <[email protected]>
>
> I'd suggest tip:master or upstream 24e700e291d5 as a post-merge-window base for
> x86 bits.

Actually, scratch that, 1c9fe4409ce3 is probably the best base, it includes the
PCID fixes. Haven't had much time to test that base though.

Thanks,

Ingo

2017-09-07 09:57:52

by Borislav Petkov

Subject: Re: [RFC 01/17] x86/asm/64: Remove the restore_c_regs_and_iret label

On Thu, Sep 07, 2017 at 11:49:16AM +0200, Ingo Molnar wrote:
> Actually, scratch that, 1c9fe4409ce3 is probably the best base, it includes the
> PCID fixes. Haven't had much time to test that base though.

Ok, I'll use that and scream if something's sh*tting in its pants.

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2017-09-07 10:29:20

by Ingo Molnar

Subject: Re: [RFC 01/17] x86/asm/64: Remove the restore_c_regs_and_iret label


* Borislav Petkov <[email protected]> wrote:

> On Thu, Sep 07, 2017 at 11:49:16AM +0200, Ingo Molnar wrote:
> > Actually, scratch that, 1c9fe4409ce3 is probably the best base, it includes the
> > PCID fixes. Haven't had much time to test that base though.
>
> Ok, I'll use that and scream if something's sh*tting in its pants.

not the best of kernels, 32-bit allyesconfig doesn't even appear to build:

net/netfilter/xt_hashlimit.o: In function `hashlimit_mt_common.isra.6':
xt_hashlimit.c:(.text+0x1146): undefined reference to `__udivdi3'


:-/

Thanks,

Ingo

2017-09-07 18:39:18

by Andy Lutomirski

Subject: Re: [RFC 08/17] x86/asm/64: De-Xen-ify our NMI code

On Thu, Sep 7, 2017 at 2:34 AM, Juergen Gross <[email protected]> wrote:
> On 06/09/17 23:36, Andy Lutomirski wrote:
>> Xen PV is fundamentally incompatible with our fancy NMI code: it
>> doesn't use IST at all, and Xen entries clobber two stack slots
>> below the hardware frame.
>>
>> Drop Xen PV support from our NMI code entirely.
>>
>> XXX: Juergen: could you write and test the tiny patch needed to
>> make Xen PV have a xen_nmi entry that handles NMIs? I don't know
>> how to test it.
>
> You mean something like the attached one?

Yes. Mind if I add it to my series?

>
> Seems to work at least for the "normal" case of a NMI coming in at
> a random point in time.
>
> Regarding testing: in case you have a Xen setup you can easily send
> a NMI to a domain from dom0:
>
> xl trigger <domain> nmi

Thanks!

>
>
> Juergen
>
>>
>> Cc: Juergen Gross <[email protected]>
>> Cc: Boris Ostrovsky <[email protected]>
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>> arch/x86/entry/entry_64.S | 41 +++++++++++++++++------------------------
>> 1 file changed, 17 insertions(+), 24 deletions(-)
>>
>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>> index a9e318f7cc9b..c81e05fb999e 100644
>> --- a/arch/x86/entry/entry_64.S
>> +++ b/arch/x86/entry/entry_64.S
>> @@ -1171,21 +1171,12 @@ ENTRY(error_exit)
>> jmp retint_user
>> END(error_exit)
>>
>> -/* Runs on exception stack */
>> +/*
>> + * Runs on exception stack. Xen PV does not go through this path at all,
>> + * so we can use real assembly here.
>> + */
>> ENTRY(nmi)
>> /*
>> - * Fix up the exception frame if we're on Xen.
>> - * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
>> - * one value to the stack on native, so it may clobber the rdx
>> - * scratch slot, but it won't clobber any of the important
>> - * slots past it.
>> - *
>> - * Xen is a different story, because the Xen frame itself overlaps
>> - * the "NMI executing" variable.
>> - */
>> - PARAVIRT_ADJUST_EXCEPTION_FRAME
>> -
>> - /*
>> * We allow breakpoints in NMIs. If a breakpoint occurs, then
>> * the iretq it performs will take us out of NMI context.
>> * This means that we can have nested NMIs where the next
>> @@ -1240,7 +1231,7 @@ ENTRY(nmi)
>> * stacks lest we corrupt the "NMI executing" variable.
>> */
>>
>> - SWAPGS_UNSAFE_STACK
>> + swapgs
>> cld
>> movq %rsp, %rdx
>> movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
>> @@ -1402,7 +1393,7 @@ nested_nmi_out:
>> popq %rdx
>>
>> /* We are returning to kernel mode, so this cannot result in a fault. */
>> - INTERRUPT_RETURN
>> + iretq
>>
>> first_nmi:
>> /* Restore rdx. */
>> @@ -1432,7 +1423,7 @@ first_nmi:
>> pushfq /* RFLAGS */
>> pushq $__KERNEL_CS /* CS */
>> pushq $1f /* RIP */
>> - INTERRUPT_RETURN /* continues at repeat_nmi below */
>> + iretq /* continues at repeat_nmi below */
>> 1:
>> #endif
>>
>> @@ -1502,20 +1493,22 @@ nmi_restore:
>> /*
>> * Clear "NMI executing". Set DF first so that we can easily
>> * distinguish the remaining code between here and IRET from
>> - * the SYSCALL entry and exit paths. On a native kernel, we
>> - * could just inspect RIP, but, on paravirt kernels,
>> - * INTERRUPT_RETURN can translate into a jump into a
>> - * hypercall page.
>> + * the SYSCALL entry and exit paths.
>> + *
>> + * We arguably should just inspect RIP instead, but I (Andy) wrote
>> + * this code when I had the misapprehension that Xen PV supported
>> + * NMIs, and Xen PV would break that approach.
>> */
>> std
>> movq $0, 5*8(%rsp) /* clear "NMI executing" */
>>
>> /*
>> - * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
>> - * stack in a single instruction. We are returning to kernel
>> - * mode, so this cannot result in a fault.
>> + * iretq reads the "iret" frame and exits the NMI stack in a
>> + * single instruction. We are returning to kernel mode, so this
>> + * cannot result in a fault. Similarly, we don't need to worry
>> + * about espfix64 on the way back to kernel mode.
>> */
>> - INTERRUPT_RETURN
>> + iretq
>> END(nmi)
>>
>> ENTRY(ignore_sysret)
>>
>

2017-09-08 04:26:50

by Jürgen Groß

[permalink] [raw]
Subject: Re: [RFC 08/17] x86/asm/64: De-Xen-ify our NMI code

On 07/09/17 20:38, Andy Lutomirski wrote:
> On Thu, Sep 7, 2017 at 2:34 AM, Juergen Gross <[email protected]> wrote:
>> On 06/09/17 23:36, Andy Lutomirski wrote:
>>> Xen PV is fundamentally incompatible with our fancy NMI code: it
>>> doesn't use IST at all, and Xen entries clobber two stack slots
>>> below the hardware frame.
>>>
>>> Drop Xen PV support from our NMI code entirely.
>>>
>>> XXX: Juergen: could you write and test the tiny patch needed to
>>> make Xen PV have a xen_nmi entry that handles NMIs? I don't know
>>> how to test it.
>>
>> You mean something like the attached one?
>
> Yes. Mind if I add it to my series?

Go ahead!


Juergen

2017-09-12 20:05:50

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [RFC 04/17] x86/asm/64: Simplify reg restore code in the standard IRET paths

On Wed, Sep 06, 2017 at 02:36:49PM -0700, Andy Lutomirski wrote:
> The old code restored all the registers with movq instead of pop.
> In theory, this was done because some CPUs have higher movq
> throughput, but any gain there would be tiny and is almost certainly
> outweighed by the higher text size.
>
> This saves 96 bytes of text.
>
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/entry/calling.h | 9 +++++++++
> arch/x86/entry/entry_64.S | 28 ++++++++++++++++++++++------
> 2 files changed, 31 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
> index 05ed3d393da7..0a2c73fe2cfc 100644
> --- a/arch/x86/entry/calling.h
> +++ b/arch/x86/entry/calling.h
> @@ -147,6 +147,15 @@ For 32-bit we have the following conventions - kernel is built with
> movq 5*8+\offset(%rsp), %rbx
> .endm
>
> + .macro POP_EXTRA_REGS
> + popq %r15
> + popq %r14
> + popq %r13
> + popq %r12
> + popq %rbp
> + popq %rbx
> + .endm
> +
> .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
> .if \rstor_r11
> movq 6*8(%rsp), %r11
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 2cd01ed9cd59..7f1a83b17b4a 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -521,9 +521,17 @@ GLOBAL(retint_user)
>
> GLOBAL(swapgs_restore_regs_and_return_to_usermode)
> SWAPGS
> - RESTORE_EXTRA_REGS
> - RESTORE_C_REGS
> - REMOVE_PT_GPREGS_FROM_STACK 8
> + POP_EXTRA_REGS
> + popq %r11
> + popq %r10
> + popq %r9
> + popq %r8
> + popq %rax
> + popq %rcx
> + popq %rdx
> + popq %rsi
> + popq %rdi
> + addq $8, %rsp
> INTERRUPT_RETURN
>
>
> @@ -546,9 +554,17 @@ retint_kernel:
> TRACE_IRQS_IRETQ
>
> GLOBAL(restore_regs_and_return_to_kernel)
> - RESTORE_EXTRA_REGS
> - RESTORE_C_REGS
> - REMOVE_PT_GPREGS_FROM_STACK 8
> + POP_EXTRA_REGS
> + popq %r11
> + popq %r10
> + popq %r9
> + popq %r8
> + popq %rax
> + popq %rcx
> + popq %rdx
> + popq %rsi
> + popq %rdi
> + addq $8, %rsp
> INTERRUPT_RETURN

Any reason why these aren't in a POP_C_REGS macro? I think that would
make it easier to verify correctness when reading the code.

--
Josh

2017-09-12 20:06:29

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [RFC 09/17] x86/asm/32: Pull MSR_IA32_SYSENTER_CS update code out of native_load_sp0()

On Wed, Sep 06, 2017 at 02:36:54PM -0700, Andy Lutomirski wrote:
> This causes the MSR_IA32_SYSENTER_CS write to move out of the
> paravirt hook. This shouldn't affect Xen or lguest: Xen already
> ignores MSR_IA32_SYSENTER_ESP writes and lguest doesn't support
> SYSENTER at all. In any event, Xen doesn't support vm86()
> in a useful way.

Thanks to Juergen, lguest is no more :-)

--
Josh

2017-09-12 20:09:54

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [RFC 12/17] x86/xen/64: Clean up SP code in cpu_initialize_context()

On Wed, Sep 06, 2017 at 02:36:57PM -0700, Andy Lutomirski wrote:
> @@ -293,12 +294,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
> #endif
> memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
>
> + /*
> + * Bring up the CPU in cpu_bringup_and_idle() with the stack
> + * pointing just below where pt_regs would be if it were a normal
> + * kernel entry.
> + */
> ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
> ctxt->flags = VGCF_IN_KERNEL;
> ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
> ctxt->user_regs.ds = __USER_DS;
> ctxt->user_regs.es = __USER_DS;
> ctxt->user_regs.ss = __KERNEL_DS;
> + ctxt->user_regs.cs = __KERNEL_CS;
> + ctxt->user_regs.esp = task_top_of_stack(idle) - sizeof(struct pt_regs);

Isn't this the same as task_pt_regs(idle)?

--
Josh
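
For reference, a minimal sketch of why the two expressions should match,
assuming task_top_of_stack() keeps the definition from the earlier patch in
this series:

	/* Assumed definition from the task_top_of_stack() patch: */
	#define task_top_of_stack(task)	((unsigned long)(task_pt_regs(task) + 1))

	/*
	 * Substituting into the quoted assignment:
	 *   task_top_of_stack(idle) - sizeof(struct pt_regs)
	 *   == (unsigned long)(task_pt_regs(idle) + 1) - sizeof(struct pt_regs)
	 *   == (unsigned long)task_pt_regs(idle)
	 */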

2017-09-12 20:11:45

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [RFC 17/17] x86/traps: Use a new on_thread_stack() helper to clean up an assertion

On Wed, Sep 06, 2017 at 02:37:02PM -0700, Andy Lutomirski wrote:
> Signed-off-by: Andy Lutomirski <[email protected]>
> ---
> arch/x86/include/asm/processor.h | 17 +++++++++++++++++
> arch/x86/include/asm/thread_info.h | 11 -----------
> arch/x86/kernel/traps.c | 3 +--
> 3 files changed, 18 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 4c137472f530..b6f8dc11c222 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -534,6 +534,23 @@ static inline unsigned long current_top_of_stack(void)
> #endif
> }
>
> +static inline unsigned long current_stack_pointer(void)
> +{
> + unsigned long sp;
> +#ifdef CONFIG_X86_64
> + asm("mov %%rsp,%0" : "=g" (sp));
> +#else
> + asm("mov %%esp,%0" : "=g" (sp));
> +#endif
> + return sp;
> +}

I know you're just moving the function, but this function could also be
cleaned up by using _ASM_SP and getting rid of the ifdef.

--
Josh
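
A minimal sketch of that cleanup, assuming _ASM_SP from <asm/asm.h> expands
to the bare stack pointer register name ("rsp" on 64-bit, "esp" on 32-bit)
when used from C:

	#include <asm/asm.h>		/* _ASM_SP */

	static inline unsigned long current_stack_pointer(void)
	{
		unsigned long sp;

		/* Same "=g" constraint as before; only the register name varies per arch. */
		asm("mov %%" _ASM_SP ", %0" : "=g" (sp));
		return sp;
	}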

2017-09-12 20:26:27

by Andrew Cooper

[permalink] [raw]
Subject: Re: [RFC 17/17] x86/traps: Use a new on_thread_stack() helper to clean up an assertion

On 12/09/2017 21:11, Josh Poimboeuf wrote:
> On Wed, Sep 06, 2017 at 02:37:02PM -0700, Andy Lutomirski wrote:
>> Signed-off-by: Andy Lutomirski <[email protected]>
>> ---
>> arch/x86/include/asm/processor.h | 17 +++++++++++++++++
>> arch/x86/include/asm/thread_info.h | 11 -----------
>> arch/x86/kernel/traps.c | 3 +--
>> 3 files changed, 18 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
>> index 4c137472f530..b6f8dc11c222 100644
>> --- a/arch/x86/include/asm/processor.h
>> +++ b/arch/x86/include/asm/processor.h
>> @@ -534,6 +534,23 @@ static inline unsigned long current_top_of_stack(void)
>> #endif
>> }
>>
>> +static inline unsigned long current_stack_pointer(void)
>> +{
>> + unsigned long sp;
>> +#ifdef CONFIG_X86_64
>> + asm("mov %%rsp,%0" : "=g" (sp));
>> +#else
>> + asm("mov %%esp,%0" : "=g" (sp));
>> +#endif
>> + return sp;
>> +}
> I know you're just moving the function, but this function could also be
> cleaned up by using _ASM_SP and getting rid of the ifdef.
>

For GCC, there is a rather more efficient way of doing this, which
allows the compiler much more flexibility than forcing the use of a mov
instruction.

http://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=xen/include/asm-x86/current.h;h=89849929ebf98e9ba04659f59b5521982a718b2d;hb=HEAD#l47

I don't know how much you care in Linux, but in Xen it makes a massive
code-volume difference, as that construct is the core of current
(because we can't use %fs/%gs).

~Andrew
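
The construct being referenced is, roughly, a local register variable bound
to the stack pointer, which lets GCC use %rsp/%esp directly (for example by
folding it into address arithmetic) instead of being forced to emit a mov.
A sketch of what that could look like in Linux terms, assuming _ASM_SP names
the bare register and that GCC honors the register binding outside of asm
operands:

	#include <asm/asm.h>		/* _ASM_SP: "rsp" on 64-bit, "esp" on 32-bit */

	static inline unsigned long current_stack_pointer(void)
	{
		/* Tell GCC this variable lives in the stack pointer register. */
		register unsigned long sp asm(_ASM_SP);

		return sp;
	}

A file-scope register variable bound the same way would express the same
idea without the wrapper function.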