2015-02-24 00:13:05

by Denys Vlasenko

[permalink] [raw]
Subject: [PATCH 1/6] x86: add comments about various syscall instructions, no code changes

SYSCALL/SYSRET and SYSENTER/SYSEXIT have weird semantics.
Moreover, they differ in 32- and 64-bit mode.
What is saved? What is not? Is rsp set? Are interrupts disabled?
People tend not to remember these details well enough.

This patch adds comments which explain in detail
which registers are modified by each of these instructions.
The comments are placed immediately before the corresponding
entry and exit points.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: X86 ML <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
---
arch/x86/ia32/ia32entry.S | 133 ++++++++++++++++++++++++++++-----------------
arch/x86/kernel/entry_64.S | 32 ++++++-----
2 files changed, 102 insertions(+), 63 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e99f8a5..b567056 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -99,22 +99,25 @@ ENDPROC(native_irq_enable_sysexit)
/*
* 32bit SYSENTER instruction entry.
*
+ * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
+ * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old rip (!!!) and rflags.
+ *
* Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp user stack
- * 0(%ebp) Arg6
- *
- * Interrupts off.
- *
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ *
* This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
+ * path below. We set up a complete hardware stack frame to share code
* with the int 0x80 path.
- */
+ */
ENTRY(ia32_sysenter_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
@@ -128,6 +131,7 @@ ENTRY(ia32_sysenter_target)
* disabled irqs, here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
+ /* Construct iret frame (ss,rsp,rflags,cs,rip) */
movl %ebp,%ebp /* zero extension */
pushq_cfi $__USER32_DS
/*CFI_REL_OFFSET ss,0*/
@@ -140,14 +144,19 @@ ENTRY(ia32_sysenter_target)
pushq_cfi $__USER32_CS
/*CFI_REL_OFFSET cs,0*/
movl %eax, %eax
+ /* Store thread_info->sysenter_return in rip stack slot */
pushq_cfi %r10
CFI_REL_OFFSET rip,0
+ /* Store orig_ax */
pushq_cfi %rax
+ /* Construct the rest of "struct pt_regs" */
cld
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS_EXCEPT_R891011
- /* no need to do an access_ok check here because rbp has been
- 32bit zero extended */
+ /*
+ * no need to do an access_ok check here because rbp has been
+ * 32bit zero extended
+ */
ASM_STAC
1: movl (%rbp),%ebp
_ASM_EXTABLE(1b,ia32_badarg)
@@ -184,6 +193,7 @@ sysexit_from_sys_call:
movl RIP(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
RESTORE_RSI_RDI
+ /* pop everything except ss,rsp,rflags slots */
REMOVE_PT_GPREGS_FROM_STACK 3*8
xorq %r8,%r8
xorq %r9,%r9
@@ -194,6 +204,10 @@ sysexit_from_sys_call:
popq_cfi %rcx /* User %esp */
CFI_REGISTER rsp,rcx
TRACE_IRQS_ON
+ /*
+ * 32bit SYSEXIT restores eip from edx, esp from ecx.
+ * cs and ss are loaded from MSRs.
+ */
ENABLE_INTERRUPTS_SYSEXIT32

CFI_RESTORE_STATE
@@ -274,23 +288,33 @@ ENDPROC(ia32_sysenter_target)
/*
* 32bit SYSCALL instruction entry.
*
+ * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
* Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx return EIP
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
- * %esp user stack
- * 0(%esp) Arg6
- *
- * Interrupts off.
- *
+ * eax system call number
+ * ecx return address
+ * ebx arg1
+ * ebp arg2 (note: not saved in the stack frame, should not be touched)
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * esp user stack
+ * 0(%esp) arg6
+ *
* This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
ENTRY(ia32_cstar_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
@@ -306,7 +330,7 @@ ENTRY(ia32_cstar_target)
* disabled irqs and here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- ALLOC_PT_GPREGS_ON_STACK 8
+ ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */
SAVE_C_REGS_EXCEPT_RCX_R891011
movl %eax,%eax /* zero extension */
movq %rax,ORIG_RAX(%rsp)
@@ -320,9 +344,11 @@ ENTRY(ia32_cstar_target)
/*CFI_REL_OFFSET rflags,EFLAGS*/
movq %r8,RSP(%rsp)
CFI_REL_OFFSET rsp,RSP
- /* no need to do an access_ok check here because r8 has been
- 32bit zero extended */
- /* hardware stack frame is complete now */
+ /* iret stack frame is complete now */
+ /*
+ * no need to do an access_ok check here because r8 has been
+ * 32bit zero extended
+ */
ASM_STAC
1: movl (%r8),%r9d
_ASM_EXTABLE(1b,ia32_badarg)
@@ -355,8 +381,15 @@ sysretl_from_sys_call:
TRACE_IRQS_ON
movl RSP(%rsp),%esp
CFI_RESTORE rsp
+ /*
+ * 64bit->32bit SYSRET restores eip from ecx,
+ * eflags from r11 (but RF and VM bits are forced to 0),
+ * cs and ss are loaded from MSRs.
+ * (Note: 32bit->32bit SYSRET is different: since r11
+ * does not exist, it merely sets eflags.IF=1).
+ */
USERGS_SYSRET32
-
+
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
CFI_RESTORE_STATE
@@ -394,26 +427,26 @@ ia32_badarg:
jmp ia32_sysret
CFI_ENDPROC

-/*
- * Emulated IA32 system calls via int 0x80.
+/*
+ * Emulated IA32 system calls via int 0x80.
*
- * Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp arg6 (note: not saved in the stack frame, should not be touched)
*
* Notes:
- * Uses the same stack frame as the x86-64 version.
- * All registers except %eax must be saved (but ptrace may violate that)
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except eax must be saved (but ptrace may violate that).
* Arguments are zero extended. For system calls that want sign extension and
* take long arguments a wrapper is needed. Most calls can just be called
* directly.
- * Assumes it is only called from user space and entered with interrupts off.
- */
+ * Assumes it is only called from user space and entered with interrupts off.
+ */

ENTRY(ia32_syscall)
CFI_STARTPROC32 simple
@@ -432,7 +465,7 @@ ENTRY(ia32_syscall)
*/
ENABLE_INTERRUPTS(CLBR_NONE)
movl %eax,%eax
- pushq_cfi %rax
+ pushq_cfi %rax /* store orig_ax */
cld
/* note the registers are not zero extended to the sf.
this could be a problem. */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index be2b14c..63e7ccd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -256,25 +256,25 @@ ENTRY(ret_from_fork)
END(ret_from_fork)

/*
- * System call entry. Up to 6 arguments in registers are supported.
+ * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer. However, it does mask the flags register for us, so
- * CLD and CLAC are not needed.
- */
-
-/*
- * Register setup:
+ * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Registers on entry:
* rax system call number
+ * rcx return address
+ * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
- * rcx return address for syscall/sysret, C arg3
* rsi arg1
* rdx arg2
- * r10 arg3 (--> moved to rcx for C)
+ * r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
- * r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
+ * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
*
* Interrupts are off on entry.
* Only called from user space.
@@ -302,13 +302,14 @@ ENTRY(system_call)
GLOBAL(system_call_after_swapgs)

movq %rsp,PER_CPU_VAR(old_rsp)
+ /* kernel_stack is set so that 5 slots (iret frame) are preallocated */
movq PER_CPU_VAR(kernel_stack),%rsp
/*
* No need to follow this irqs off/on section - it's straight
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- ALLOC_PT_GPREGS_ON_STACK 8
+ ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */
SAVE_C_REGS_EXCEPT_RAX_RCX
movq $-ENOSYS,RAX(%rsp)
movq_cfi rax,ORIG_RAX
@@ -348,6 +349,11 @@ ret_from_sys_call:
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
movq PER_CPU_VAR(old_rsp), %rsp
+ /*
+ * 64bit SYSRET restores rip from rcx,
+ * rflags from r11 (but RF and VM bits are forced to 0),
+ * cs and ss are loaded from MSRs.
+ */
USERGS_SYSRET64

CFI_RESTORE_STATE
--
1.8.1.4


2015-02-24 00:13:11

by Denys Vlasenko

[permalink] [raw]
Subject: [PATCH 2/6] x86: entry_64.S: move save_paranoid and ret_from_fork closer to their users

For some odd reason, these two functions are at the very top of the file.
save_paranoid's caller is approximately in the middle of the file,
so move save_paranoid there.
Move ret_from_fork to be right after the fork/exec helpers.

This is a pure block move, nothing is changed in the function bodies.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: X86 ML <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
---
arch/x86/kernel/entry_64.S | 106 ++++++++++++++++++++++-----------------------
1 file changed, 53 insertions(+), 53 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 63e7ccd..71b549a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -202,59 +202,6 @@ ENDPROC(native_usergs_sysret64)
CFI_REL_OFFSET r15, R15+\offset
.endm

-ENTRY(save_paranoid)
- XCPT_FRAME 1 RDI+8
- cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx,%ebx
-1: ret
- CFI_ENDPROC
-END(save_paranoid)
-
-/*
- * A newly forked process directly context switches into this address.
- *
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
- DEFAULT_FRAME
-
- LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
- pushq_cfi $0x0002
- popfq_cfi # reset kernel eflags
-
- call schedule_tail # rdi: 'prev' task parameter
-
- GET_THREAD_INFO(%rcx)
-
- RESTORE_EXTRA_REGS
-
- testl $3,CS(%rsp) # from kernel_thread?
- jz 1f
-
- testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
- jnz int_ret_from_sys_call
-
- RESTORE_TOP_OF_STACK %rdi
- jmp ret_from_sys_call # go to the SYSRET fastpath
-
-1:
- movq %rbp, %rdi
- call *%rbx
- movl $0, RAX(%rsp)
- RESTORE_EXTRA_REGS
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(ret_from_fork)
-
/*
* 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
@@ -581,6 +528,43 @@ END(stub_x32_execveat)
#endif

/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+ DEFAULT_FRAME
+
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+ pushq_cfi $0x0002
+ popfq_cfi # reset kernel eflags
+
+ call schedule_tail # rdi: 'prev' task parameter
+
+ GET_THREAD_INFO(%rcx)
+
+ RESTORE_EXTRA_REGS
+
+ testl $3,CS(%rsp) # from kernel_thread?
+ jz 1f
+
+ testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
+ jnz int_ret_from_sys_call
+
+ RESTORE_TOP_OF_STACK %rdi
+ jmp ret_from_sys_call # go to the SYSRET fastpath
+
+1:
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
+ RESTORE_EXTRA_REGS
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(ret_from_fork)
+
+/*
* Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a
* single cache line on all modern x86 implementations.
@@ -1269,6 +1253,22 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif

+ENTRY(save_paranoid)
+ XCPT_FRAME 1 RDI+8
+ cld
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+ CFI_ENDPROC
+END(save_paranoid)
+
/*
* "Paranoid" exit path from exception stack. This is invoked
* only on return from non-NMI IST interrupts that came
--
1.8.1.4

2015-02-24 00:13:27

by Denys Vlasenko

[permalink] [raw]
Subject: [PATCH 3/6] x86: entry_64.S: rename save_paranoid to paranoid_entry, no code changes

This patch does a lot of cleanup in comments and formatting,
but it does not change any code.

Rename save_paranoid to paranoid_entry: this makes naming
similar to its "non-paranoid" sibling, error_entry,
and to its counterpart, paranoid_exit.

Use the same CFI annotation atop paranoid_entry and error_entry.

Fix irregular indentation of assembler operands.

Add/fix comments on top of paranoid_entry and error_entry.
Remove stale comment about "oldrax".
Make comments about "no swapgs" flag in ebx more prominent.
Deindent wrongly indented top-level comment atop paranoid_exit.
Indent wrongly deindented comment inside error_entry.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: X86 ML <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
---
arch/x86/kernel/entry_64.S | 68 ++++++++++++++++++++++++----------------------
1 file changed, 36 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 71b549a..03498d0 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -981,10 +981,11 @@ ENTRY(\sym)
testl $3, CS(%rsp) /* If coming from userspace, switch */
jnz 1f /* stacks. */
.endif
- call save_paranoid
+ call paranoid_entry
.else
call error_entry
.endif
+ /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */

DEFAULT_FRAME 0

@@ -1015,10 +1016,11 @@ ENTRY(\sym)
addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
.endif

+ /* these procedures expect "no swapgs" flag in ebx */
.if \paranoid
- jmp paranoid_exit /* %ebx: no swapgs flag */
+ jmp paranoid_exit
.else
- jmp error_exit /* %ebx: no swapgs flag */
+ jmp error_exit
.endif

.if \paranoid == 1
@@ -1253,8 +1255,13 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif

-ENTRY(save_paranoid)
- XCPT_FRAME 1 RDI+8
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(paranoid_entry)
+ XCPT_FRAME 1 15*8
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
@@ -1267,20 +1274,19 @@ ENTRY(save_paranoid)
xorl %ebx,%ebx
1: ret
CFI_ENDPROC
-END(save_paranoid)
-
- /*
- * "Paranoid" exit path from exception stack. This is invoked
- * only on return from non-NMI IST interrupts that came
- * from kernel space.
- *
- * We may be returning to very strange contexts (e.g. very early
- * in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, we there's no good reason
- * to try to handle preemption here.
- */
+END(paranoid_entry)

- /* ebx: no swapgs flag */
+/*
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
+ */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1301,13 +1307,11 @@ paranoid_exit_restore:
END(paranoid_exit)

/*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * returns in "no swapgs flag" in %ebx.
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(error_entry)
- XCPT_FRAME
- CFI_ADJUST_CFA_OFFSET 15*8
- /* oldrax contains error code */
+ XCPT_FRAME 1 15*8
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
@@ -1320,12 +1324,12 @@ error_sti:
TRACE_IRQS_OFF
ret

-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
error_kernelspace:
CFI_REL_OFFSET rcx, RCX+8
incl %ebx
@@ -1355,7 +1359,7 @@ error_bad_iret:
END(error_entry)


-/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(error_exit)
DEFAULT_FRAME
movl %ebx,%eax
@@ -1581,13 +1585,13 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK

/*
- * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+ * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context.
* Even with normal interrupts enabled. An NMI should not be
* setting NEED_RESCHED or anything that normal interrupts and
* exceptions might do.
*/
- call save_paranoid
+ call paranoid_entry
DEFAULT_FRAME 0

/*
--
1.8.1.4

2015-02-24 00:13:33

by Denys Vlasenko

[permalink] [raw]
Subject: [PATCH 4/6] x86: entry_64.S: fold test_in_nmi macro into its only user

No code changes.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: X86 ML <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
---
arch/x86/kernel/entry_64.S | 24 +++++++++---------------
1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 03498d0..c628a1b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1378,19 +1378,7 @@ ENTRY(error_exit)
CFI_ENDPROC
END(error_exit)

-/*
- * Test if a given stack is an NMI stack or not.
- */
- .macro test_in_nmi reg stack nmi_ret normal_ret
- cmpq %\reg, \stack
- ja \normal_ret
- subq $EXCEPTION_STKSZ, %\reg
- cmpq %\reg, \stack
- jb \normal_ret
- jmp \nmi_ret
- .endm
-
- /* runs on exception stack */
+/* Runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1451,8 +1439,14 @@ ENTRY(nmi)
* We check the variable because the first NMI could be in a
* breakpoint routine using a breakpoint stack.
*/
- lea 6*8(%rsp), %rdx
- test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+ lea 6*8(%rsp), %rdx
+ cmpq %rdx, 4*8(%rsp)
+ ja first_nmi
+ subq $EXCEPTION_STKSZ, %rdx
+ cmpq %rdx, 4*8(%rsp)
+ jb first_nmi
+ jmp nested_nmi
+
CFI_REMEMBER_STATE

nested_nmi:
--
1.8.1.4

2015-02-24 00:13:38

by Denys Vlasenko

[permalink] [raw]
Subject: [PATCH 5/6] x86: ia32entry.S: fold IA32_ARG_FIXUP macro into its callers

Use of a small macro - one with conditional expansion - does more harm
than good: it obfuscates the code while providing minimal code reuse.

For example, because of obfuscation it's not obvious that
in ia32_sysenter_target, we can optimize loading of r9 -
currently it is loaded with a detour through ebp.

This patch folds IA32_ARG_FIXUP macro into its callers.

No code changes.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: X86 ML <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
---
arch/x86/ia32/ia32entry.S | 32 ++++++++++++++++++--------------
1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index b567056..6dcd372 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -30,17 +30,6 @@

.section .entry.text, "ax"

- .macro IA32_ARG_FIXUP noebp=0
- movl %edi,%r8d
- .if \noebp
- .else
- movl %ebp,%r9d
- .endif
- xchg %ecx,%esi
- movl %ebx,%edi
- movl %edx,%edx /* zero extension */
- .endm
-
/* clobbers %rax */
.macro CLEAR_RREGS _r9=rax
xorl %eax,%eax
@@ -178,7 +167,12 @@ sysenter_flags_fixed:
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
sysenter_do_call:
- IA32_ARG_FIXUP
+ /* 32bit syscall -> 64bit C ABI argument conversion */
+ movl %edi,%r8d /* arg5 */
+ movl %ebp,%r9d /* arg6 */
+ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx,%edi /* arg1 */
+ movl %edx,%edx /* arg3 (zero extension) */
sysenter_dispatch:
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX(%rsp)
@@ -360,7 +354,12 @@ ENTRY(ia32_cstar_target)
cmpq $IA32_NR_syscalls-1,%rax
ja ia32_badsys
cstar_do_call:
- IA32_ARG_FIXUP 1
+ /* 32bit syscall -> 64bit C ABI argument conversion */
+ movl %edi,%r8d /* arg5 */
+ /* r9 already loaded */ /* arg6 */
+ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx,%edi /* arg1 */
+ movl %edx,%edx /* arg3 (zero extension) */
cstar_dispatch:
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX(%rsp)
@@ -477,7 +476,12 @@ ENTRY(ia32_syscall)
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
ia32_do_call:
- IA32_ARG_FIXUP
+ /* 32bit syscall -> 64bit C ABI argument conversion */
+ movl %edi,%r8d /* arg5 */
+ movl %ebp,%r9d /* arg6 */
+ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx,%edi /* arg1 */
+ movl %edx,%edx /* arg3 (zero extension) */
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
movq %rax,RAX(%rsp)
--
1.8.1.4

2015-02-24 00:13:47

by Denys Vlasenko

[permalink] [raw]
Subject: [PATCH 6/6] x86: entry_64.S: use more understandable constants

Constants such as SS+8 or SS+8-RIP are mysterious.
In most cases, SS+8 is just meant to be SIZEOF_PTREGS,
and SS+8-RIP is RIP's offset in the iret frame.

This patch changes some of these constants to be less mysterious.

No code changes (verified with objdump).

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: X86 ML <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
---
arch/x86/include/asm/calling.h | 2 ++
arch/x86/kernel/entry_64.S | 28 ++++++++++++++++------------
2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 4a7ceb9..3374235 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -88,6 +88,8 @@ For 32-bit we have the following conventions - kernel is built with
#define RSP 19*8
#define SS 20*8

+#define SIZEOF_PTREGS 21*8
+
.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
subq $15*8+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET 15*8+\addskip
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c628a1b..2fa9e59 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -164,12 +164,12 @@ ENDPROC(native_usergs_sysret64)
* initial frame state for interrupts (and exceptions without error code)
*/
.macro INTR_FRAME start=1 offset=0
- EMPTY_FRAME \start, SS+8+\offset-RIP
- /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
- CFI_REL_OFFSET rsp, RSP+\offset-RIP
- /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
- /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
- CFI_REL_OFFSET rip, RIP+\offset-RIP
+ EMPTY_FRAME \start, 5*8+\offset
+ /*CFI_REL_OFFSET ss, 4*8+\offset*/
+ CFI_REL_OFFSET rsp, 3*8+\offset
+ /*CFI_REL_OFFSET rflags, 2*8+\offset*/
+ /*CFI_REL_OFFSET cs, 1*8+\offset*/
+ CFI_REL_OFFSET rip, 0*8+\offset
.endm

/*
@@ -177,7 +177,7 @@ ENDPROC(native_usergs_sysret64)
* with vector already pushed)
*/
.macro XCPT_FRAME start=1 offset=0
- INTR_FRAME \start, RIP+\offset-ORIG_RAX
+ INTR_FRAME \start, 1*8+\offset
.endm

/*
@@ -644,10 +644,14 @@ END(interrupt)
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
pushq %rsi
+ /*
+ * For debugger:
+ * "CFA (Current Frame Address) is the value on stack + offset"
+ */
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 */, 0, \
+ 0x77 /* DW_OP_breg7 (rsp) */, 0, \
0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
0x22 /* DW_OP_plus */
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
@@ -673,7 +677,7 @@ ret_from_intr:

/* Restore saved previous stack */
popq %rsi
- CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
+ CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
/* return code expects complete pt_regs - adjust rsp accordingly: */
leaq -RBP(%rsi),%rsp
CFI_DEF_CFA_REGISTER rsp
@@ -1539,7 +1543,7 @@ first_nmi:
.rept 5
pushq_cfi 11*8(%rsp)
.endr
- CFI_DEF_CFA_OFFSET SS+8-RIP
+ CFI_DEF_CFA_OFFSET 5*8

/* Everything up to here is safe from nested NMIs */

@@ -1567,7 +1571,7 @@ repeat_nmi:
pushq_cfi -6*8(%rsp)
.endr
subq $(5*8), %rsp
- CFI_DEF_CFA_OFFSET SS+8-RIP
+ CFI_DEF_CFA_OFFSET 5*8
end_repeat_nmi:

/*
--
1.8.1.4

2015-02-24 00:34:40

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 4/6] x86: entry_64.S: fold test_in_nmi macro into its only user

On Mon, Feb 23, 2015 at 4:12 PM, Denys Vlasenko <[email protected]> wrote:
> No code changes.

This needs to address Steven's comments, I think, and I'd like his ack, too.

--Andy

>
> Signed-off-by: Denys Vlasenko <[email protected]>
> CC: Linus Torvalds <[email protected]>
> CC: Oleg Nesterov <[email protected]>
> CC: Borislav Petkov <[email protected]>
> CC: "H. Peter Anvin" <[email protected]>
> CC: Andy Lutomirski <[email protected]>
> CC: Frederic Weisbecker <[email protected]>
> CC: X86 ML <[email protected]>
> CC: Alexei Starovoitov <[email protected]>
> CC: Will Drewry <[email protected]>
> CC: Kees Cook <[email protected]>
> CC: [email protected]
> ---
> arch/x86/kernel/entry_64.S | 24 +++++++++---------------
> 1 file changed, 9 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 03498d0..c628a1b 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -1378,19 +1378,7 @@ ENTRY(error_exit)
> CFI_ENDPROC
> END(error_exit)
>
> -/*
> - * Test if a given stack is an NMI stack or not.
> - */
> - .macro test_in_nmi reg stack nmi_ret normal_ret
> - cmpq %\reg, \stack
> - ja \normal_ret
> - subq $EXCEPTION_STKSZ, %\reg
> - cmpq %\reg, \stack
> - jb \normal_ret
> - jmp \nmi_ret
> - .endm
> -
> - /* runs on exception stack */
> +/* Runs on exception stack */
> ENTRY(nmi)
> INTR_FRAME
> PARAVIRT_ADJUST_EXCEPTION_FRAME
> @@ -1451,8 +1439,14 @@ ENTRY(nmi)
> * We check the variable because the first NMI could be in a
> * breakpoint routine using a breakpoint stack.
> */
> - lea 6*8(%rsp), %rdx
> - test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
> + lea 6*8(%rsp), %rdx
> + cmpq %rdx, 4*8(%rsp)
> + ja first_nmi
> + subq $EXCEPTION_STKSZ, %rdx
> + cmpq %rdx, 4*8(%rsp)
> + jb first_nmi
> + jmp nested_nmi
> +
> CFI_REMEMBER_STATE
>
> nested_nmi:
> --
> 1.8.1.4
>



--
Andy Lutomirski
AMA Capital Management, LLC

2015-02-24 00:35:55

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 5/6] x86: ia32entry.S: fold IA32_ARG_FIXUP macro into its callers

On Mon, Feb 23, 2015 at 4:12 PM, Denys Vlasenko <[email protected]> wrote:
> Use of a small macro - one with conditional expansion - does more harm
> than good. It obfuscates code, with minimal code reuse.
>
> For example, because of obfuscation it's not obvious that
> in ia32_sysenter_target, we can optimize loading of r9 -
> currently it is loaded with a detour through ebp.
>
> This patch folds IA32_ARG_FIXUP macro into its callers.
>
> No code changes.
>

Applied.

> Signed-off-by: Denys Vlasenko <[email protected]>
> CC: Linus Torvalds <[email protected]>
> CC: Oleg Nesterov <[email protected]>
> CC: Borislav Petkov <[email protected]>
> CC: "H. Peter Anvin" <[email protected]>
> CC: Andy Lutomirski <[email protected]>
> CC: Frederic Weisbecker <[email protected]>
> CC: X86 ML <[email protected]>
> CC: Alexei Starovoitov <[email protected]>
> CC: Will Drewry <[email protected]>
> CC: Kees Cook <[email protected]>
> CC: [email protected]
> ---
> arch/x86/ia32/ia32entry.S | 32 ++++++++++++++++++--------------
> 1 file changed, 18 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index b567056..6dcd372 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -30,17 +30,6 @@
>
> .section .entry.text, "ax"
>
> - .macro IA32_ARG_FIXUP noebp=0
> - movl %edi,%r8d
> - .if \noebp
> - .else
> - movl %ebp,%r9d
> - .endif
> - xchg %ecx,%esi
> - movl %ebx,%edi
> - movl %edx,%edx /* zero extension */
> - .endm
> -
> /* clobbers %rax */
> .macro CLEAR_RREGS _r9=rax
> xorl %eax,%eax
> @@ -178,7 +167,12 @@ sysenter_flags_fixed:
> cmpq $(IA32_NR_syscalls-1),%rax
> ja ia32_badsys
> sysenter_do_call:
> - IA32_ARG_FIXUP
> + /* 32bit syscall -> 64bit C ABI argument conversion */
> + movl %edi,%r8d /* arg5 */
> + movl %ebp,%r9d /* arg6 */
> + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
> + movl %ebx,%edi /* arg1 */
> + movl %edx,%edx /* arg3 (zero extension) */
> sysenter_dispatch:
> call *ia32_sys_call_table(,%rax,8)
> movq %rax,RAX(%rsp)
> @@ -360,7 +354,12 @@ ENTRY(ia32_cstar_target)
> cmpq $IA32_NR_syscalls-1,%rax
> ja ia32_badsys
> cstar_do_call:
> - IA32_ARG_FIXUP 1
> + /* 32bit syscall -> 64bit C ABI argument conversion */
> + movl %edi,%r8d /* arg5 */
> + /* r9 already loaded */ /* arg6 */
> + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
> + movl %ebx,%edi /* arg1 */
> + movl %edx,%edx /* arg3 (zero extension) */
> cstar_dispatch:
> call *ia32_sys_call_table(,%rax,8)
> movq %rax,RAX(%rsp)
> @@ -477,7 +476,12 @@ ENTRY(ia32_syscall)
> cmpq $(IA32_NR_syscalls-1),%rax
> ja ia32_badsys
> ia32_do_call:
> - IA32_ARG_FIXUP
> + /* 32bit syscall -> 64bit C ABI argument conversion */
> + movl %edi,%r8d /* arg5 */
> + movl %ebp,%r9d /* arg6 */
> + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
> + movl %ebx,%edi /* arg1 */
> + movl %edx,%edx /* arg3 (zero extension) */
> call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
> ia32_sysret:
> movq %rax,RAX(%rsp)
> --
> 1.8.1.4
>



--
Andy Lutomirski
AMA Capital Management, LLC

2015-02-24 00:37:37

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 6/6] x86: entry_64.S: use more understandable constants

On Mon, Feb 23, 2015 at 4:12 PM, Denys Vlasenko <[email protected]> wrote:
> Constants such as SS+8 or SS+8-RIP are mysterious.
> In most cases, SS+8 is just meant to be SIZEOF_PTREGS,
> and SS+8-RIP is RIP's offset in the iret frame.
>
> This patch changes some of these constants to be less mysterious.
>
> No code changes (verified with objdump).

Applied.

>
> Signed-off-by: Denys Vlasenko <[email protected]>
> CC: Linus Torvalds <[email protected]>
> CC: Oleg Nesterov <[email protected]>
> CC: Borislav Petkov <[email protected]>
> CC: "H. Peter Anvin" <[email protected]>
> CC: Andy Lutomirski <[email protected]>
> CC: Frederic Weisbecker <[email protected]>
> CC: X86 ML <[email protected]>
> CC: Alexei Starovoitov <[email protected]>
> CC: Will Drewry <[email protected]>
> CC: Kees Cook <[email protected]>
> CC: [email protected]
> ---
> arch/x86/include/asm/calling.h | 2 ++
> arch/x86/kernel/entry_64.S | 28 ++++++++++++++++------------
> 2 files changed, 18 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
> index 4a7ceb9..3374235 100644
> --- a/arch/x86/include/asm/calling.h
> +++ b/arch/x86/include/asm/calling.h
> @@ -88,6 +88,8 @@ For 32-bit we have the following conventions - kernel is built with
> #define RSP 19*8
> #define SS 20*8
>
> +#define SIZEOF_PTREGS 21*8
> +
> .macro ALLOC_PT_GPREGS_ON_STACK addskip=0
> subq $15*8+\addskip, %rsp
> CFI_ADJUST_CFA_OFFSET 15*8+\addskip
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index c628a1b..2fa9e59 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -164,12 +164,12 @@ ENDPROC(native_usergs_sysret64)
> * initial frame state for interrupts (and exceptions without error code)
> */
> .macro INTR_FRAME start=1 offset=0
> - EMPTY_FRAME \start, SS+8+\offset-RIP
> - /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
> - CFI_REL_OFFSET rsp, RSP+\offset-RIP
> - /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
> - /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
> - CFI_REL_OFFSET rip, RIP+\offset-RIP
> + EMPTY_FRAME \start, 5*8+\offset
> + /*CFI_REL_OFFSET ss, 4*8+\offset*/
> + CFI_REL_OFFSET rsp, 3*8+\offset
> + /*CFI_REL_OFFSET rflags, 2*8+\offset*/
> + /*CFI_REL_OFFSET cs, 1*8+\offset*/
> + CFI_REL_OFFSET rip, 0*8+\offset
> .endm
>
> /*
> @@ -177,7 +177,7 @@ ENDPROC(native_usergs_sysret64)
> * with vector already pushed)
> */
> .macro XCPT_FRAME start=1 offset=0
> - INTR_FRAME \start, RIP+\offset-ORIG_RAX
> + INTR_FRAME \start, 1*8+\offset
> .endm
>
> /*
> @@ -644,10 +644,14 @@ END(interrupt)
> cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
> CFI_DEF_CFA_REGISTER rsi
> pushq %rsi
> + /*
> + * For debugger:
> + * "CFA (Current Frame Address) is the value on stack + offset"
> + */
> CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
> - 0x77 /* DW_OP_breg7 */, 0, \
> + 0x77 /* DW_OP_breg7 (rsp) */, 0, \
> 0x06 /* DW_OP_deref */, \
> - 0x08 /* DW_OP_const1u */, SS+8-RBP, \
> + 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
> 0x22 /* DW_OP_plus */
> /* We entered an interrupt context - irqs are off: */
> TRACE_IRQS_OFF
> @@ -673,7 +677,7 @@ ret_from_intr:
>
> /* Restore saved previous stack */
> popq %rsi
> - CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
> + CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
> /* return code expects complete pt_regs - adjust rsp accordingly: */
> leaq -RBP(%rsi),%rsp
> CFI_DEF_CFA_REGISTER rsp
> @@ -1539,7 +1543,7 @@ first_nmi:
> .rept 5
> pushq_cfi 11*8(%rsp)
> .endr
> - CFI_DEF_CFA_OFFSET SS+8-RIP
> + CFI_DEF_CFA_OFFSET 5*8
>
> /* Everything up to here is safe from nested NMIs */
>
> @@ -1567,7 +1571,7 @@ repeat_nmi:
> pushq_cfi -6*8(%rsp)
> .endr
> subq $(5*8), %rsp
> - CFI_DEF_CFA_OFFSET SS+8-RIP
> + CFI_DEF_CFA_OFFSET 5*8
> end_repeat_nmi:
>
> /*
> --
> 1.8.1.4
>



--
Andy Lutomirski
AMA Capital Management, LLC

2015-02-24 00:37:49

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 1/6] x86: add comments about various syscall instructions, no code changes

On Mon, Feb 23, 2015 at 4:12 PM, Denys Vlasenko <[email protected]> wrote:
> SYSCALL/SYSRET and SYSENTER/SYSEXIT have weird semantics.
> Moreover, they differ in 32- and 64-bit mode.
> What is saved? What is not? Is rsp set? Are interrupts disabled?
> People tend to not remember these details well enough.
>
> This patch adds comments which explain in detail
> what registers are modified by each of these instructions.
> The comments are placed immediately before corresponding
> entry and exit points.

Applied.

>
> Signed-off-by: Denys Vlasenko <[email protected]>
> CC: Linus Torvalds <[email protected]>
> CC: Oleg Nesterov <[email protected]>
> CC: Borislav Petkov <[email protected]>
> CC: "H. Peter Anvin" <[email protected]>
> CC: Andy Lutomirski <[email protected]>
> CC: Frederic Weisbecker <[email protected]>
> CC: X86 ML <[email protected]>
> CC: Alexei Starovoitov <[email protected]>
> CC: Will Drewry <[email protected]>
> CC: Kees Cook <[email protected]>
> CC: [email protected]
> ---
> arch/x86/ia32/ia32entry.S | 133 ++++++++++++++++++++++++++++-----------------
> arch/x86/kernel/entry_64.S | 32 ++++++-----
> 2 files changed, 102 insertions(+), 63 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index e99f8a5..b567056 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -99,22 +99,25 @@ ENDPROC(native_irq_enable_sysexit)
> /*
> * 32bit SYSENTER instruction entry.
> *
> + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
> + * IF and VM in rflags are cleared (IOW: interrupts are off).
> + * SYSENTER does not save anything on the stack,
> + * and does not save old rip (!!!) and rflags.
> + *
> * Arguments:
> - * %eax System call number.
> - * %ebx Arg1
> - * %ecx Arg2
> - * %edx Arg3
> - * %esi Arg4
> - * %edi Arg5
> - * %ebp user stack
> - * 0(%ebp) Arg6
> - *
> - * Interrupts off.
> - *
> + * eax system call number
> + * ebx arg1
> + * ecx arg2
> + * edx arg3
> + * esi arg4
> + * edi arg5
> + * ebp user stack
> + * 0(%ebp) arg6
> + *
> * This is purely a fast path. For anything complicated we use the int 0x80
> - * path below. Set up a complete hardware stack frame to share code
> + * path below. We set up a complete hardware stack frame to share code
> * with the int 0x80 path.
> - */
> + */
> ENTRY(ia32_sysenter_target)
> CFI_STARTPROC32 simple
> CFI_SIGNAL_FRAME
> @@ -128,6 +131,7 @@ ENTRY(ia32_sysenter_target)
> * disabled irqs, here we enable it straight after entry:
> */
> ENABLE_INTERRUPTS(CLBR_NONE)
> + /* Construct iret frame (ss,rsp,rflags,cs,rip) */
> movl %ebp,%ebp /* zero extension */
> pushq_cfi $__USER32_DS
> /*CFI_REL_OFFSET ss,0*/
> @@ -140,14 +144,19 @@ ENTRY(ia32_sysenter_target)
> pushq_cfi $__USER32_CS
> /*CFI_REL_OFFSET cs,0*/
> movl %eax, %eax
> + /* Store thread_info->sysenter_return in rip stack slot */
> pushq_cfi %r10
> CFI_REL_OFFSET rip,0
> + /* Store orig_ax */
> pushq_cfi %rax
> + /* Construct the rest of "struct pt_regs" */
> cld
> ALLOC_PT_GPREGS_ON_STACK
> SAVE_C_REGS_EXCEPT_R891011
> - /* no need to do an access_ok check here because rbp has been
> - 32bit zero extended */
> + /*
> + * no need to do an access_ok check here because rbp has been
> + * 32bit zero extended
> + */
> ASM_STAC
> 1: movl (%rbp),%ebp
> _ASM_EXTABLE(1b,ia32_badarg)
> @@ -184,6 +193,7 @@ sysexit_from_sys_call:
> movl RIP(%rsp),%edx /* User %eip */
> CFI_REGISTER rip,rdx
> RESTORE_RSI_RDI
> + /* pop everything except ss,rsp,rflags slots */
> REMOVE_PT_GPREGS_FROM_STACK 3*8
> xorq %r8,%r8
> xorq %r9,%r9
> @@ -194,6 +204,10 @@ sysexit_from_sys_call:
> popq_cfi %rcx /* User %esp */
> CFI_REGISTER rsp,rcx
> TRACE_IRQS_ON
> + /*
> + * 32bit SYSEXIT restores eip from edx, esp from ecx.
> + * cs and ss are loaded from MSRs.
> + */
> ENABLE_INTERRUPTS_SYSEXIT32
>
> CFI_RESTORE_STATE
> @@ -274,23 +288,33 @@ ENDPROC(ia32_sysenter_target)
> /*
> * 32bit SYSCALL instruction entry.
> *
> + * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
> + * then loads new ss, cs, and rip from previously programmed MSRs.
> + * rflags gets masked by a value from another MSR (so CLD and CLAC
> + * are not needed). SYSCALL does not save anything on the stack
> + * and does not change rsp.
> + *
> + * Note: rflags saving+masking-with-MSR happens only in Long mode
> + * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
> + * Don't get confused: rflags saving+masking depends on Long Mode Active bit
> + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
> + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
> + *
> * Arguments:
> - * %eax System call number.
> - * %ebx Arg1
> - * %ecx return EIP
> - * %edx Arg3
> - * %esi Arg4
> - * %edi Arg5
> - * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
> - * %esp user stack
> - * 0(%esp) Arg6
> - *
> - * Interrupts off.
> - *
> + * eax system call number
> + * ecx return address
> + * ebx arg1
> + * ebp arg2 (note: not saved in the stack frame, should not be touched)
> + * edx arg3
> + * esi arg4
> + * edi arg5
> + * esp user stack
> + * 0(%esp) arg6
> + *
> * This is purely a fast path. For anything complicated we use the int 0x80
> - * path below. Set up a complete hardware stack frame to share code
> - * with the int 0x80 path.
> - */
> + * path below. We set up a complete hardware stack frame to share code
> + * with the int 0x80 path.
> + */
> ENTRY(ia32_cstar_target)
> CFI_STARTPROC32 simple
> CFI_SIGNAL_FRAME
> @@ -306,7 +330,7 @@ ENTRY(ia32_cstar_target)
> * disabled irqs and here we enable it straight after entry:
> */
> ENABLE_INTERRUPTS(CLBR_NONE)
> - ALLOC_PT_GPREGS_ON_STACK 8
> + ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */
> SAVE_C_REGS_EXCEPT_RCX_R891011
> movl %eax,%eax /* zero extension */
> movq %rax,ORIG_RAX(%rsp)
> @@ -320,9 +344,11 @@ ENTRY(ia32_cstar_target)
> /*CFI_REL_OFFSET rflags,EFLAGS*/
> movq %r8,RSP(%rsp)
> CFI_REL_OFFSET rsp,RSP
> - /* no need to do an access_ok check here because r8 has been
> - 32bit zero extended */
> - /* hardware stack frame is complete now */
> + /* iret stack frame is complete now */
> + /*
> + * no need to do an access_ok check here because r8 has been
> + * 32bit zero extended
> + */
> ASM_STAC
> 1: movl (%r8),%r9d
> _ASM_EXTABLE(1b,ia32_badarg)
> @@ -355,8 +381,15 @@ sysretl_from_sys_call:
> TRACE_IRQS_ON
> movl RSP(%rsp),%esp
> CFI_RESTORE rsp
> + /*
> + * 64bit->32bit SYSRET restores eip from ecx,
> + * eflags from r11 (but RF and VM bits are forced to 0),
> + * cs and ss are loaded from MSRs.
> + * (Note: 32bit->32bit SYSRET is different: since r11
> + * does not exist, it merely sets eflags.IF=1).
> + */
> USERGS_SYSRET32
> -
> +
> #ifdef CONFIG_AUDITSYSCALL
> cstar_auditsys:
> CFI_RESTORE_STATE
> @@ -394,26 +427,26 @@ ia32_badarg:
> jmp ia32_sysret
> CFI_ENDPROC
>
> -/*
> - * Emulated IA32 system calls via int 0x80.
> +/*
> + * Emulated IA32 system calls via int 0x80.
> *
> - * Arguments:
> - * %eax System call number.
> - * %ebx Arg1
> - * %ecx Arg2
> - * %edx Arg3
> - * %esi Arg4
> - * %edi Arg5
> - * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
> + * Arguments:
> + * eax system call number
> + * ebx arg1
> + * ecx arg2
> + * edx arg3
> + * esi arg4
> + * edi arg5
> + * ebp arg6 (note: not saved in the stack frame, should not be touched)
> *
> * Notes:
> - * Uses the same stack frame as the x86-64 version.
> - * All registers except %eax must be saved (but ptrace may violate that)
> + * Uses the same stack frame as the x86-64 version.
> + * All registers except eax must be saved (but ptrace may violate that).
> * Arguments are zero extended. For system calls that want sign extension and
> * take long arguments a wrapper is needed. Most calls can just be called
> * directly.
> - * Assumes it is only called from user space and entered with interrupts off.
> - */
> + * Assumes it is only called from user space and entered with interrupts off.
> + */
>
> ENTRY(ia32_syscall)
> CFI_STARTPROC32 simple
> @@ -432,7 +465,7 @@ ENTRY(ia32_syscall)
> */
> ENABLE_INTERRUPTS(CLBR_NONE)
> movl %eax,%eax
> - pushq_cfi %rax
> + pushq_cfi %rax /* store orig_ax */
> cld
> /* note the registers are not zero extended to the sf.
> this could be a problem. */
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index be2b14c..63e7ccd 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -256,25 +256,25 @@ ENTRY(ret_from_fork)
> END(ret_from_fork)
>
> /*
> - * System call entry. Up to 6 arguments in registers are supported.
> + * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
> *
> - * SYSCALL does not save anything on the stack and does not change the
> - * stack pointer. However, it does mask the flags register for us, so
> - * CLD and CLAC are not needed.
> - */
> -
> -/*
> - * Register setup:
> + * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
> + * then loads new ss, cs, and rip from previously programmed MSRs.
> + * rflags gets masked by a value from another MSR (so CLD and CLAC
> + * are not needed). SYSCALL does not save anything on the stack
> + * and does not change rsp.
> + *
> + * Registers on entry:
> * rax system call number
> + * rcx return address
> + * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
> * rdi arg0
> - * rcx return address for syscall/sysret, C arg3
> * rsi arg1
> * rdx arg2
> - * r10 arg3 (--> moved to rcx for C)
> + * r10 arg3 (needs to be moved to rcx to conform to C ABI)
> * r8 arg4
> * r9 arg5
> - * r11 eflags for syscall/sysret, temporary for C
> - * r12-r15,rbp,rbx saved by C code, not touched.
> + * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
> *
> * Interrupts are off on entry.
> * Only called from user space.
> @@ -302,13 +302,14 @@ ENTRY(system_call)
> GLOBAL(system_call_after_swapgs)
>
> movq %rsp,PER_CPU_VAR(old_rsp)
> + /* kernel_stack is set so that 5 slots (iret frame) are preallocated */
> movq PER_CPU_VAR(kernel_stack),%rsp
> /*
> * No need to follow this irqs off/on section - it's straight
> * and short:
> */
> ENABLE_INTERRUPTS(CLBR_NONE)
> - ALLOC_PT_GPREGS_ON_STACK 8
> + ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */
> SAVE_C_REGS_EXCEPT_RAX_RCX
> movq $-ENOSYS,RAX(%rsp)
> movq_cfi rax,ORIG_RAX
> @@ -348,6 +349,11 @@ ret_from_sys_call:
> CFI_REGISTER rip,rcx
> /*CFI_REGISTER rflags,r11*/
> movq PER_CPU_VAR(old_rsp), %rsp
> + /*
> + * 64bit SYSRET restores rip from rcx,
> + * rflags from r11 (but RF and VM bits are forced to 0),
> + * cs and ss are loaded from MSRs.
> + */
> USERGS_SYSRET64
>
> CFI_RESTORE_STATE
> --
> 1.8.1.4
>



--
Andy Lutomirski
AMA Capital Management, LLC

2015-02-24 00:38:00

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 2/6] x86: entry_64.S: move save_paranoid and ret_from_fork closer to their users

On Mon, Feb 23, 2015 at 4:12 PM, Denys Vlasenko <[email protected]> wrote:
> For some odd reason, these two functions are at the very top of the file.
> save_paranoid's caller is approximately in the middle of it, move it there.
> Move ret_from_fork to be right after fork/exec helpers.
>
> This is a pure block move, nothing is changed in the function bodies.

Applied.

>
> Signed-off-by: Denys Vlasenko <[email protected]>
> CC: Linus Torvalds <[email protected]>
> CC: Oleg Nesterov <[email protected]>
> CC: Borislav Petkov <[email protected]>
> CC: "H. Peter Anvin" <[email protected]>
> CC: Andy Lutomirski <[email protected]>
> CC: Frederic Weisbecker <[email protected]>
> CC: X86 ML <[email protected]>
> CC: Alexei Starovoitov <[email protected]>
> CC: Will Drewry <[email protected]>
> CC: Kees Cook <[email protected]>
> CC: [email protected]
> ---
> arch/x86/kernel/entry_64.S | 106 ++++++++++++++++++++++-----------------------
> 1 file changed, 53 insertions(+), 53 deletions(-)
>
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 63e7ccd..71b549a 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -202,59 +202,6 @@ ENDPROC(native_usergs_sysret64)
> CFI_REL_OFFSET r15, R15+\offset
> .endm
>
> -ENTRY(save_paranoid)
> - XCPT_FRAME 1 RDI+8
> - cld
> - SAVE_C_REGS 8
> - SAVE_EXTRA_REGS 8
> - movl $1,%ebx
> - movl $MSR_GS_BASE,%ecx
> - rdmsr
> - testl %edx,%edx
> - js 1f /* negative -> in kernel */
> - SWAPGS
> - xorl %ebx,%ebx
> -1: ret
> - CFI_ENDPROC
> -END(save_paranoid)
> -
> -/*
> - * A newly forked process directly context switches into this address.
> - *
> - * rdi: prev task we switched from
> - */
> -ENTRY(ret_from_fork)
> - DEFAULT_FRAME
> -
> - LOCK ; btr $TIF_FORK,TI_flags(%r8)
> -
> - pushq_cfi $0x0002
> - popfq_cfi # reset kernel eflags
> -
> - call schedule_tail # rdi: 'prev' task parameter
> -
> - GET_THREAD_INFO(%rcx)
> -
> - RESTORE_EXTRA_REGS
> -
> - testl $3,CS(%rsp) # from kernel_thread?
> - jz 1f
> -
> - testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
> - jnz int_ret_from_sys_call
> -
> - RESTORE_TOP_OF_STACK %rdi
> - jmp ret_from_sys_call # go to the SYSRET fastpath
> -
> -1:
> - movq %rbp, %rdi
> - call *%rbx
> - movl $0, RAX(%rsp)
> - RESTORE_EXTRA_REGS
> - jmp int_ret_from_sys_call
> - CFI_ENDPROC
> -END(ret_from_fork)
> -
> /*
> * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
> *
> @@ -581,6 +528,43 @@ END(stub_x32_execveat)
> #endif
>
> /*
> + * A newly forked process directly context switches into this address.
> + *
> + * rdi: prev task we switched from
> + */
> +ENTRY(ret_from_fork)
> + DEFAULT_FRAME
> +
> + LOCK ; btr $TIF_FORK,TI_flags(%r8)
> +
> + pushq_cfi $0x0002
> + popfq_cfi # reset kernel eflags
> +
> + call schedule_tail # rdi: 'prev' task parameter
> +
> + GET_THREAD_INFO(%rcx)
> +
> + RESTORE_EXTRA_REGS
> +
> + testl $3,CS(%rsp) # from kernel_thread?
> + jz 1f
> +
> + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
> + jnz int_ret_from_sys_call
> +
> + RESTORE_TOP_OF_STACK %rdi
> + jmp ret_from_sys_call # go to the SYSRET fastpath
> +
> +1:
> + movq %rbp, %rdi
> + call *%rbx
> + movl $0, RAX(%rsp)
> + RESTORE_EXTRA_REGS
> + jmp int_ret_from_sys_call
> + CFI_ENDPROC
> +END(ret_from_fork)
> +
> +/*
> * Build the entry stubs and pointer table with some assembler magic.
> * We pack 7 stubs into a single 32-byte chunk, which will fit in a
> * single cache line on all modern x86 implementations.
> @@ -1269,6 +1253,22 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
> idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
> #endif
>
> +ENTRY(save_paranoid)
> + XCPT_FRAME 1 RDI+8
> + cld
> + SAVE_C_REGS 8
> + SAVE_EXTRA_REGS 8
> + movl $1,%ebx
> + movl $MSR_GS_BASE,%ecx
> + rdmsr
> + testl %edx,%edx
> + js 1f /* negative -> in kernel */
> + SWAPGS
> + xorl %ebx,%ebx
> +1: ret
> + CFI_ENDPROC
> +END(save_paranoid)
> +
> /*
> * "Paranoid" exit path from exception stack. This is invoked
> * only on return from non-NMI IST interrupts that came
> --
> 1.8.1.4
>



--
Andy Lutomirski
AMA Capital Management, LLC

2015-02-24 00:38:12

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH 3/6] x86: entry_64.S: rename save_paranoid to paranoid_entry, no code changes

On Mon, Feb 23, 2015 at 4:12 PM, Denys Vlasenko <[email protected]> wrote:
> This patch does a lot of cleanup in comments and formatting,
> but it does not change any code.
>
> Rename save_paranoid to paranoid_entry: this makes naming
> similar to its "non-paranoid" sibling, error_entry,
> and to its counterpart, paranoid_exit.
>
> Use the same CFI annotation atop paranoid_entry and error_entry.
>
> Fix irregular indentation of assembler operands.
>
> Add/fix comments on top of paranoid_entry and error_entry.
> Remove stale comment about "oldrax".
> Make comments about "no swapgs" flag in ebx more prominent.
> Deindent wrongly indented top-level comment atop paranoid_exit.
> Indent wrongly deindented comment inside error_entry.

Applied.

>
> Signed-off-by: Denys Vlasenko <[email protected]>
> CC: Linus Torvalds <[email protected]>
> CC: Oleg Nesterov <[email protected]>
> CC: Borislav Petkov <[email protected]>
> CC: "H. Peter Anvin" <[email protected]>
> CC: Andy Lutomirski <[email protected]>
> CC: Frederic Weisbecker <[email protected]>
> CC: X86 ML <[email protected]>
> CC: Alexei Starovoitov <[email protected]>
> CC: Will Drewry <[email protected]>
> CC: Kees Cook <[email protected]>
> CC: [email protected]
> ---
> arch/x86/kernel/entry_64.S | 68 ++++++++++++++++++++++++----------------------
> 1 file changed, 36 insertions(+), 32 deletions(-)
>
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 71b549a..03498d0 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -981,10 +981,11 @@ ENTRY(\sym)
> testl $3, CS(%rsp) /* If coming from userspace, switch */
> jnz 1f /* stacks. */
> .endif
> - call save_paranoid
> + call paranoid_entry
> .else
> call error_entry
> .endif
> + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
>
> DEFAULT_FRAME 0
>
> @@ -1015,10 +1016,11 @@ ENTRY(\sym)
> addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
> .endif
>
> + /* these procedures expect "no swapgs" flag in ebx */
> .if \paranoid
> - jmp paranoid_exit /* %ebx: no swapgs flag */
> + jmp paranoid_exit
> .else
> - jmp error_exit /* %ebx: no swapgs flag */
> + jmp error_exit
> .endif
>
> .if \paranoid == 1
> @@ -1253,8 +1255,13 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
> idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
> #endif
>
> -ENTRY(save_paranoid)
> - XCPT_FRAME 1 RDI+8
> +/*
> + * Save all registers in pt_regs, and switch gs if needed.
> + * Use slow, but surefire "are we in kernel?" check.
> + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
> + */
> +ENTRY(paranoid_entry)
> + XCPT_FRAME 1 15*8
> cld
> SAVE_C_REGS 8
> SAVE_EXTRA_REGS 8
> @@ -1267,20 +1274,19 @@ ENTRY(save_paranoid)
> xorl %ebx,%ebx
> 1: ret
> CFI_ENDPROC
> -END(save_paranoid)
> -
> - /*
> - * "Paranoid" exit path from exception stack. This is invoked
> - * only on return from non-NMI IST interrupts that came
> - * from kernel space.
> - *
> - * We may be returning to very strange contexts (e.g. very early
> - * in syscall entry), so checking for preemption here would
> - * be complicated. Fortunately, we there's no good reason
> - * to try to handle preemption here.
> - */
> +END(paranoid_entry)
>
> - /* ebx: no swapgs flag */
> +/*
> + * "Paranoid" exit path from exception stack. This is invoked
> + * only on return from non-NMI IST interrupts that came
> + * from kernel space.
> + *
> + * We may be returning to very strange contexts (e.g. very early
> + * in syscall entry), so checking for preemption here would
> + * be complicated. Fortunately, we there's no good reason
> + * to try to handle preemption here.
> + */
> +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
> ENTRY(paranoid_exit)
> DEFAULT_FRAME
> DISABLE_INTERRUPTS(CLBR_NONE)
> @@ -1301,13 +1307,11 @@ paranoid_exit_restore:
> END(paranoid_exit)
>
> /*
> - * Exception entry point. This expects an error code/orig_rax on the stack.
> - * returns in "no swapgs flag" in %ebx.
> + * Save all registers in pt_regs, and switch gs if needed.
> + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
> */
> ENTRY(error_entry)
> - XCPT_FRAME
> - CFI_ADJUST_CFA_OFFSET 15*8
> - /* oldrax contains error code */
> + XCPT_FRAME 1 15*8
> cld
> SAVE_C_REGS 8
> SAVE_EXTRA_REGS 8
> @@ -1320,12 +1324,12 @@ error_sti:
> TRACE_IRQS_OFF
> ret
>
> -/*
> - * There are two places in the kernel that can potentially fault with
> - * usergs. Handle them here. B stepping K8s sometimes report a
> - * truncated RIP for IRET exceptions returning to compat mode. Check
> - * for these here too.
> - */
> + /*
> + * There are two places in the kernel that can potentially fault with
> + * usergs. Handle them here. B stepping K8s sometimes report a
> + * truncated RIP for IRET exceptions returning to compat mode. Check
> + * for these here too.
> + */
> error_kernelspace:
> CFI_REL_OFFSET rcx, RCX+8
> incl %ebx
> @@ -1355,7 +1359,7 @@ error_bad_iret:
> END(error_entry)
>
>
> -/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
> +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
> ENTRY(error_exit)
> DEFAULT_FRAME
> movl %ebx,%eax
> @@ -1581,13 +1585,13 @@ end_repeat_nmi:
> ALLOC_PT_GPREGS_ON_STACK
>
> /*
> - * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
> + * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
> * as we should not be calling schedule in NMI context.
> * Even with normal interrupts enabled. An NMI should not be
> * setting NEED_RESCHED or anything that normal interrupts and
> * exceptions might do.
> */
> - call save_paranoid
> + call paranoid_entry
> DEFAULT_FRAME 0
>
> /*
> --
> 1.8.1.4
>



--
Andy Lutomirski
AMA Capital Management, LLC