All blame goes to: color white,red "[^[:graph:]]+$"
in .nanorc ;).
Signed-off-by: Alexander van Heukelum <[email protected]>
---
arch/x86/kernel/entry_64.S | 200 ++++++++++++++++++++++----------------------
1 files changed, 100 insertions(+), 100 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 983d85a..a3a7fb2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -11,15 +11,15 @@
*
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
- *
- * Normal syscalls and interrupts don't save a full stack frame, this is
+ *
+ * Normal syscalls and interrupts don't save a full stack frame, this is
* only done for syscall tracing, signals or fork/exec et.al.
- *
- * A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
+ *
+ * A note on terminology:
+ * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
* - partial stack frame: partially saved registers upto R11.
- * - full stack frame: Like partial stack frame, but all register saved.
+ * - full stack frame: Like partial stack frame, but all register saved.
*
* Some macro usage:
* - CFI macros are used to generate dwarf2 unwind information for better
@@ -147,7 +147,7 @@ END(mcount)
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
-#endif
+#endif
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
@@ -166,14 +166,14 @@ ENTRY(native_usergs_sysret64)
.endm
/*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
+ * C code is not supposed to know about undefined top of stack. Every time
+ * a C function with an pt_regs argument is called from the SYSCALL based
* fast path FIXUP_TOP_OF_STACK is needed.
* RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
* manipulation.
- */
-
- /* %rsp:at FRAMEEND */
+ */
+
+ /* %rsp:at FRAMEEND */
.macro FIXUP_TOP_OF_STACK tmp
movq %gs:pda_oldrsp,\tmp
movq \tmp,RSP(%rsp)
@@ -249,8 +249,8 @@ ENTRY(native_usergs_sysret64)
.endm
/*
* A newly forked process directly context switches into this.
- */
-/* rdi: prev */
+ */
+/* rdi: prev */
ENTRY(ret_from_fork)
CFI_DEFAULT_STACK
push kernel_eflags(%rip)
@@ -262,7 +262,7 @@ ENTRY(ret_from_fork)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
CFI_REMEMBER_STATE
jnz rff_trace
-rff_action:
+rff_action:
RESTORE_REST
testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
je int_ret_from_sys_call
@@ -274,7 +274,7 @@ rff_action:
rff_trace:
movq %rsp,%rdi
call syscall_trace_leave
- GET_THREAD_INFO(%rcx)
+ GET_THREAD_INFO(%rcx)
jmp rff_action
CFI_ENDPROC
END(ret_from_fork)
@@ -285,20 +285,20 @@ END(ret_from_fork)
* SYSCALL does not save anything on the stack and does not change the
* stack pointer.
*/
-
+
/*
- * Register setup:
+ * Register setup:
* rax system call number
* rdi arg0
- * rcx return address for syscall/sysret, C arg3
+ * rcx return address for syscall/sysret, C arg3
* rsi arg1
- * rdx arg2
+ * rdx arg2
* r10 arg3 (--> moved to rcx for C)
* r8 arg4
* r9 arg5
* r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
- *
+ * r12-r15,rbp,rbx saved by C code, not touched.
+ *
* Interrupts are off on entry.
* Only called from user space.
*
@@ -308,7 +308,7 @@ END(ret_from_fork)
* When user can change the frames always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
- */
+ */
ENTRY(system_call)
CFI_STARTPROC simple
@@ -324,7 +324,7 @@ ENTRY(system_call)
*/
ENTRY(system_call_after_swapgs)
- movq %rsp,%gs:pda_oldrsp
+ movq %rsp,%gs:pda_oldrsp
movq %gs:pda_kernelstack,%rsp
/*
* No need to follow this irqs off/on section - it's straight
@@ -332,7 +332,7 @@ ENTRY(system_call_after_swapgs)
*/
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_ARGS 8,1
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
GET_THREAD_INFO(%rcx)
@@ -346,19 +346,19 @@ system_call_fastpath:
movq %rax,RAX-ARGOFFSET(%rsp)
/*
* Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
- */
+ * Has incomplete stack frame and undefined top of stack.
+ */
ret_from_sys_call:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: flagmask */
-sysret_check:
+sysret_check:
LOCKDEP_SYS_EXIT
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movl TI_flags(%rcx),%edx
andl %edi,%edx
- jnz sysret_careful
+ jnz sysret_careful
CFI_REMEMBER_STATE
/*
* sysretq will re-enable interrupts:
@@ -373,7 +373,7 @@ sysret_check:
CFI_RESTORE_STATE
/* Handle reschedules */
- /* edx: work, edi: workmask */
+ /* edx: work, edi: workmask */
sysret_careful:
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
@@ -386,7 +386,7 @@ sysret_careful:
CFI_ADJUST_CFA_OFFSET -8
jmp sysret_check
- /* Handle a signal */
+ /* Handle a signal */
sysret_signal:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -405,7 +405,7 @@ sysret_signal:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
-
+
badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
jmp ret_from_sys_call
@@ -444,7 +444,7 @@ sysret_audit:
#endif /* CONFIG_AUDITSYSCALL */
/* Do syscall tracing */
-tracesys:
+tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
jz auditsys
@@ -467,8 +467,8 @@ tracesys:
call *sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
/* Use IRET because user could have changed frame */
-
-/*
+
+/*
* Syscall return path ending with IRET.
* Has correct top of stack, but partial stack frame.
*/
@@ -512,18 +512,18 @@ int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
- /* Check for syscall exit trace */
+ /* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
- leaq 8(%rsp),%rdi # &ptregs -> arg1
+ leaq 8(%rsp),%rdi # &ptregs -> arg1
call syscall_trace_leave
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
-
+
int_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz 1f
@@ -538,11 +538,11 @@ int_restore_rest:
jmp int_with_check
CFI_ENDPROC
END(system_call)
-
-/*
+
+/*
* Certain special system calls that need to save a complete full stack frame.
- */
-
+ */
+
.macro PTREGSCALL label,func,arg
.globl \label
\label:
@@ -579,7 +579,7 @@ ENTRY(ptregscall_common)
ret
CFI_ENDPROC
END(ptregscall_common)
-
+
ENTRY(stub_execve)
CFI_STARTPROC
popq %r11
@@ -595,11 +595,11 @@ ENTRY(stub_execve)
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execve)
-
+
/*
* sigreturn is special because it needs to restore all registers on return.
* This cannot be done with SYSRET, so use the IRET return path instead.
- */
+ */
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
@@ -634,15 +634,15 @@ END(stub_rt_sigreturn)
vector already pushed) */
#define XCPT_FRAME _frame ORIG_RAX
-/*
+/*
* Interrupt entry/exit.
*
* Interrupt entry points save only callee clobbered registers in fast path.
- *
- * Entry runs with interrupts off.
- */
+ *
+ * Entry runs with interrupts off.
+ */
-/* 0(%rsp): interrupt number */
+/* 0(%rsp): interrupt number */
.macro interrupt func
cld
SAVE_ARGS
@@ -692,12 +692,12 @@ exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
je retint_kernel
-
+
/* Interrupt came from user space */
/*
* Has a correct top of stack, but a partial stack frame
* %rcx: thread info. Interrupts off.
- */
+ */
retint_with_reschedule:
movl $_TIF_WORK_MASK,%edi
retint_check:
@@ -770,20 +770,20 @@ retint_careful:
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
- popq %rdi
+ popq %rdi
CFI_ADJUST_CFA_OFFSET -8
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp retint_check
-
+
retint_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
- movq $-1,ORIG_RAX(%rsp)
+ movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
@@ -805,14 +805,14 @@ ENTRY(retint_kernel)
jnc retint_restore_args
call preempt_schedule_irq
jmp exit_intr
-#endif
+#endif
CFI_ENDPROC
END(common_interrupt)
-
+
/*
* APIC interrupts.
- */
+ */
.macro apicinterrupt num,func
INTR_FRAME
pushq $~(\num)
@@ -830,14 +830,14 @@ ENTRY(threshold_interrupt)
apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
END(threshold_interrupt)
-#ifdef CONFIG_SMP
+#ifdef CONFIG_SMP
ENTRY(reschedule_interrupt)
apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
END(reschedule_interrupt)
.macro INVALIDATE_ENTRY num
ENTRY(invalidate_interrupt\num)
- apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
+ apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
END(invalidate_interrupt\num)
.endm
@@ -876,22 +876,22 @@ END(error_interrupt)
ENTRY(spurious_interrupt)
apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
END(spurious_interrupt)
-
+
/*
* Exception entry points.
- */
+ */
.macro zeroentry sym
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0 /* push error code/oldrax */
+ pushq $0 /* push error code/oldrax */
CFI_ADJUST_CFA_OFFSET 8
- pushq %rax /* push real oldrax to the rdi slot */
+ pushq %rax /* push real oldrax to the rdi slot */
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rax,0
leaq \sym(%rip),%rax
jmp error_entry
CFI_ENDPROC
- .endm
+ .endm
.macro errorentry sym
XCPT_FRAME
@@ -1005,13 +1005,13 @@ paranoid_schedule\trace:
/*
* Exception entry point. This expects an error code/orig_rax on the stack
- * and the exception handler in %rax.
- */
+ * and the exception handler in %rax.
+ */
KPROBE_ENTRY(error_entry)
_frame RDI
CFI_REL_OFFSET rax,0
/* rdi slot contains rax, oldrax contains error code */
- cld
+ cld
subq $14*8,%rsp
CFI_ADJUST_CFA_OFFSET (14*8)
movq %rsi,13*8(%rsp)
@@ -1022,7 +1022,7 @@ KPROBE_ENTRY(error_entry)
CFI_REL_OFFSET rdx,RDX
movq %rcx,11*8(%rsp)
CFI_REL_OFFSET rcx,RCX
- movq %rsi,10*8(%rsp) /* store rax */
+ movq %rsi,10*8(%rsp) /* store rax */
CFI_REL_OFFSET rax,RAX
movq %r8, 9*8(%rsp)
CFI_REL_OFFSET r8,R8
@@ -1032,29 +1032,29 @@ KPROBE_ENTRY(error_entry)
CFI_REL_OFFSET r10,R10
movq %r11,6*8(%rsp)
CFI_REL_OFFSET r11,R11
- movq %rbx,5*8(%rsp)
+ movq %rbx,5*8(%rsp)
CFI_REL_OFFSET rbx,RBX
- movq %rbp,4*8(%rsp)
+ movq %rbp,4*8(%rsp)
CFI_REL_OFFSET rbp,RBP
- movq %r12,3*8(%rsp)
+ movq %r12,3*8(%rsp)
CFI_REL_OFFSET r12,R12
- movq %r13,2*8(%rsp)
+ movq %r13,2*8(%rsp)
CFI_REL_OFFSET r13,R13
- movq %r14,1*8(%rsp)
+ movq %r14,1*8(%rsp)
CFI_REL_OFFSET r14,R14
- movq %r15,(%rsp)
+ movq %r15,(%rsp)
CFI_REL_OFFSET r15,R15
- xorl %ebx,%ebx
+ xorl %ebx,%ebx
testl $3,CS(%rsp)
je error_kernelspace
-error_swapgs:
+error_swapgs:
SWAPGS
error_sti:
TRACE_IRQS_OFF
- movq %rdi,RDI(%rsp)
+ movq %rdi,RDI(%rsp)
CFI_REL_OFFSET rdi,RDI
movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp)
call *%rax
/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
@@ -1063,7 +1063,7 @@ error_exit:
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
+ GET_THREAD_INFO(%rcx)
testl %eax,%eax
jne retint_kernel
LOCKDEP_SYS_EXIT_IRQ
@@ -1079,7 +1079,7 @@ error_kernelspace:
/* There are two places in the kernel that can potentially fault with
usergs. Handle them here. The exception handlers after
iret run with kernel gs again, so don't set the user space flag.
- B stepping K8s sometimes report an truncated RIP for IRET
+ B stepping K8s sometimes report an truncated RIP for IRET
exceptions returning to compat mode. Check for these here too. */
leaq irq_return(%rip),%rcx
cmpq %rcx,RIP(%rsp)
@@ -1091,17 +1091,17 @@ error_kernelspace:
je error_swapgs
jmp error_sti
KPROBE_END(error_entry)
-
+
/* Reload gs selector with exception handling */
- /* edi: new selector */
+ /* edi: new selector */
ENTRY(native_load_gs_index)
CFI_STARTPROC
pushf
CFI_ADJUST_CFA_OFFSET 8
DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
SWAPGS
-gs_change:
- movl %edi,%gs
+gs_change:
+ movl %edi,%gs
2: mfence /* workaround */
SWAPGS
popf
@@ -1109,20 +1109,20 @@ gs_change:
ret
CFI_ENDPROC
ENDPROC(native_load_gs_index)
-
+
.section __ex_table,"a"
.align 8
.quad gs_change,bad_gs
.previous
.section .fixup,"ax"
/* running with kernelgs */
-bad_gs:
+bad_gs:
SWAPGS /* switch back to user gs */
xorl %eax,%eax
movl %eax,%gs
jmp 2b
- .previous
-
+ .previous
+
/*
* Create a kernel thread.
*
@@ -1145,7 +1145,7 @@ ENTRY(kernel_thread)
xorl %r8d,%r8d
xorl %r9d,%r9d
-
+
# clone now
call do_fork
movq %rax,RAX(%rsp)
@@ -1156,14 +1156,14 @@ ENTRY(kernel_thread)
* so internally to the x86_64 port you can rely on kernel_thread()
* not to reschedule the child before returning, this avoids the need
* of hacks for example to fork off the per-CPU idle tasks.
- * [Hopefully no generic code relies on the reschedule -AK]
+ * [Hopefully no generic code relies on the reschedule -AK]
*/
RESTORE_ALL
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
ENDPROC(kernel_thread)
-
+
child_rip:
pushq $0 # fake return address
CFI_STARTPROC
@@ -1198,10 +1198,10 @@ ENDPROC(child_rip)
ENTRY(kernel_execve)
CFI_STARTPROC
FAKE_STACK_FRAME $0
- SAVE_ALL
+ SAVE_ALL
movq %rsp,%rcx
call sys_execve
- movq %rax, RAX(%rsp)
+ movq %rax, RAX(%rsp)
RESTORE_REST
testq %rax,%rax
je int_ret_from_sys_call
@@ -1220,7 +1220,7 @@ ENTRY(coprocessor_error)
END(coprocessor_error)
ENTRY(simd_coprocessor_error)
- zeroentry do_simd_coprocessor_error
+ zeroentry do_simd_coprocessor_error
END(simd_coprocessor_error)
ENTRY(device_not_available)
@@ -1232,12 +1232,12 @@ KPROBE_ENTRY(debug)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
- CFI_ADJUST_CFA_OFFSET 8
+ CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_debug, DEBUG_STACK
paranoidexit
KPROBE_END(debug)
- /* runs on exception stack */
+ /* runs on exception stack */
KPROBE_ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1271,7 +1271,7 @@ ENTRY(bounds)
END(bounds)
ENTRY(invalid_op)
- zeroentry do_invalid_op
+ zeroentry do_invalid_op
END(invalid_op)
ENTRY(coprocessor_segment_overrun)
@@ -1326,7 +1326,7 @@ ENTRY(machine_check)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0
- CFI_ADJUST_CFA_OFFSET 8
+ CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_machine_check
jmp paranoid_exit1
CFI_ENDPROC
--
1.5.4.3
From: Alexander van Heukelum <[email protected]>
The macro "interrupt" in entry_64.S generates a lot of code. This
patch moves most of its contents into an external function. It
saves anywhere between 500 and 2500 bytes of text depending on the
configuration.
The function returns with an altered stack pointer.
Dwarf2-annotations are most probably wrong or missing.
There is a comment in the original code about saving rbp twice, but
I don't understand what the code tries to do. First of all, the
second copy of rbp is written below the stack. Second, if the current
stack is already the irqstack, this second copy is overwritten. Third,
as far as I can tell, ebp should not be saved in pt_regs at all at
this stage as it is 'preserved' due to the C calling conventions. So
I left this second copy out and everything seems to work fine. If it
wouldn't, all exceptions and NMIs would be dangerous anyhow, as far
as I can see.
Signed-off-by: Alexander van Heukelum <[email protected]>
Cc: Glauber Costa <[email protected]>
---
arch/x86/kernel/entry_64.S | 123 ++++++++++++++++++++++++++------------------
1 files changed, 73 insertions(+), 50 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a3a7fb2..eb57c23 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -247,6 +247,78 @@ ENTRY(native_usergs_sysret64)
CFI_REL_OFFSET rsp,RSP
/*CFI_REL_OFFSET ss,SS*/
.endm
+
+/*
+ * initial frame state for interrupts and exceptions
+ */
+ .macro _frame ref
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-\ref
+ /*CFI_REL_OFFSET ss,SS-\ref*/
+ CFI_REL_OFFSET rsp,RSP-\ref
+ /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
+ /*CFI_REL_OFFSET cs,CS-\ref*/
+ CFI_REL_OFFSET rip,RIP-\ref
+ .endm
+
+/* initial frame state for interrupts (and exceptions without error code) */
+#define INTR_FRAME _frame RIP
+/* initial frame state for exceptions with error code (and interrupts with
+ vector already pushed) */
+#define XCPT_FRAME _frame ORIG_RAX
+
+/* save_args changes the stack */
+ENTRY(save_args)
+ XCPT_FRAME
+ cld
+ subq $9*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 9*8
+ push 9*8(%rsp) /* copy return address */
+ CFI_ADJUST_CFA_OFFSET 8
+ movq %rdi, 8*8+16(%rsp)
+ CFI_REL_OFFSET rdi, 8*8+16
+ movq %rsi, 7*8+16(%rsp)
+ CFI_REL_OFFSET rsi, 7*8+16
+ movq %rdx, 6*8+16(%rsp)
+ CFI_REL_OFFSET rdx, 6*8+16
+ movq %rcx, 5*8+16(%rsp)
+ CFI_REL_OFFSET rcx, 5*8+16
+ movq %rax, 4*8+16(%rsp)
+ CFI_REL_OFFSET rax, 4*8+16
+ movq %r8, 3*8+16(%rsp)
+ CFI_REL_OFFSET r8, 3*8+16
+ movq %r9, 2*8+16(%rsp)
+ CFI_REL_OFFSET r9, 2*8+16
+ movq %r10, 1*8+16(%rsp)
+ CFI_REL_OFFSET r10, 1*8+16
+ movq %r11, 0*8+16(%rsp)
+ CFI_REL_OFFSET r11, 0*8+16
+ leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ movq %rbp, 8(%rsp) /* push %rbp */
+ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
+ testl $3, CS(%rdi)
+ je 1f
+ SWAPGS
+ /*
+ * irqcount is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl %gs:pda_irqcount
+ jne 2f
+ pop %rax /* move return address... */
+ mov %gs:pda_irqstackptr,%rsp
+ push %rax /* ... to the new stack */
+ /*
+ * We entered an interrupt context - irqs are off:
+ */
+2: TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+END(save_args)
+
/*
* A newly forked process directly context switches into this.
*/
@@ -615,26 +687,6 @@ ENTRY(stub_rt_sigreturn)
END(stub_rt_sigreturn)
/*
- * initial frame state for interrupts and exceptions
- */
- .macro _frame ref
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-\ref
- /*CFI_REL_OFFSET ss,SS-\ref*/
- CFI_REL_OFFSET rsp,RSP-\ref
- /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
- /*CFI_REL_OFFSET cs,CS-\ref*/
- CFI_REL_OFFSET rip,RIP-\ref
- .endm
-
-/* initial frame state for interrupts (and exceptions without error code) */
-#define INTR_FRAME _frame RIP
-/* initial frame state for exceptions with error code (and interrupts with
- vector already pushed) */
-#define XCPT_FRAME _frame ORIG_RAX
-
-/*
* Interrupt entry/exit.
*
* Interrupt entry points save only callee clobbered registers in fast path.
@@ -644,36 +696,7 @@ END(stub_rt_sigreturn)
/* 0(%rsp): interrupt number */
.macro interrupt func
- cld
- SAVE_ARGS
- leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
- pushq %rbp
- /*
- * Save rbp twice: One is for marking the stack frame, as usual, and the
- * other, to fill pt_regs properly. This is because bx comes right
- * before the last saved register in that structure, and not bp. If the
- * base pointer were in the place bx is today, this would not be needed.
- */
- movq %rbp, -8(%rsp)
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbp, 0
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- testl $3,CS(%rdi)
- je 1f
- SWAPGS
- /* irqcount is used to check if a CPU is already on an interrupt
- stack or not. While this is essentially redundant with preempt_count
- it is a little cheaper to use a separate counter in the PDA
- (short of moving irq_enter into assembly, which would be too
- much work) */
-1: incl %gs:pda_irqcount
- cmoveq %gs:pda_irqstackptr,%rsp
- push %rbp # backlink for old unwinder
- /*
- * We entered an interrupt context - irqs are off:
- */
- TRACE_IRQS_OFF
+ call save_args
call \func
.endm
--
1.5.4.3
* Alexander van Heukelum <[email protected]> wrote:
> All blame goes to: color white,red "[^[:graph:]]+$"
> in .nanorc ;).
>
> Signed-off-by: Alexander van Heukelum <[email protected]>
> ---
> arch/x86/kernel/entry_64.S | 200 ++++++++++++++++++++++----------------------
> 1 files changed, 100 insertions(+), 100 deletions(-)
applied to tip/x86/cleanups, thanks Alexander. This file still has a
horrible code structure but now at least our eyes do not burn out when
looking at it ;-)
Ingo
On Sun, Nov 16, 2008 at 03:29:01PM +0100, Alexander van Heukelum wrote:
> From: Alexander van Heukelum <[email protected]>
>
> The macro "interrupt" in entry_64.S generates a lot of code. This
> patch moves most of its contents into an external function. It
> saves anywhere between 500 and 2500 bytes of text depending on the
> configuration.
>
> The function returns with an altered stack pointer.
>
> Dwarf2-annotations are most probably wrong or missing.
>
> There is a comment in the original code about saving rbp twice, but
> I don't understand what the code tries to do. First of all, the
> second copy of rbp is written below the stack.
This is not for the stack, but to keep the registers as the pt_regs struct
expects them. We'll be casting this structure later, and not doing that
can lead to bogus code, whenever we cast regs->bp.
> Second, if the current
> stack is already the irqstack, this second copy is overwritten. Third,
> as far as I can tell, ebp should not be saved in pt_regs at all at
> this stage as it is 'preserved' due to the C calling conventions. So
> I left this second copy out and everything seems to work fine. If it
> wouldn't, all exceptions and NMIs would be dangerous anyhow, as far
> as I can see.
Try testing with a kernel with CONFIG_FRAME_POINTER turned on.
>
> Signed-off-by: Alexander van Heukelum <[email protected]>
> Cc: Glauber Costa <[email protected]>
>
> ---
> arch/x86/kernel/entry_64.S | 123 ++++++++++++++++++++++++++------------------
> 1 files changed, 73 insertions(+), 50 deletions(-)
>
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index a3a7fb2..eb57c23 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -247,6 +247,78 @@ ENTRY(native_usergs_sysret64)
> CFI_REL_OFFSET rsp,RSP
> /*CFI_REL_OFFSET ss,SS*/
> .endm
> +
> +/*
> + * initial frame state for interrupts and exceptions
> + */
> + .macro _frame ref
> + CFI_STARTPROC simple
> + CFI_SIGNAL_FRAME
> + CFI_DEF_CFA rsp,SS+8-\ref
> + /*CFI_REL_OFFSET ss,SS-\ref*/
> + CFI_REL_OFFSET rsp,RSP-\ref
> + /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
> + /*CFI_REL_OFFSET cs,CS-\ref*/
> + CFI_REL_OFFSET rip,RIP-\ref
> + .endm
> +
> +/* initial frame state for interrupts (and exceptions without error code) */
> +#define INTR_FRAME _frame RIP
> +/* initial frame state for exceptions with error code (and interrupts with
> + vector already pushed) */
> +#define XCPT_FRAME _frame ORIG_RAX
> +
> +/* save_args changes the stack */
> +ENTRY(save_args)
> + XCPT_FRAME
> + cld
> + subq $9*8, %rsp
> + CFI_ADJUST_CFA_OFFSET 9*8
> + push 9*8(%rsp) /* copy return address */
> + CFI_ADJUST_CFA_OFFSET 8
> + movq %rdi, 8*8+16(%rsp)
> + CFI_REL_OFFSET rdi, 8*8+16
> + movq %rsi, 7*8+16(%rsp)
> + CFI_REL_OFFSET rsi, 7*8+16
> + movq %rdx, 6*8+16(%rsp)
> + CFI_REL_OFFSET rdx, 6*8+16
> + movq %rcx, 5*8+16(%rsp)
> + CFI_REL_OFFSET rcx, 5*8+16
> + movq %rax, 4*8+16(%rsp)
> + CFI_REL_OFFSET rax, 4*8+16
> + movq %r8, 3*8+16(%rsp)
> + CFI_REL_OFFSET r8, 3*8+16
> + movq %r9, 2*8+16(%rsp)
> + CFI_REL_OFFSET r9, 2*8+16
> + movq %r10, 1*8+16(%rsp)
> + CFI_REL_OFFSET r10, 1*8+16
> + movq %r11, 0*8+16(%rsp)
> + CFI_REL_OFFSET r11, 0*8+16
> + leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
> + movq %rbp, 8(%rsp) /* push %rbp */
> + leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
> + testl $3, CS(%rdi)
> + je 1f
> + SWAPGS
> + /*
> + * irqcount is used to check if a CPU is already on an interrupt stack
> + * or not. While this is essentially redundant with preempt_count it is
> + * a little cheaper to use a separate counter in the PDA (short of
> + * moving irq_enter into assembly, which would be too much work)
> + */
> +1: incl %gs:pda_irqcount
> + jne 2f
> + pop %rax /* move return address... */
> + mov %gs:pda_irqstackptr,%rsp
> + push %rax /* ... to the new stack */
> + /*
> + * We entered an interrupt context - irqs are off:
> + */
> +2: TRACE_IRQS_OFF
> + ret
> + CFI_ENDPROC
> +END(save_args)
> +
> /*
> * A newly forked process directly context switches into this.
> */
> @@ -615,26 +687,6 @@ ENTRY(stub_rt_sigreturn)
> END(stub_rt_sigreturn)
>
> /*
> - * initial frame state for interrupts and exceptions
> - */
> - .macro _frame ref
> - CFI_STARTPROC simple
> - CFI_SIGNAL_FRAME
> - CFI_DEF_CFA rsp,SS+8-\ref
> - /*CFI_REL_OFFSET ss,SS-\ref*/
> - CFI_REL_OFFSET rsp,RSP-\ref
> - /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
> - /*CFI_REL_OFFSET cs,CS-\ref*/
> - CFI_REL_OFFSET rip,RIP-\ref
> - .endm
> -
> -/* initial frame state for interrupts (and exceptions without error code) */
> -#define INTR_FRAME _frame RIP
> -/* initial frame state for exceptions with error code (and interrupts with
> - vector already pushed) */
> -#define XCPT_FRAME _frame ORIG_RAX
> -
> -/*
> * Interrupt entry/exit.
> *
> * Interrupt entry points save only callee clobbered registers in fast path.
> @@ -644,36 +696,7 @@ END(stub_rt_sigreturn)
>
> /* 0(%rsp): interrupt number */
> .macro interrupt func
> - cld
> - SAVE_ARGS
> - leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
> - pushq %rbp
> - /*
> - * Save rbp twice: One is for marking the stack frame, as usual, and the
> - * other, to fill pt_regs properly. This is because bx comes right
> - * before the last saved register in that structure, and not bp. If the
> - * base pointer were in the place bx is today, this would not be needed.
> - */
> - movq %rbp, -8(%rsp)
> - CFI_ADJUST_CFA_OFFSET 8
> - CFI_REL_OFFSET rbp, 0
> - movq %rsp,%rbp
> - CFI_DEF_CFA_REGISTER rbp
> - testl $3,CS(%rdi)
> - je 1f
> - SWAPGS
> - /* irqcount is used to check if a CPU is already on an interrupt
> - stack or not. While this is essentially redundant with preempt_count
> - it is a little cheaper to use a separate counter in the PDA
> - (short of moving irq_enter into assembly, which would be too
> - much work) */
> -1: incl %gs:pda_irqcount
> - cmoveq %gs:pda_irqstackptr,%rsp
> - push %rbp # backlink for old unwinder
> - /*
> - * We entered an interrupt context - irqs are off:
> - */
> - TRACE_IRQS_OFF
> + call save_args
> call \func
> .endm
>
> --
> 1.5.4.3
>
On Sun, Nov 16, 2008 at 03:29:01PM +0100, Alexander van Heukelum wrote:
> From: Alexander van Heukelum <[email protected]>
>
> The macro "interrupt" in entry_64.S generates a lot of code. This
> patch moves most of its contents into an external function. It
> saves anywhere between 500 and 2500 bytes of text depending on the
> configuration.
The only duplication is in the apicinterrupt entry points, which
have expanded recently (when I wrote all that there weren't as many).
I think it would be cleaner to just have a common apic_interrupt
entry point, similar to how the exceptions work, rather than trying
to factor out "interrupt" like this. As more and more of them get
added (I have another new one in recent patches), that will likely
save more space.
The only ugly part is that passing the handler to the common
stub requires the manual pt_regs setup that the exception
handler currently does. But that could be factored out into
a new macro, or just copied (I have heard complaints in the
past that the file has too many macros already).
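A rough sketch of what I mean, borrowing the zeroentry idiom from
entry_64.S (common_apic_interrupt is a made-up name here, not code
from any patch in this thread):

        .macro apicinterrupt num,func
        INTR_FRAME
        pushq $~(\num)                  /* vector, as the current stubs do */
        CFI_ADJUST_CFA_OFFSET 8
        leaq \func(%rip),%rax           /* C handler for the common stub */
        jmp common_apic_interrupt       /* shared code: save regs, call *%rax */
        CFI_ENDPROC
        .endm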
> There is a comment in the original code about saving rbp twice, but
> I don't understand what the code tries to do. First of all, the
To be honest, I didn't understand this one either when it was added. In
the standard frame pointer format, rbp has to be at the place pointed to
by the real rbp register, with the return address directly on top of it.
But pushing %rbp below the pt_regs doesn't put it into this format,
because the return address is at the wrong place.
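For reference, the format being described is just the ordinary
prologue/epilogue pairing (a generic sketch, nothing kernel-specific):

myfunc:
        pushq %rbp              # caller's rbp now sits directly below the return address
        movq %rsp,%rbp          # %rbp -> saved rbp, so 8(%rbp) is the return address
        # ... body; an FP unwinder just follows the chain of saved rbp values
        popq %rbp
        ret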
-Andi
On Mon, 17 Nov 2008 10:14:44 -0200, "Glauber Costa" <[email protected]>
said:
> On Sun, Nov 16, 2008 at 03:29:01PM +0100, Alexander van Heukelum wrote:
> > From: Alexander van Heukelum <[email protected]>
> >
> > The macro "interrupt" in entry_64.S generates a lot of code. This
> > patch moves most of its contents into an external function. It
> > saves anywhere between 500 and 2500 bytes of text depending on the
> > configuration.
> >
> > The function returns with an altered stack pointer.
> >
> > Dwarf2-annotations are most probably wrong or missing.
> >
> > There is a comment in the original code about saving rbp twice, but
> > I don't understand what the code tries to do. First of all, the
> > second copy of rbp is written below the stack.
> This is not for the stack, but to keep the registers as the pt_regs
> struct
> expects them. We'll be casting this structure later, and not doing that
> can lead to bogus code, whenever we cast regs->bp.
>
> > Second, if the current
> > stack is already the irqstack, this second copy is overwritten. Third,
> > as far as I can tell, ebp should not be saved in pt_regs at all at
> > this stage as it is 'preserved' due to the C calling conventions. So
> > I left this second copy out and everything seems to work fine. If it
> > wouldn't, all exceptions and NMIs would be dangerous anyhow, as far
> > as I can see.
> Try testing with a kernel with CONFIG_FRAME_POINTER turned on.
Hi Glauber,
Thanks for your reply, but I'm afraid I still don't see why this
second copy is needed.
A CONFIG_FRAME_POINTER configuration seems to be working fine. Of
course regs->bp will give a bogus value, but so will r12, r13,
r14, r15, and rbx. However, if you have CONFIG_FRAME_POINTER=y,
then rbp can be trusted to be used as a frame pointer... so one
should get it directly from the registers, not from regs->bp.
Could you give me a configuration that fails, and describe what
goes wrong? Probably the right fix is not to use regs->bp.
Greetings,
Alexander
> >
> > Signed-off-by: Alexander van Heukelum <[email protected]>
> > Cc: Glauber Costa <[email protected]>
--
Alexander van Heukelum
[email protected]
--
http://www.fastmail.fm - mmm... Fastmail...
On Mon, 17 Nov 2008 10:47:40 +0100, "Ingo Molnar" <[email protected]> said:
>
> * Alexander van Heukelum <[email protected]> wrote:
>
> > All blame goes to: color white,red "[^[:graph:]]+$"
> > in .nanorc ;).
> >
> > Signed-off-by: Alexander van Heukelum <[email protected]>
> > ---
> > arch/x86/kernel/entry_64.S | 200 ++++++++++++++++++++++----------------------
> > 1 files changed, 100 insertions(+), 100 deletions(-)
>
> applied to tip/x86/cleanups, thanks Alexander. This file still has a
> horrible code structure but now at least our eyes do not burn out when
> looking at it ;-)
>
> Ingo
Lots of thanks!
Alexander
--
Alexander van Heukelum
[email protected]
--
http://www.fastmail.fm - I mean, what is it about a decent email service?
On Mon, 17 Nov 2008 13:53:17 +0100, "Andi Kleen" <[email protected]>
said:
> On Sun, Nov 16, 2008 at 03:29:01PM +0100, Alexander van Heukelum wrote:
> > From: Alexander van Heukelum <[email protected]>
> >
> > The macro "interrupt" in entry_64.S generates a lot of code. This
> > patch moves most of its contents into an external function. It
> > saves anywhere between 500 and 2500 bytes of text depending on the
> > configuration.
>
> The only duplication is in the apicinterrupt entry points which
> have expanded recently (when I wrote all that there weren't as many)
>
> I think it would be cleaner to just have a common apic_interrupt
> entry point similar to how the exceptions work that try to factor
> out "interrupt" like this. As more and more of them get added
> (I have another new one in recent) patches that will likely
> save more space.
>
> The only ugly part is that passing the handler to the common
> stub requires the manual pt_regs setup that the exception
> handler currently does. Because that could be factored
> out in a new macro. Or just copied (I have heard complaints
> in the past that the file has too many macros already)
Hi Andi,
I liked this way of calling an external function, because it
circumvents exactly this ugly setup. For two bytes
extra per stub, one can make this setup function a completely
normal one (without relocating the return address on the stack).
I intended the stubs to fit in one cache line (32 bytes) as it
does not make much sense to make them smaller than that. A
further advantage is that no indirect call is needed, but maybe
this is not as slow anymore as it used to be? B.t.w., I intended
to change the exception handler stubs in a similar way to get
rid of the indirect call.
> > There is a comment in the original code about saving rbp twice, but
> > I don't understand what the code tries to do. First of all, the
>
> To be honest I didn't understand this one either when it was added. In
> standard frame pointer format rbp has to be at the place pointed to by
> the real
> rbp register with the return address directly on top of it. But pushing
> %rbp below the pt_regs doesn't put it into this format, because
> the return address is at the wrong place.
The second copy of %rbp is indeed placed at the 'correct' position
inside the pt_regs struct. However, at this point only a partial
stack frame is saved: the C argument registers and the scratch
registers. r12-r15, ebp, and rbx will be saved only later if necessary.
A problem could arise if some code uses pt_regs.bp of this partial
stack frame, because it will contain a bogus value. Glauber's patch
makes pt_regs.bp contain the right value most of the time... Which
means that if the patch fixed something for him, the problem has only
been made unlikely to happen. The place where things should be fixed
are the places where pt_regs.bp is used, but not filled in.
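For reference, the slot order in question, lowest stack address first
(my reading of the offsets used elsewhere in entry_64.S; the PT_* names
are made up, the kernel uses the asm-offsets constants):

        .set PT_R15, 0*8        /* callee-saved: not written by SAVE_ARGS */
        .set PT_R14, 1*8
        .set PT_R13, 2*8
        .set PT_R12, 3*8
        .set PT_RBP, 4*8        /* the slot the disputed second rbp copy fills */
        .set PT_RBX, 5*8        /* last callee-saved slot, right above rbp */
        .set PT_R11, 6*8        /* SAVE_ARGS fills this slot and everything above */
        .set PT_R10, 7*8
        .set PT_R9,  8*8
        .set PT_R8,  9*8
        .set PT_RAX, 10*8
        .set PT_RCX, 11*8
        .set PT_RDX, 12*8
        .set PT_RSI, 13*8
        .set PT_RDI, 14*8
        .set PT_ORIG_RAX, 15*8  /* error code or syscall number */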
Greetings,
Alexander
> -Andi
--
Alexander van Heukelum
[email protected]
--
http://www.fastmail.fm - Access your email from home and the web
The macro "interrupt" in entry_64.S generates a lot of code and it
is used more and more often. This patch moves most of its contents
into an external function. This saves anywhere between 500 and 2500
bytes of text depending on the configuration.
Dwarf2-annotations are most probably wrong or missing altogether.
v2 moves adjusting the stack to the caller. This avoids the ugly
shuffle to handle the position of the return address on the stack.
After this patch, a typical handler looks like this:
<thermal_interrupt>:
68 05 ff ff ff pushq $0xffffffffffffff05
48 83 ec 50 sub $0x50,%rsp
e8 72 f4 ff ff callq ffffffff80211260 <save_args>
e8 ec 08 00 00 callq ffffffff802126df <smp_thermal_interrupt>
e9 16 fd ff ff jmpq ffffffff80211b0e <ret_from_intr>
0f 1f 84 00 00 nopl 0x0(%rax,%rax,1)
00 00 00
I think this approach (v2) is much cleaner than using the same
strategy as for the exception handlers, where the address of
the C-handler is passed to a common entry point which makes
an indirect call to the handler.
<coprocessor_error>:
ff 15 f2 71 1c callq *0x1c71f2(%rip) # <pv_irq_ops+0x38>
00
6a 00 pushq $0x0
50 push %rax
48 8d 05 d9 11 lea 0x11d9(%rip),%rax # <do_coprocessor_error>
00 00
e9 4b 99 0f 00 jmpq ffffffff8030bbb0 <error_entry>
66 66 2e 0f 1f nopw %cs:0x0(%rax,%rax,1)
84 00 00 00 00 00
The advantage of _this_ way of doing things is that the stubs can
probably be made to fit in 16 bytes, but it comes at the cost of
doing an unnecessary indirect call.
Signed-off-by: Alexander van Heukelum <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Jan Beulich <[email protected]>
Cc: Thomas Gleixner <[email protected]>
arch/x86/kernel/entry_64.S | 135 ++++++++++++++++++++++++++------------------
1 files changed, 81 insertions(+), 54 deletions(-)
---
Hi all,
I just want to give this one more shot ;). Comments?
This patch is on top of tip/x86/cleanups and contains some left-over
whitespace changes.
Greetings,
Alexander
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5492778..d483e07 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -242,6 +242,78 @@ ENTRY(native_usergs_sysret64)
CFI_REL_OFFSET rsp,RSP
/*CFI_REL_OFFSET ss,SS*/
.endm
+
+/*
+ * initial frame state for interrupts and exceptions
+ */
+ .macro _frame ref
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-\ref
+ /*CFI_REL_OFFSET ss,SS-\ref*/
+ CFI_REL_OFFSET rsp,RSP-\ref
+ /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
+ /*CFI_REL_OFFSET cs,CS-\ref*/
+ CFI_REL_OFFSET rip,RIP-\ref
+ .endm
+
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+#define INTR_FRAME _frame RIP
+/*
+ * initial frame state for exceptions with error code (and interrupts
+ * with vector already pushed)
+ */
+#define XCPT_FRAME _frame ORIG_RAX
+
+/* save partial stack frame */
+ENTRY(save_args)
+ XCPT_FRAME
+ cld
+ movq %rdi, 8*8+16(%rsp)
+ CFI_REL_OFFSET rdi, 8*8+16
+ movq %rsi, 7*8+16(%rsp)
+ CFI_REL_OFFSET rsi, 7*8+16
+ movq %rdx, 6*8+16(%rsp)
+ CFI_REL_OFFSET rdx, 6*8+16
+ movq %rcx, 5*8+16(%rsp)
+ CFI_REL_OFFSET rcx, 5*8+16
+ movq %rax, 4*8+16(%rsp)
+ CFI_REL_OFFSET rax, 4*8+16
+ movq %r8, 3*8+16(%rsp)
+ CFI_REL_OFFSET r8, 3*8+16
+ movq %r9, 2*8+16(%rsp)
+ CFI_REL_OFFSET r9, 2*8+16
+ movq %r10, 1*8+16(%rsp)
+ CFI_REL_OFFSET r10, 1*8+16
+ movq %r11, 0*8+16(%rsp)
+ CFI_REL_OFFSET r11, 0*8+16
+ leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ movq %rbp, 8(%rsp) /* push %rbp */
+ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
+ testl $3, CS(%rdi)
+ je 1f
+ SWAPGS
+ /*
+ * irqcount is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl %gs:pda_irqcount
+ jne 2f
+ pop %rax /* move return address... */
+ mov %gs:pda_irqstackptr,%rsp
+ push %rax /* ... to the new stack */
+ /*
+ * We entered an interrupt context - irqs are off:
+ */
+2: TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+END(save_args)
+
/*
* A newly forked process directly context switches into this.
*/
@@ -608,65 +680,18 @@ ENTRY(stub_rt_sigreturn)
END(stub_rt_sigreturn)
/*
- * initial frame state for interrupts and exceptions
- */
- .macro _frame ref
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-\ref
- /*CFI_REL_OFFSET ss,SS-\ref*/
- CFI_REL_OFFSET rsp,RSP-\ref
- /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
- /*CFI_REL_OFFSET cs,CS-\ref*/
- CFI_REL_OFFSET rip,RIP-\ref
- .endm
-
-/* initial frame state for interrupts (and exceptions without error code) */
-#define INTR_FRAME _frame RIP
-/* initial frame state for exceptions with error code (and interrupts with
- vector already pushed) */
-#define XCPT_FRAME _frame ORIG_RAX
-
-/*
* Interrupt entry/exit.
*
* Interrupt entry points save only callee clobbered registers in fast path.
- *
- * Entry runs with interrupts off.
- */
+ *
+ * Entry runs with interrupts off.
+ */
-/* 0(%rsp): interrupt number */
+/* 0(%rsp): interrupt number */
.macro interrupt func
- cld
- SAVE_ARGS
- leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
- pushq %rbp
- /*
- * Save rbp twice: One is for marking the stack frame, as usual, and the
- * other, to fill pt_regs properly. This is because bx comes right
- * before the last saved register in that structure, and not bp. If the
- * base pointer were in the place bx is today, this would not be needed.
- */
- movq %rbp, -8(%rsp)
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbp, 0
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- testl $3,CS(%rdi)
- je 1f
- SWAPGS
- /* irqcount is used to check if a CPU is already on an interrupt
- stack or not. While this is essentially redundant with preempt_count
- it is a little cheaper to use a separate counter in the PDA
- (short of moving irq_enter into assembly, which would be too
- much work) */
-1: incl %gs:pda_irqcount
- cmoveq %gs:pda_irqstackptr,%rsp
- push %rbp # backlink for old unwinder
- /*
- * We entered an interrupt context - irqs are off:
- */
- TRACE_IRQS_OFF
+ subq $10*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 10*8
+ call save_args
call \func
.endm
@@ -806,6 +831,8 @@ END(common_interrupt)
/*
* APIC interrupts.
*/
+ .p2align 5
+
.macro apicinterrupt num,func
INTR_FRAME
pushq $~(\num)
> I liked this way of calling an external function, because it
> circumvents exactly most of this ugly setup. For two bytes
It's not too ugly, just one field is swapped.
> extra per stub, one can make this setup function a completely
> normal one (without relocating the return address on the stack).
> I intended the stubs to fit in one cache line (32 bytes) as it
Cache lines are 64 bytes on pretty much all modern x86 CPUs.
> does not make much sense to make them smaller than that. A
> further advantage is that no indirect call is needed, but maybe
> this is not as slow anymore as it used to be? B.t.w., I intended
It depends on the CPU. But always it requires resources which
someone else might have already swamped.
> to change the exception handler stubs in a similar way to get
> rid of the indirect call.
Ok. Hopefully it's worth the effort. The branch misprediction bubble
should not be too bad, perhaps it'll make up for the other cycles
you're adding. But even if it doesn't decreasing cache line foot print is
always a good thing.
> The second copy of %rbp is indeed placed at the 'correct' position
> inside the pt_regs struct. However, at this point only a partial
> stack frame is saved: the C argument registers and the scratch
> registers. r12-r15, ebp, and rbx will be saved only later if necessary.
> A problem could arise if some code uses pt_regs.bp of this partial
> stack frame, because it will contain a bogus value. Glauber's patch
Well they shouldn't --
> makes pt_regs.bp contain the right value most of the time... Which
> means that if the patch fixed something for him, the problem has only
> been made unlikely to happen. The place where things should be fixed
> are the places where pt_regs.bp is used, but not filled in.
Hmm I first thought it was rather so that backtracing over the exception
stubs with FPs works better. But then it didn't even set up
a correct frame for that so it might have been something else.
The original design was that you should only use these extended
registers in special calls that go through the PTREGS stubs.
But then sysprof/oprofile application profiling was added which
wants to do user space backtracing using FPs (which seemed pointless to me
because most/all of 64bit user space and recently also pretty
much all 32bit user space is compiled without FPs by default).
The only way to make this work, though, is to always save RBP.
But there is no need to do the strange double saving; you
can just always store it directly.
But I personally have doubts its worth making every interrupt
/syscall slower since it won't work with most apps anyways.
The only sure way to do these backtraces is to do dwarf2 unwinding
in user space too which doesn't need hacks like this.
-Andi
[Andi Kleen - Mon, Nov 17, 2008 at 07:23:40PM +0100]
...
|
| Ok. Hopefully it's worth the effort. The branch misprediction bubble
| should not be too bad, perhaps it'll make up for the other cycles
| you're adding. But even if it doesn't decreasing cache line foot print is
| always a good thing.
...
May I turn in? :)
Original .macro interrupt func has
testl $3, CS(%rdi)
je 1f
SWAPGS
1: incl %gs:pda_irqcount
jne 2f
pop %rax
mov %gs:pda_irqstackptr,%rsp
push %rax
2: TRACE_IRQS_OFF
Wouldn't we help the branch predictor a bit by
testl $3, CS(%rdi)
je 1f
SWAPGS
jne 2f
1: pop %rax
mov %gs:pda_irqstackptr,%rsp
push %rax
2: incl %gs:pda_irqcount
TRACE_IRQS_OFF
I hope I didn't miss anything, but maybe it's just churn
or plain wrong. Don't shoot me :)
- Cyrill -
[Cyrill Gorcunov - Mon, Nov 17, 2008 at 10:22:16PM +0300]
| [Andi Kleen - Mon, Nov 17, 2008 at 07:23:40PM +0100]
| ...
| |
| | Ok. Hopefully it's worth the effort. The branch misprediction bubble
| | should not be too bad, perhaps it'll make up for the other cycles
| | you're adding. But even if it doesn't decreasing cache line foot print is
| | always a good thing.
| ...
|
| May I turn in? :)
|
I meant - turn into thread not "turn in" (ouch).
- Cyrill -
On Mon, 17 Nov 2008 22:22:16 +0300, "Cyrill Gorcunov"
<[email protected]> said:
> [Andi Kleen - Mon, Nov 17, 2008 at 07:23:40PM +0100]
> ...
> |
> | Ok. Hopefully it's worth the effort. The branch misprediction bubble
> | should not be too bad, perhaps it'll make up for the other cycles
> | you're adding. But even if it doesn't decreasing cache line foot print
> is
> | always a good thing.
> ...
>
> May I turn in? :)
Sure!
> Original .macro interrupt func has
>
> testl $3, CS(%rdi)
> je 1f
> SWAPGS
> 1: incl %gs:pda_irqcount
inc changes flags...
> jne 2f
> pop %rax
> mov %gs:pda_irqstackptr,%rsp
> push %rax
> 2: TRACE_IRQS_OFF
>
> Wouldn't we help the branch predictor a bit by
>
> testl $3, CS(%rdi)
> je 1f
> SWAPGS
> jne 2f
... so this is not the same.
> 1: pop %rax
> mov %gs:pda_irqstackptr,%rsp
> push %rax
> 2: incl %gs:pda_irqcount
> TRACE_IRQS_OFF
>
> I hope I didn't miss anything, but maybe it's just churn
> or plain wrong. Don't shoot me :)
I'm sure it helps the branch predictor, though ;).
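Spelled out, with the flag dependencies annotated (je/jne test ZF, and
incl rewrites ZF even though it leaves CF alone):

        testl $3, CS(%rdi)      # sets ZF from the low bits of the saved CS
        je 1f                   # this je consumes testl's ZF
        SWAPGS
1:      incl %gs:pda_irqcount   # rewrites ZF (result == 0?), CF untouched
        jne 2f                  # this jne consumes incl's ZF, not testl's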
Alexander
> - Cyrill -
--
Alexander van Heukelum
[email protected]
--
http://www.fastmail.fm - A fast, anti-spam email service.
On Mon, 17 Nov 2008 22:29:41 +0300, "Cyrill Gorcunov"
<[email protected]> said:
> [Cyrill Gorcunov - Mon, Nov 17, 2008 at 10:22:16PM +0300]
> | [Andi Kleen - Mon, Nov 17, 2008 at 07:23:40PM +0100]
> | ...
> | |
> | | Ok. Hopefully it's worth the effort. The branch misprediction bubble
> | | should not be too bad, perhaps it'll make up for the other cycles
> | | you're adding. But even if it doesn't decreasing cache line foot
> print is
> | | always a good thing.
> | ...
> |
> | May I turn in? :)
> |
>
> I meant - turn into thread not "turn in" (ouch).
Even more likely is that you meant: "May I jump in" ;)
> - Cyrill -
--
Alexander van Heukelum
[email protected]
--
http://www.fastmail.fm - Access all of your messages and folders
wherever you are
[Alexander van Heukelum - Mon, Nov 17, 2008 at 08:43:26PM +0100]
| http://www.fastmail.fm - A fast, anti-spam email service.
|
...
(me: gone out in deep shame)
- Cyrill -
[Alexander van Heukelum - Mon, Nov 17, 2008 at 08:49:35PM +0100]
| On Mon, 17 Nov 2008 22:29:41 +0300, "Cyrill Gorcunov"
| <[email protected]> said:
| > [Cyrill Gorcunov - Mon, Nov 17, 2008 at 10:22:16PM +0300]
| > | [Andi Kleen - Mon, Nov 17, 2008 at 07:23:40PM +0100]
| > | ...
| > | |
| > | | Ok. Hopefully it's worth the effort. The branch misprediction bubble
| > | | should not be too bad, perhaps it'll make up for the other cycles
| > | | you're adding. But even if it doesn't decreasing cache line foot
| > print is
| > | | always a good thing.
| > | ...
| > |
| > | May I turn in? :)
| > |
| >
| > I meant - turn into thread not "turn in" (ouch).
|
| Even more likely is that you meant: "May I jump in" ;)
|
| > - Cyrill -
| --
| Alexander van Heukelum
| [email protected]
|
| --
| http://www.fastmail.fm - Access all of your messages and folders
| wherever you are
|
It doesn't matter since I was killed out :) But
I have some argument to justify myself -- je and jne
don't depend on CF :-)
- Cyrill -
>>> Alexander van Heukelum <[email protected]> 17.11.08 18:52 >>>
>The macro "interrupt" in entry_64.S generates a lot of code and it
>is used more and more often. This patch moves most of its contents
>into an external function. This saves anywhere between 500 and 2500
>bytes of text depending on the configuration.
Without numbers, I'd doubt that the code size reduction outweighs the
overhead of the extra call. Did you do any measurements?
>Dwarf2-annotations are most probably wrong or missing altogether.
Indeed - do you have intentions to address this?
Jan
On Tue, Nov 18, 2008 at 08:09:28AM +0000, Jan Beulich wrote:
> >>> Alexander van Heukelum <[email protected]> 17.11.08 18:52 >>>
> >The macro "interrupt" in entry_64.S generates a lot of code and it
> >is used more and more often. This patch moves most of its contents
> >into an external function. This saves anywhere between 500 and 2500
> >bytes of text depending on the configuration.
>
> Without numbers, I'd doubt that the code size reduction outweighs the
> overhead of the extra call. Did you do any measurements?
I have not measured yet. I'll see if I can highjack a machine. What
kind of benchmark would you suggest?
> >Dwarf2-annotations are most probably wrong or missing altogether.
>
> Indeed - do you have intentions to address this?
Yes, I'd like to get it right. What do you use to check the
annotations?
Greetings,
Alexander
> Jan
>
>>> Alexander van Heukelum <[email protected]> 18.11.08 12:16 >>>
>> >Dwarf2-annotations are most probably wrong or missing altogether.
>>
>> Indeed - do you have intentions to address this?
>
>Yes, I'd like to get it right. What do you use to check the
>annotations?
No tool, if you mean that. Extensive changes I verify by looking at the
dump, problems are usually found only when back traces don't come out
right.
Jan
* Jan Beulich <[email protected]> wrote:
> >>> Alexander van Heukelum <[email protected]> 18.11.08 12:16 >>>
> >> >Dwarf2-annotations are most probably wrong or missing altogether.
> >>
> >> Indeed - do you have intentions to address this?
> >
> > Yes, I'd like to get it right. What do you use to check the
> > annotations?
>
> No tool, if you mean that. Extensive changes I verify by looking at
> the dump, problems are usually found only when back traces don't
> come out right.
that's a fundamental weakness of all the CFI annotations.
It is outright wrong to waste humans on this mechanical task: as it is
abundantly clear to GAS where we change a stack pointer and by how
much - it could emit magic annotations automatically just as much.
So if you care about it, please fix this in the tools space. The
entry_64.S impact of finegrained annotations is just too ugly for
things like this.
One limited exception is for basic stack frames where we do syscalls
or call into other C code. (i.e. the patch proposed here would have to
do that limited annotation)
But the per instruction annotations currently in that code are madness
and must either be cleaned up significantly via the use of GAS macros
(so that all stack pointer manipulations go via a single macro
invocation), or be completely auto-generated by GAS.
Ingo
>>> Ingo Molnar <[email protected]> 18.11.08 15:03 >>>
>* Jan Beulich <[email protected]> wrote:
>> No tool, if you mean that. Extensive changes I verify by looking at
>> the dump, problems are usually found only when back traces don't
>> come out right.
>
>that's a fundamental weakness of all the CFI annotations.
>
>It is outright wrong to waste humans on this mechanical task: as it is
This part I agree to.
>abundantly clear to GAS where we change a stack pointer and by how
>much - it could emit magic annotations automatically just as much.
>
>So if you care about it, please fix this in the tools space. The
>entry_64.S impact of finegrained annotations is just too ugly for
>things like this.
>
>One limited exception is for basic stack frames where we do syscalls
>or call into other C code. (i.e. the patch proposed here would have to
>do that limited annotation)
>
>But the per instruction annotations currently in that code are madness
>and must either be cleaned up significantly via the use of GAS macros
>(so that all stack pointer manipulations go via a single macro
>invocation), or be completely auto-generated by GAS.
Making gas auto-generate this is not really possible (much like ia64
requires the annotations to be inserted manually), mainly because gas
can't know whether e.g. a push of a register is in order to preserve its
value, or for some other purpose.
I do have a set of macros for this in nlkd, maybe (as you're asking for it)
I should get them out of there (and convert them to AT&T syntax).
Jan
* Jan Beulich <[email protected]> wrote:
> >>> Ingo Molnar <[email protected]> 18.11.08 15:03 >>>
> >* Jan Beulich <[email protected]> wrote:
> >> No tool, if you mean that. Extensive changes I verify by looking at
> >> the dump, problems are usually found only when back traces don't
> >> come out right.
> >
> >that's a fundamental weakness of all the CFI annotations.
> >
> >It is outright wrong to waste humans on this mechanical task: as it is
>
> This part I agree to.
>
> >abundantly clear to GAS where we change a stack pointer and by how
> >much - it could emit magic annotations automatically just as much.
> >
> >So if you care about it, please fix this in the tools space. The
> >entry_64.S impact of finegrained annotations is just too ugly for
> >things like this.
> >
> >One limited exception is for basic stack frames where we do syscalls
> >or call into other C code. (i.e. the patch proposed here would have to
> >do that limited annotation)
> >
> >But the per instruction annotations currently in that code are madness
> >and must either be cleaned up significantly via the use of GAS macros
> >(so that all stack pointer manipulations go via a single macro
> >invocation), or be completely auto-generated by GAS.
>
> Making gas auto-generate this is not really possible (much like ia64
> requires the annotations to be inserted manually), mainly because
> gas can't know whether e.g. a push of a register is in order to
> preserve its value, or for some other purpose.
but that's the exception. Most of the annotations could be
auto-generated.
> I do have a set of macros for this in nlkd, maybe (as you're asking
> for it) I should get them out of there (and convert them to AT&T
> syntax).
i'd definitely like to have a look ...
if you can make this clean enough, most of the resistance to CFI
annotations will go away.
The requirement is extreme cleanliness: a single line in the source
that gets us _both_ the instruction and the annotation. Also always
insert the proper frame pointer as well, when we call into C. Make it
as hard as possible to mess up the annotations - we could even run a
build-time grep on the .S files that matter to see whether there are any
(common) "naked" stack-manipulating instructions that shouldn't be
used.
Ingo
> but that's the exception. Most of the annotations could be
> auto-generated.
Not really. An implicit .cfi_undefined can be auto-generated for an
unannotated instruction with an output register. An implicit .cfi_register
can be auto-generated for an unannotated register-to-register move. An
implicit .cfi_same_value can be auto-generated when you can tell a register
is being written with the register or stack slot that the current CFI state
says holds the caller's value of that register. Beyond that, it gets into
either assumptions or hairy analysis of how stack slots are being used and
so forth.
I don't think auto-generation is a very useful angle to take for this any
time soon. Explicit (but simple) macros in the assembly is what I favor.
Thanks,
Roland
> I don't think auto-generation is a very useful angle to take for this any
> time soon. Explicit (but simple) macros in the assembly is what I favor.
Do you mean macros that generate both the instruction and the CFI
or separate? The major disadvantage of doing it together in a
single macro is that it is not really readable for any assembler
programmer anymore, but starts becoming a Linux specific assembler
language. Likely not a good thing for maintenance. anyone who
wants to know the real assembler would need to read objdump -S
output, which is not nice.
Perhaps it would be a reasonable readability improvement to just use shorter
cfi macros which are not shouted?
-Andi
Andi Kleen wrote:
>> I don't think auto-generation is a very useful angle to take for this any
>> time soon. Explicit (but simple) macros in the assembly is what I favor.
>>
>
> Do you mean macros that generate both the instruction and the CFI
> or separate? The major disadvantage of doing it together in a
> single macro is that it is not really readable for any assembler
> programmer anymore, but starts becoming a Linux specific assembler
> language. Likely not a good thing for maintenance. anyone who
> wants to know the real assembler would need to read objdump -S
> output, which is not nice.
>
> Perhaps it would be a reasonable readability improvement to just use shorter
> cfi macros which are not shouted?
Not really. At the moment we have two parallel assembly languages which
say different things about the same instructions. In practice, almost
nobody understands the cfi parts, so they just get ignored while the x86
instructions change around them, leaving them either stale or missing.
If we had a sensible macro layer which emits both instructions and cfi
annotations, it at least means that people who write plain x86
instructions will simply get no annotations, and people who bother to
learn the (clearly and fully documented) macros will get the best of both.
J
Yes, I mean a single macro that produces both the instruction and the CFI
pseudo-op to go with it. This is the essential characteristic that makes
it an improvement for maintaining the code. The main problem we have now
is that it's easy to write/modify plain assembly instructions and forget to
add or update the CFI to match. A well-considered set of macros can solve
this without making it any harder for the average assembly programmer to
understand what each line of the source means intuitively.
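For instance, a rough sketch (the name is made up, not a concrete
proposal):

.macro push_cfi arg
pushq \arg
CFI_ADJUST_CFA_OFFSET 8
.endm

Then a single source line like "push_cfi %rdi" carries both the
instruction and its unwind annotation, so the two cannot drift apart.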
Thanks,
Roland
Jeremy Fitzhardinge wrote:
>
> Not really. At the moment we have two parallel assembly languages which
> say different things about the same instructions. In practice, almost
> nobody understands the cfi parts, so they just get ignored while the x86
> instructions change around them, leaving them either stale or missing.
>
> If we had a sensible macro layer which emits both instructions and cfi
> annotations, it at least means that people who write plain x86
> instructions will simply get no annotations, and people who bother to
> learn the (clearly and fully documented) macros will get the best of both.
>
I think that it would be nice to have macros for the most commonly
annotatable instructions, e.g. push, and stack pointer movement. Just
compactifying the code should improve readability, if perhaps not
writability.
-hpa
On Tue, Nov 18, 2008 at 03:45:11PM -0800, Roland McGrath wrote:
> Yes, I mean a single macro that produces both the instruction and the CFI
> pseudo-op to go with it. This is the essential characteristic that makes
> it an improvement for maintaining the code. The main problem we have now
> is that it's easy to write/modify plain assembly instructions and forget to
> add or update the CFI to match. A well-considered set of macros can solve
> this without making it any harder for the average assembly programmer to
> understand what each line of the source means intuitively.
Hmm, but if the assembler cannot auto-generate it, how should the assembler
writer know if he should use the macro or the direct instruction without
understanding CFI?
Also what will the assembler reader do? Do they first have to understand
CFI to understand everything? I personally would probably just
resort to objdump -S in this situation.
I think you're saying that for the user the macros would be just
equivalent, but if that's true they could be just auto-generated
by the assembler. But it's obviously not, so you'll end up
with the Linux magic asm dialect (and its maintenance disadvantages)
and you'll still require CFI knowledge to understand/write everything
anyways.
-Andi
Andi Kleen wrote:
>
> Hmm, but if the assembler cannot auto-generate it, how should the assembler
> writer know if he should use the macro or the direct instruction without
> understanding CFI?
>
> Also what will the assembler reader do? Do they first have to understand
> CFI to understand everything? I personally would probably just
> resort to objdump -S in this situation.
>
> I think you're saying that for the user the macros would be just
> equivalent, but if that's true they could be just auto-generated
> by the assembler. But it's obviously not, so you'll end up
> with the Linux magic asm dialect (and its maintenance disadvantages)
> and you'll still require CFI knowledge to understand/write everything
> anyways.
>
We already have a "Linux magic asm dialect" which require CFI knowledge.
Nothing can change that other than dumping the requirement that we
have valid CFI data. However, the current code is hard to read and easy
to trip up on. We can at least make it easier, especially to read --
and making it easier to read will help writers, too.
-hpa
H. Peter Anvin wrote:
> Jeremy Fitzhardinge wrote:
>>
>> Not really. At the moment we have two parallel assembly languages
>> which say different things about the same instructions. In practice,
>> almost nobody understands the cfi parts, so they just get ignored
>> while the x86 instructions change around them, leaving them either
>> stale or missing.
>>
>> If we had a sensible macro layer which emits both instructions and
>> cfi annotations, it at least means that people who write plain x86
>> instructions will simply get no annotations, and people who bother to
>> learn the (clearly and fully documented) macros will get the best of
>> both.
>>
>
> I think that it would be nice to have macros for the most commonly
> annotatable instructions, e.g. push, and stack pointer movement. Just
> compactifying the code should improve readability, if perhaps not
> writability.
Yes. Something that obviously relates to both the instruction and the
semantic intent of the annotation: add_sp, sub_sp, save_reg, etc. And
at least that will eliminate the differently-signed(!) constant for
stack movement.
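Roughly, as an illustrative sketch (not a tested patch):

.macro sub_sp n
subq $\n, %rsp
CFI_ADJUST_CFA_OFFSET \n
.endm

.macro add_sp n
addq $\n, %rsp
CFI_ADJUST_CFA_OFFSET -\n
.endm

.macro save_reg reg offset
movq %\reg, \offset(%rsp)
CFI_REL_OFFSET \reg, \offset
.endm

add_sp and sub_sp would take the same positive constant, with the sign
flip for the CFA offset hidden inside the macro.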
J
>
> -hpa
Hi all,
Here is a combined patch that moves "save_args" out-of-line for
the interrupt macro and moves "error_entry" mostly out-of-line
for the zeroentry and errorentry macros.
The save_args function becomes really straightforward and easy
to understand, with the possible exception of the stack switch
code, which now needs to copy the return address of the
calling function. Normal interrupts arrive with ((~vector)+0x80)
on the stack, which gets adjusted in common_interrupt:
<common_interrupt>:
(5) addq $0xffffffffffffff80,(%rsp) /* -> ~(vector) */
(4) sub $0x50,%rsp /* space for registers */
(5) callq ffffffff80211290 <save_args>
(5) callq ffffffff80214290 <do_IRQ>
<ret_from_intr>:
...
An APIC interrupt stub now looks like this:
<thermal_interrupt>:
(5) pushq $0xffffffffffffff05 /* ~(vector) */
(4) sub $0x50,%rsp /* space for registers */
(5) callq ffffffff80211290 <save_args>
(5) callq ffffffff80212b8f <smp_thermal_interrupt>
(5) jmpq ffffffff80211f93 <ret_from_intr>
Similarly, the exception handler register saving function becomes
simpler, without the need for any parameter shuffling. The stub
for an exception without errorcode looks like this:
<overflow>:
(6) callq *0x1cad12(%rip) # ffffffff803dd448 <pv_irq_ops+0x38>
(2) pushq $0xffffffffffffffff /* no syscall */
(4) sub $0x78,%rsp /* space for registers */
(5) callq ffffffff8030e3b0 <error_entry>
(3) mov %rsp,%rdi /* pt_regs pointer */
(2) xor %esi,%esi /* no error code */
(5) callq ffffffff80213446 <do_overflow>
(5) jmpq ffffffff8030e460 <error_exit>
And one for an exception with errorcode like this:
<segment_not_present>:
(6) callq *0x1cab92(%rip) # ffffffff803dd448 <pv_irq_ops+0x38>
(4) sub $0x78,%rsp /* space for registers */
(5) callq ffffffff8030e3b0 <error_entry>
(3) mov %rsp,%rdi /* pt_regs pointer */
(5) mov 0x78(%rsp),%rsi /* load error code */
(9) movq $0xffffffffffffffff,0x78(%rsp) /* no syscall */
(5) callq ffffffff80213209 <do_segment_not_present>
(5) jmpq ffffffff8030e460 <error_exit>
Unfortunately, this last type is more than 32 bytes. But the total space
saving due to this patch is about 2500 bytes on an SMP configuration,
and I think the code is clearer than it was before. The tested kernels
were non-paravirt ones (i.e., without the indirect call at the top of
the exception handlers).
Anyhow, I tested this patch on top of a recent -tip. The machine
was a 2x4-core Xeon at 2333 MHz. Measured were the delays between
(almost-)adjacent rdtsc instructions. The graphs show how much
time is spent outside of the program as a function of the measured
delay. The area under the graph represents the total time spent
outside the program. Eight instances of the rdtsctest were
started, each pinned to a single CPU. The histograms were added together.
For each kernel two measurements were done: one in mostly idle
condition, the other while running "bonnie++ -f", bound to cpu 0.
Each measurement took 40 minutes runtime. See the attached graphs
for the results. The graphs overlap almost everywhere, but there
are small differences.
CFI annotations are missing or wrong. I intend to try to fix
that, but I'd like comments on the approach first.
Signed-off-by: Alexander van Heukelum <[email protected]>
---
arch/x86/kernel/entry_64.S | 300 ++++++++++++++++++++++++--------------------
1 files changed, 166 insertions(+), 134 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 936d296..e22502a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -247,6 +247,78 @@ ENTRY(native_usergs_sysret64)
CFI_REL_OFFSET rsp,RSP
/*CFI_REL_OFFSET ss,SS*/
.endm
+
+/*
+ * initial frame state for interrupts and exceptions
+ */
+ .macro _frame ref
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,SS+8-\ref
+ /*CFI_REL_OFFSET ss,SS-\ref*/
+ CFI_REL_OFFSET rsp,RSP-\ref
+ /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
+ /*CFI_REL_OFFSET cs,CS-\ref*/
+ CFI_REL_OFFSET rip,RIP-\ref
+ .endm
+
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+#define INTR_FRAME _frame RIP
+/*
+ * initial frame state for exceptions with error code (and interrupts
+ * with vector already pushed)
+ */
+#define XCPT_FRAME _frame ORIG_RAX
+
+/* save partial stack frame */
+ENTRY(save_args)
+ XCPT_FRAME
+ cld
+ movq %rdi, 8*8+16(%rsp)
+ CFI_REL_OFFSET rdi, 8*8+16
+ movq %rsi, 7*8+16(%rsp)
+ CFI_REL_OFFSET rsi, 7*8+16
+ movq %rdx, 6*8+16(%rsp)
+ CFI_REL_OFFSET rdx, 6*8+16
+ movq %rcx, 5*8+16(%rsp)
+ CFI_REL_OFFSET rcx, 5*8+16
+ movq %rax, 4*8+16(%rsp)
+ CFI_REL_OFFSET rax, 4*8+16
+ movq %r8, 3*8+16(%rsp)
+ CFI_REL_OFFSET r8, 3*8+16
+ movq %r9, 2*8+16(%rsp)
+ CFI_REL_OFFSET r9, 2*8+16
+ movq %r10, 1*8+16(%rsp)
+ CFI_REL_OFFSET r10, 1*8+16
+ movq %r11, 0*8+16(%rsp)
+ CFI_REL_OFFSET r11, 0*8+16
+ leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ movq %rbp, 8(%rsp) /* push %rbp */
+ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
+ testl $3, CS(%rdi)
+ je 1f
+ SWAPGS
+ /*
+ * irqcount is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl %gs:pda_irqcount
+ jne 2f
+ pop %rax /* move return address... */
+ mov %gs:pda_irqstackptr,%rsp
+ push %rax /* ... to the new stack */
+ /*
+ * We entered an interrupt context - irqs are off:
+ */
+2: TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+END(save_args)
+
/*
* A newly forked process directly context switches into this.
*/
@@ -615,26 +687,6 @@ ENTRY(stub_rt_sigreturn)
END(stub_rt_sigreturn)
/*
- * initial frame state for interrupts and exceptions
- */
- .macro _frame ref
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-\ref
- /*CFI_REL_OFFSET ss,SS-\ref*/
- CFI_REL_OFFSET rsp,RSP-\ref
- /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
- /*CFI_REL_OFFSET cs,CS-\ref*/
- CFI_REL_OFFSET rip,RIP-\ref
- .endm
-
-/* initial frame state for interrupts (and exceptions without error code) */
-#define INTR_FRAME _frame RIP
-/* initial frame state for exceptions with error code (and interrupts with
- vector already pushed) */
-#define XCPT_FRAME _frame ORIG_RAX
-
-/*
* Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a
* single cache line on all modern x86 implementations.
@@ -674,46 +726,19 @@ END(irq_entries_start)
END(interrupt)
.previous
-/*
+/*
* Interrupt entry/exit.
*
* Interrupt entry points save only callee clobbered registers in fast path.
- *
- * Entry runs with interrupts off.
- */
+ *
+ * Entry runs with interrupts off.
+ */
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- cld
- SAVE_ARGS
- leaq -ARGOFFSET(%rsp),%rdi /* arg1 for handler */
- pushq %rbp
- /*
- * Save rbp twice: One is for marking the stack frame, as usual, and the
- * other, to fill pt_regs properly. This is because bx comes right
- * before the last saved register in that structure, and not bp. If the
- * base pointer were in the place bx is today, this would not be needed.
- */
- movq %rbp, -8(%rsp)
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbp, 0
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- testl $3,CS(%rdi)
- je 1f
- SWAPGS
- /* irqcount is used to check if a CPU is already on an interrupt
- stack or not. While this is essentially redundant with preempt_count
- it is a little cheaper to use a separate counter in the PDA
- (short of moving irq_enter into assembly, which would be too
- much work) */
-1: incl %gs:pda_irqcount
- cmoveq %gs:pda_irqstackptr,%rsp
- push %rbp # backlink for old unwinder
- /*
- * We entered an interrupt context - irqs are off:
- */
- TRACE_IRQS_OFF
+ subq $10*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 10*8
+ call save_args
call \func
.endm
@@ -859,6 +884,8 @@ END(common_interrupt)
/*
* APIC interrupts.
*/
+ .p2align 5
+
.macro apicinterrupt num,func
INTR_FRAME
pushq $~(\num)
@@ -929,24 +956,29 @@ END(spurious_interrupt)
.macro zeroentry sym
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0 /* push error code/oldrax */
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
CFI_ADJUST_CFA_OFFSET 8
- pushq %rax /* push real oldrax to the rdi slot */
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rax,0
- leaq \sym(%rip),%rax
- jmp error_entry
+ subq $15*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call error_entry
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ call \sym
+ jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
.endm
.macro errorentry sym
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq %rax
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rax,0
- leaq \sym(%rip),%rax
- jmp error_entry
+ subq $15*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call error_entry
+ movq %rsp,%rdi /* pt_regs pointer */
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ call \sym
+ jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
.endm
@@ -1050,93 +1082,93 @@ paranoid_schedule\trace:
.endm
/*
- * Exception entry point. This expects an error code/orig_rax on the stack
- * and the exception handler in %rax.
+ * Exception entry point. This expects an error code/orig_rax on the stack.
+ * returns in "no swapgs flag" in %ebx.
*/
KPROBE_ENTRY(error_entry)
_frame RDI
- CFI_REL_OFFSET rax,0
- /* rdi slot contains rax, oldrax contains error code */
+ CFI_ADJUST_CFA_OFFSET 15*8
+ /* oldrax contains error code */
cld
- subq $14*8,%rsp
- CFI_ADJUST_CFA_OFFSET (14*8)
- movq %rsi,13*8(%rsp)
- CFI_REL_OFFSET rsi,RSI
- movq 14*8(%rsp),%rsi /* load rax from rdi slot */
- CFI_REGISTER rax,rsi
- movq %rdx,12*8(%rsp)
- CFI_REL_OFFSET rdx,RDX
- movq %rcx,11*8(%rsp)
- CFI_REL_OFFSET rcx,RCX
- movq %rsi,10*8(%rsp) /* store rax */
- CFI_REL_OFFSET rax,RAX
- movq %r8, 9*8(%rsp)
- CFI_REL_OFFSET r8,R8
- movq %r9, 8*8(%rsp)
- CFI_REL_OFFSET r9,R9
- movq %r10,7*8(%rsp)
- CFI_REL_OFFSET r10,R10
- movq %r11,6*8(%rsp)
- CFI_REL_OFFSET r11,R11
- movq %rbx,5*8(%rsp)
- CFI_REL_OFFSET rbx,RBX
- movq %rbp,4*8(%rsp)
- CFI_REL_OFFSET rbp,RBP
- movq %r12,3*8(%rsp)
- CFI_REL_OFFSET r12,R12
- movq %r13,2*8(%rsp)
- CFI_REL_OFFSET r13,R13
- movq %r14,1*8(%rsp)
- CFI_REL_OFFSET r14,R14
- movq %r15,(%rsp)
- CFI_REL_OFFSET r15,R15
+ movq %rdi,14*8+8(%rsp)
+ CFI_REL_OFFSET rdi,RDI+8
+ movq %rsi,13*8+8(%rsp)
+ CFI_REL_OFFSET rsi,RSI+8
+ movq %rdx,12*8+8(%rsp)
+ CFI_REL_OFFSET rdx,RDX+8
+ movq %rcx,11*8+8(%rsp)
+ CFI_REL_OFFSET rcx,RCX+8
+ movq %rax,10*8+8(%rsp)
+ CFI_REL_OFFSET rax,RAX+8
+ movq %r8, 9*8+8(%rsp)
+ CFI_REL_OFFSET r8,R8+8
+ movq %r9, 8*8+8(%rsp)
+ CFI_REL_OFFSET r9,R9+8
+ movq %r10,7*8+8(%rsp)
+ CFI_REL_OFFSET r10,R10+8
+ movq %r11,6*8+8(%rsp)
+ CFI_REL_OFFSET r11,R11+8
+ movq %rbx,5*8+8(%rsp)
+ CFI_REL_OFFSET rbx,RBX+8
+ movq %rbp,4*8+8(%rsp)
+ CFI_REL_OFFSET rbp,RBP+8
+ movq %r12,3*8+8(%rsp)
+ CFI_REL_OFFSET r12,R12+8
+ movq %r13,2*8+8(%rsp)
+ CFI_REL_OFFSET r13,R13+8
+ movq %r14,1*8+8(%rsp)
+ CFI_REL_OFFSET r14,R14+8
+ movq %r15,0*8+8(%rsp)
+ CFI_REL_OFFSET r15,R15+8
xorl %ebx,%ebx
- testl $3,CS(%rsp)
- je error_kernelspace
+ testl $3,CS+8(%rsp)
+ je error_kernelspace
error_swapgs:
SWAPGS
error_sti:
TRACE_IRQS_OFF
- movq %rdi,RDI(%rsp)
- CFI_REL_OFFSET rdi,RDI
- movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp)
- call *%rax
- /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
-error_exit:
+ ret
+ CFI_ENDPROC
+
+/*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. The exception handlers after iret run with
+ * kernel gs again, so don't set the user space flag. B stepping K8s
+ * sometimes report an truncated RIP for IRET exceptions returning to
+ * compat mode. Check for these here too.
+ */
+error_kernelspace:
+ incl %ebx
+ leaq irq_return(%rip),%rcx
+ cmpq %rcx,RIP+8(%rsp)
+ je error_swapgs
+ movl %ecx,%ecx /* zero extend */
+ cmpq %rcx,RIP+8(%rsp)
+ je error_swapgs
+ cmpq $gs_change,RIP+8(%rsp)
+ je error_swapgs
+ jmp error_sti
+KPROBE_END(error_entry)
+
+
+/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
+KPROBE_ENTRY(error_exit)
+ _frame R15
movl %ebx,%eax
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
testl %eax,%eax
- jne retint_kernel
+ jne retint_kernel
LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
+ movl TI_flags(%rcx),%edx
+ movl $_TIF_WORK_MASK,%edi
+ andl %edi,%edx
+ jnz retint_careful
jmp retint_swapgs
CFI_ENDPROC
-
-error_kernelspace:
- incl %ebx
- /* There are two places in the kernel that can potentially fault with
- usergs. Handle them here. The exception handlers after
- iret run with kernel gs again, so don't set the user space flag.
- B stepping K8s sometimes report an truncated RIP for IRET
- exceptions returning to compat mode. Check for these here too. */
- leaq irq_return(%rip),%rcx
- cmpq %rcx,RIP(%rsp)
- je error_swapgs
- movl %ecx,%ecx /* zero extend */
- cmpq %rcx,RIP(%rsp)
- je error_swapgs
- cmpq $gs_change,RIP(%rsp)
- je error_swapgs
- jmp error_sti
-KPROBE_END(error_entry)
+KPROBE_END(error_exit)
/* Reload gs selector with exception handling */
/* edi: new selector */
* Roland McGrath <[email protected]> wrote:
> > but that's the exception. Most of the annotations could be
> > auto-generated.
>
> Not really. An implicit .cfi_undefined can be auto-generated for an
> unannotated instruction with an output register. An implicit
> .cfi_register can be auto-generated for an unannotated
> register-to-register move. An implicit .cfi_same_value can be
> auto-generated when you can tell a register is being written with
> the register or stack slot that the current CFI state says holds the
> caller's value of that register. Beyond that, it gets into either
> assumptions or hairy analysis of how stack slots are being used and
> so forth.
i don't buy that argument at all.
Yes, of course full "no changes to the current code at all" automation
is hard.
But _at minimum_ GAS should automate a large part of this and help us
out syntactically: make it really easy to human-annotate instructions
in a _minimal way_. Also, automate the easy bits while at it. Plus
warn about missing annotations - nesting errors, etc. etc.
there's a ton of easy things GAS could do here that it does not do.
> [...] Explicit (but simple) macros in the assembly are what I favor.
Yeah. This is the second-best option - but has some limitations. Still
it is much better than what we have now.
What _clearly_ sucks is the current mess of:
CFI_ADJUST_CFA_OFFSET 8
/*CFI_REL_OFFSET ss,0*/
pushq %rax /* rsp */
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rsp,0
pushq $(1<<9) /* eflags - interrupts on */
CFI_ADJUST_CFA_OFFSET 8
/*CFI_REL_OFFSET rflags,0*/
pushq $__KERNEL_CS /* cs */
CFI_ADJUST_CFA_OFFSET 8
/*CFI_REL_OFFSET cs,0*/
pushq \child_rip /* rip */
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rip,0
pushq %rax /* orig rax */
CFI_ADJUST_CFA_OFFSET 8
Compared to what we could have (stupid mockup):
pushq_cf1 %rax /* rsp */
pushq_cf1 $(1<<9) /* eflags - interrupts on */
pushq_cf1 $__KERNEL_CS /* cs */
pushq_cf2 \child_rip /* rip */
pushq_cf1 %rax /* orig rax */
Whoever claims that this cannot be automated in _large_ part isn't
thinking it through really. Those CFI annotations should never have
been added in this form.
Ingo
Alexander van Heukelum wrote:
> Hi all,
>
> Here is a combined patch that moves "save_args" out-of-line for
> the interrupt macro and moves "error_entry" mostly out-of-line
> for the zeroentry and errorentry macros.
>
> The save_args function becomes really straightforward and easy
> to understand, with the possible exception of the stack switch
> code, which now needs to copy the return address of the
> calling function. Normal interrupts arrive with ((~vector)+0x80)
> on the stack, which gets adjusted in common_interrupt:
>
> <common_interrupt>:
> (5) addq $0xffffffffffffff80,(%rsp) /* -> ~(vector) */
> (4) sub $0x50,%rsp /* space for registers */
> (5) callq ffffffff80211290 <save_args>
> (5) callq ffffffff80214290 <do_IRQ>
> <ret_from_intr>:
> ...
>
> An APIC interrupt stub now looks like this:
>
> <thermal_interrupt>:
> (5) pushq $0xffffffffffffff05 /* ~(vector) */
> (4) sub $0x50,%rsp /* space for registers */
> (5) callq ffffffff80211290 <save_args>
> (5) callq ffffffff80212b8f <smp_thermal_interrupt>
> (5) jmpq ffffffff80211f93 <ret_from_intr>
>
I'm away on a trip at the moment, so sorry for the delayed feedback.
First of all, if we're going to go through common code here, we should
do the vector number adjustment in save_args and be able to use the
short form of pushq in the common case.
What isn't clear to me is whether we should just push a target field to the
stack and then do an indirect call. That way we can do save_args and
ret_from_intr inline, but at the expense of an indirect call which will
not necessarily speculate cleanly.
All of this qualifies as "tweaking", which means we need to be very
careful so we don't end up burning more performance than we gain, but
all in all, I think this is a good idea.
-hpa
* Ingo Molnar <[email protected]> wrote:
> What _clearly_ sucks is the current mess of:
>
> CFI_ADJUST_CFA_OFFSET 8
> /*CFI_REL_OFFSET ss,0*/
> pushq %rax /* rsp */
> CFI_ADJUST_CFA_OFFSET 8
> CFI_REL_OFFSET rsp,0
> pushq $(1<<9) /* eflags - interrupts on */
> CFI_ADJUST_CFA_OFFSET 8
> /*CFI_REL_OFFSET rflags,0*/
> pushq $__KERNEL_CS /* cs */
> CFI_ADJUST_CFA_OFFSET 8
> /*CFI_REL_OFFSET cs,0*/
> pushq \child_rip /* rip */
> CFI_ADJUST_CFA_OFFSET 8
> CFI_REL_OFFSET rip,0
> pushq %rax /* orig rax */
> CFI_ADJUST_CFA_OFFSET 8
>
> Compared to what we could have (stupid mockup):
>
> pushq_cf1 %rax /* rsp */
> pushq_cf1 $(1<<9) /* eflags - interrupts on */
> pushq_cf1 $__KERNEL_CS /* cs */
> pushq_cf2 \child_rip /* rip */
> pushq_cf1 %rax /* orig rax */
>
> Whoever claims that this cannot be automated in _large_ part isn't
> thinking it through really. Those CFI annotations should never have
> been added in this form.
Something like this would be a much cleaner equivalent replacement:
PUSHQ %rax /* rsp */
PUSHQ $(1<<9) /* eflags - interrupts on */
PUSHQ $__KERNEL_CS /* cs */
PUSHQ \child_rip /* rip */
cfi_map rip, 0
PUSHQ %rax /* orig rax */
as most of the really annoying CFI annotations in entry_64.S that
obstruct code reading are just plain CFA offset modifications related
to stack shuffling.
[ Sidenote: trying to connect up RIP like that in the FAKE_STACK_FRAME
is pretty wrong to begin with - the annotation is incomplete up to
this point. ]
The problems are not caused by the prologue or epilogue annotations,
nor by any of the trickier stack shuffling annotations we do around
syscall/sysret and around exception frames. A lot of the frame formats
we use are special, controlled by hw details and we do have to map
those details to the debuginfo - it's an inevitably manual piece of
work.
It's the plain crappy:
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
annotation spam that hurts readability the most. The "+8" and "-8"
CFA-offset lines are completely uninformative and they obstruct the
reading of this already very tricky type of source code (assembly
language).
It should be something like this:
PUSHQ %rdi
call schedule
POPQ %rdi
instead.
Ingo
* H. Peter Anvin <[email protected]> wrote:
> All of this qualifies as "tweaking", which means we need to be very
> careful so we don't end up burning more performance than we gain,
> but all in all, I think this is a good idea.
okay - will queue it up in tip/x86/irq, lets see how it goes.
Ingo
This add-on patch to x86: move entry_64.S register saving out
of the macros visually cleans up the appearance of the code by
introducing some basic helper macros. It also adds some cfi
annotations which were missing.
Signed-off-by: Alexander van Heukelum <[email protected]>
---
arch/x86/kernel/entry_64.S | 220 ++++++++++++++++++++++----------------------
1 files changed, 112 insertions(+), 108 deletions(-)
Hello Ingo,
This patch improves the CFI-situation in entry_64.S, but restricted
mostly to the areas touched by "x86: move entry_64.S register saving
out of the macros". I'm sure there will be some small errors somewhere,
but it compiles and runs fine.
Greetings,
Alexander
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5a12432..4a3a3f4 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -60,6 +60,23 @@
#define __AUDIT_ARCH_LE 0x40000000
.code64
+/*
+ * Some macro's to hide the most frequently occuring CFI annotations.
+ */
+ .macro cfi_pushq reg
+ pushq \reg
+ CFI_ADJUST_CFA_OFFSET 8
+ .endm
+
+ .macro cfi_popq reg
+ popq \reg
+ CFI_ADJUST_CFA_OFFSET -8
+ .endm
+
+ .macro cfi_store reg offset=0
+ movq %\reg, \offset(%rsp)
+ CFI_REL_OFFSET \reg, \offset
+ .endm
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -213,84 +230,84 @@ ENTRY(native_usergs_sysret64)
CFI_ADJUST_CFA_OFFSET -(6*8)
.endm
- .macro CFI_DEFAULT_STACK start=1
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+ .macro EMPTY_FRAME start=1 offset=0
.if \start
- CFI_STARTPROC simple
+ CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8
+ CFI_DEF_CFA rsp,8+\offset
.else
- CFI_DEF_CFA_OFFSET SS+8
+ CFI_DEF_CFA_OFFSET 8+\offset
.endif
- CFI_REL_OFFSET r15,R15
- CFI_REL_OFFSET r14,R14
- CFI_REL_OFFSET r13,R13
- CFI_REL_OFFSET r12,R12
- CFI_REL_OFFSET rbp,RBP
- CFI_REL_OFFSET rbx,RBX
- CFI_REL_OFFSET r11,R11
- CFI_REL_OFFSET r10,R10
- CFI_REL_OFFSET r9,R9
- CFI_REL_OFFSET r8,R8
- CFI_REL_OFFSET rax,RAX
- CFI_REL_OFFSET rcx,RCX
- CFI_REL_OFFSET rdx,RDX
- CFI_REL_OFFSET rsi,RSI
- CFI_REL_OFFSET rdi,RDI
- CFI_REL_OFFSET rip,RIP
- /*CFI_REL_OFFSET cs,CS*/
- /*CFI_REL_OFFSET rflags,EFLAGS*/
- CFI_REL_OFFSET rsp,RSP
- /*CFI_REL_OFFSET ss,SS*/
.endm
/*
- * initial frame state for interrupts and exceptions
+ * initial frame state for interrupts (and exceptions without error code)
*/
- .macro _frame ref
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-\ref
- /*CFI_REL_OFFSET ss,SS-\ref*/
- CFI_REL_OFFSET rsp,RSP-\ref
- /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
- /*CFI_REL_OFFSET cs,CS-\ref*/
- CFI_REL_OFFSET rip,RIP-\ref
+ .macro INTR_FRAME start=1 offset=0
+ EMPTY_FRAME \start, (SS+8-RIP)+\offset
+ /*CFI_REL_OFFSET ss, SS-RIP+\offset*/
+ CFI_REL_OFFSET rsp, RSP-RIP+\offset
+ /*CFI_REL_OFFSET rflags, EFLAGS-RIP+\offset*/
+ /*CFI_REL_OFFSET cs, CS-RIP+\offset*/
+ CFI_REL_OFFSET rip, RIP-RIP+\offset
.endm
/*
- * initial frame state for interrupts (and exceptions without error code)
- */
-#define INTR_FRAME _frame RIP
-/*
* initial frame state for exceptions with error code (and interrupts
* with vector already pushed)
*/
-#define XCPT_FRAME _frame ORIG_RAX
+ .macro XCPT_FRAME start=1 offset=0
+ INTR_FRAME \start, (RIP-ORIG_RAX)+\offset
+ /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
+ .endm
+
+/*
+ * frame that enables calling into C.
+ */
+ .macro PARTIAL_FRAME start=1 offset=0
+ XCPT_FRAME \start, (ORIG_RAX-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rdi, (RDI-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rsi, (RSI-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rdx, (RDX-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rcx, (RCX-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rax, (RAX-ARGOFFSET)+\offset
+ CFI_REL_OFFSET r8, (R8-ARGOFFSET)+\offset
+ CFI_REL_OFFSET r9, (R9-ARGOFFSET)+\offset
+ CFI_REL_OFFSET r10, (R10-ARGOFFSET)+\offset
+ CFI_REL_OFFSET r11, (R11-ARGOFFSET)+\offset
+ .endm
+
+/*
+ * frame that enables passing a complete pt_regs to a C function.
+ */
+ .macro DEFAULT_FRAME start=1 offset=0
+ PARTIAL_FRAME \start, (R11-R15)+\offset
+ CFI_REL_OFFSET rbx, RBX+\offset
+ CFI_REL_OFFSET rbp, RBP+\offset
+ CFI_REL_OFFSET r12, R12+\offset
+ CFI_REL_OFFSET r13, R13+\offset
+ CFI_REL_OFFSET r14, R14+\offset
+ CFI_REL_OFFSET r15, R15+\offset
+ .endm
/* save partial stack frame */
ENTRY(save_args)
XCPT_FRAME
cld
- movq %rdi, 8*8+16(%rsp)
- CFI_REL_OFFSET rdi, 8*8+16
- movq %rsi, 7*8+16(%rsp)
- CFI_REL_OFFSET rsi, 7*8+16
- movq %rdx, 6*8+16(%rsp)
- CFI_REL_OFFSET rdx, 6*8+16
- movq %rcx, 5*8+16(%rsp)
- CFI_REL_OFFSET rcx, 5*8+16
- movq %rax, 4*8+16(%rsp)
- CFI_REL_OFFSET rax, 4*8+16
- movq %r8, 3*8+16(%rsp)
- CFI_REL_OFFSET r8, 3*8+16
- movq %r9, 2*8+16(%rsp)
- CFI_REL_OFFSET r9, 2*8+16
- movq %r10, 1*8+16(%rsp)
- CFI_REL_OFFSET r10, 1*8+16
- movq %r11, 0*8+16(%rsp)
- CFI_REL_OFFSET r11, 0*8+16
+ cfi_store rdi, (RDI-ARGOFFSET)+16
+ cfi_store rsi, (RSI-ARGOFFSET)+16
+ cfi_store rdx, (RDX-ARGOFFSET)+16
+ cfi_store rcx, (RCX-ARGOFFSET)+16
+ cfi_store rax, (RAX-ARGOFFSET)+16
+ cfi_store r8, (R8-ARGOFFSET)+16
+ cfi_store r9, (R9-ARGOFFSET)+16
+ cfi_store r10, (R10-ARGOFFSET)+16
+ cfi_store r11, (R11-ARGOFFSET)+16
leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
- movq %rbp, 8(%rsp) /* push %rbp */
+ cfi_store rbp, 8 /* push %rbp */
leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
testl $3, CS(%rdi)
je 1f
@@ -303,9 +320,10 @@ ENTRY(save_args)
*/
1: incl %gs:pda_irqcount
jne 2f
- pop %rax /* move return address... */
+ cfi_popq %rax /* move return address... */
mov %gs:pda_irqstackptr,%rsp
- push %rax /* ... to the new stack */
+ EMPTY_FRAME 0
+ cfi_pushq %rax /* ... to the new stack */
/*
* We entered an interrupt context - irqs are off:
*/
@@ -319,7 +337,7 @@ END(save_args)
*/
/* rdi: prev */
ENTRY(ret_from_fork)
- CFI_DEFAULT_STACK
+ DEFAULT_FRAME
push kernel_eflags(%rip)
CFI_ADJUST_CFA_OFFSET 8
popf # reset kernel eflags
@@ -732,6 +750,7 @@ END(interrupt)
subq $10*8, %rsp
CFI_ADJUST_CFA_OFFSET 10*8
call save_args
+ PARTIAL_FRAME 0
call \func
.endm
@@ -949,11 +968,11 @@ END(spurious_interrupt)
.macro zeroentry sym
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
+ cfi_pushq $-1 /* ORIG_RAX: no syscall to restart */
subq $15*8,%rsp
CFI_ADJUST_CFA_OFFSET 15*8
call error_entry
+ DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
call \sym
@@ -967,6 +986,7 @@ END(spurious_interrupt)
subq $15*8,%rsp
CFI_ADJUST_CFA_OFFSET 15*8
call error_entry
+ DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
@@ -1079,40 +1099,25 @@ paranoid_schedule\trace:
* returns in "no swapgs flag" in %ebx.
*/
KPROBE_ENTRY(error_entry)
- _frame RDI
+ XCPT_FRAME
CFI_ADJUST_CFA_OFFSET 15*8
/* oldrax contains error code */
cld
- movq %rdi,14*8+8(%rsp)
- CFI_REL_OFFSET rdi,RDI+8
- movq %rsi,13*8+8(%rsp)
- CFI_REL_OFFSET rsi,RSI+8
- movq %rdx,12*8+8(%rsp)
- CFI_REL_OFFSET rdx,RDX+8
- movq %rcx,11*8+8(%rsp)
- CFI_REL_OFFSET rcx,RCX+8
- movq %rax,10*8+8(%rsp)
- CFI_REL_OFFSET rax,RAX+8
- movq %r8, 9*8+8(%rsp)
- CFI_REL_OFFSET r8,R8+8
- movq %r9, 8*8+8(%rsp)
- CFI_REL_OFFSET r9,R9+8
- movq %r10,7*8+8(%rsp)
- CFI_REL_OFFSET r10,R10+8
- movq %r11,6*8+8(%rsp)
- CFI_REL_OFFSET r11,R11+8
- movq %rbx,5*8+8(%rsp)
- CFI_REL_OFFSET rbx,RBX+8
- movq %rbp,4*8+8(%rsp)
- CFI_REL_OFFSET rbp,RBP+8
- movq %r12,3*8+8(%rsp)
- CFI_REL_OFFSET r12,R12+8
- movq %r13,2*8+8(%rsp)
- CFI_REL_OFFSET r13,R13+8
- movq %r14,1*8+8(%rsp)
- CFI_REL_OFFSET r14,R14+8
- movq %r15,0*8+8(%rsp)
- CFI_REL_OFFSET r15,R15+8
+ cfi_store rdi, RDI+8
+ cfi_store rsi, RSI+8
+ cfi_store rdx, RDX+8
+ cfi_store rcx, RCX+8
+ cfi_store rax, RAX+8
+ cfi_store r8, R8+8
+ cfi_store r9, R9+8
+ cfi_store r10, R10+8
+ cfi_store r11, R11+8
+ cfi_store rbx, RBX+8
+ cfi_store rbp, RBP+8
+ cfi_store r12, R12+8
+ cfi_store r13, R13+8
+ cfi_store r14, R14+8
+ cfi_store r15, R15+8
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1146,7 +1151,7 @@ KPROBE_END(error_entry)
/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
KPROBE_ENTRY(error_exit)
- _frame R15
+ DEFAULT_FRAME
movl %ebx,%eax
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1455,7 +1460,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
see the correct pointer to the pt_regs */
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
- CFI_DEFAULT_STACK
+ DEFAULT_FRAME
11: incl %gs:pda_irqcount
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
@@ -1483,10 +1488,13 @@ END(do_hypervisor_callback)
# with its current contents: any discrepancy means we in category 1.
*/
ENTRY(xen_failsafe_callback)
- framesz = (RIP-0x30) /* workaround buggy gas */
- _frame framesz
- CFI_REL_OFFSET rcx, 0
- CFI_REL_OFFSET r11, 8
+ INTR_FRAME 1 (6*8)
+ /*CFI_REL_OFFSET gs,GS*/
+ /*CFI_REL_OFFSET fs,FS*/
+ /*CFI_REL_OFFSET es,ES*/
+ /*CFI_REL_OFFSET ds,DS*/
+ CFI_REL_OFFSET r11,8
+ CFI_REL_OFFSET rcx,0
movw %ds,%cx
cmpw %cx,0x10(%rsp)
CFI_REMEMBER_STATE
@@ -1507,12 +1515,9 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- pushq %rcx
- CFI_ADJUST_CFA_OFFSET 8
+ cfi_pushq $0 /* RIP */
+ cfi_pushq %r11
+ cfi_pushq %rcx
jmp general_protection
CFI_RESTORE_STATE
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@ -1522,8 +1527,7 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
+ cfi_pushq $0
SAVE_ALL
jmp error_exit
CFI_ENDPROC
--
1.5.4.3
On Thu, Nov 20, 2008 at 02:40:11PM +0100, Alexander van Heukelum wrote:
> This add-on patch to x86: move entry_64.S register saving out
> of the macros visually cleans up the appearance of the code by
> introducing some basic helper macros.
So it's not standard assembler anymore. Welcome, objdump -S!
-Andi
* Alexander van Heukelum <[email protected]> wrote:
> This add-on patch to x86: move entry_64.S register saving out of the
> macros visually cleans up the appearance of the code by introducing
> some basic helper macros. It also adds some cfi annotations which
> were missing.
>
> Signed-off-by: Alexander van Heukelum <[email protected]>
> ---
> arch/x86/kernel/entry_64.S | 220 ++++++++++++++++++++++----------------------
> 1 files changed, 112 insertions(+), 108 deletions(-)
>
> Hello Ingo,
>
> This patch improves the CFI-situation in entry_64.S, but restricted
> mostly to the areas touched by "x86: move entry_64.S register saving
> out of the macros". I'm sure there will be some small errors
> somewhere, but it compiles and runs fine.
very nice cleanup! This is exactly what should be done. Applied to
tip/x86/irq.
Note, i did a small rename:
cfi_pushq => pushq_cfi
cfi_popq => popq_cfi
cfi_store => movq_cfi
as the goal is to have the actual source code read mostly as regular
assembly code. The fact that the macro is equivalent to a
default-annotated pushq/popq/movq instruction is much more important
than the fact that it also does CFI annotations.
Also, while cfi_store is correct as well, the usual x86 assembly term
(and instruction used here) is movq.
Find below how the commit ended up looking.
Thanks,
Ingo
----------------->
From 250f351f57022f125c19178af1b0335503f1a02d Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <[email protected]>
Date: Thu, 20 Nov 2008 14:40:11 +0100
Subject: [PATCH] x86: clean up after: move entry_64.S register saving out of the macros
This add-on patch to x86: move entry_64.S register saving out
of the macros visually cleans up the appearance of the code by
introducing some basic helper macros. It also adds some cfi
annotations which were missing.
Signed-off-by: Alexander van Heukelum <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/entry_64.S | 220 ++++++++++++++++++++++----------------------
1 files changed, 112 insertions(+), 108 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5a12432..6df9d31 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -60,6 +60,23 @@
#define __AUDIT_ARCH_LE 0x40000000
.code64
+/*
+ * Some macro's to hide the most frequently occuring CFI annotations.
+ */
+ .macro pushq_cfi reg
+ pushq \reg
+ CFI_ADJUST_CFA_OFFSET 8
+ .endm
+
+ .macro popq_cfi reg
+ popq \reg
+ CFI_ADJUST_CFA_OFFSET -8
+ .endm
+
+ .macro movq_cfi reg offset=0
+ movq %\reg, \offset(%rsp)
+ CFI_REL_OFFSET \reg, \offset
+ .endm
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -213,84 +230,84 @@ ENTRY(native_usergs_sysret64)
CFI_ADJUST_CFA_OFFSET -(6*8)
.endm
- .macro CFI_DEFAULT_STACK start=1
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+ .macro EMPTY_FRAME start=1 offset=0
.if \start
- CFI_STARTPROC simple
+ CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8
+ CFI_DEF_CFA rsp,8+\offset
.else
- CFI_DEF_CFA_OFFSET SS+8
+ CFI_DEF_CFA_OFFSET 8+\offset
.endif
- CFI_REL_OFFSET r15,R15
- CFI_REL_OFFSET r14,R14
- CFI_REL_OFFSET r13,R13
- CFI_REL_OFFSET r12,R12
- CFI_REL_OFFSET rbp,RBP
- CFI_REL_OFFSET rbx,RBX
- CFI_REL_OFFSET r11,R11
- CFI_REL_OFFSET r10,R10
- CFI_REL_OFFSET r9,R9
- CFI_REL_OFFSET r8,R8
- CFI_REL_OFFSET rax,RAX
- CFI_REL_OFFSET rcx,RCX
- CFI_REL_OFFSET rdx,RDX
- CFI_REL_OFFSET rsi,RSI
- CFI_REL_OFFSET rdi,RDI
- CFI_REL_OFFSET rip,RIP
- /*CFI_REL_OFFSET cs,CS*/
- /*CFI_REL_OFFSET rflags,EFLAGS*/
- CFI_REL_OFFSET rsp,RSP
- /*CFI_REL_OFFSET ss,SS*/
.endm
/*
- * initial frame state for interrupts and exceptions
+ * initial frame state for interrupts (and exceptions without error code)
*/
- .macro _frame ref
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-\ref
- /*CFI_REL_OFFSET ss,SS-\ref*/
- CFI_REL_OFFSET rsp,RSP-\ref
- /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
- /*CFI_REL_OFFSET cs,CS-\ref*/
- CFI_REL_OFFSET rip,RIP-\ref
+ .macro INTR_FRAME start=1 offset=0
+ EMPTY_FRAME \start, (SS+8-RIP)+\offset
+ /*CFI_REL_OFFSET ss, SS-RIP+\offset*/
+ CFI_REL_OFFSET rsp, RSP-RIP+\offset
+ /*CFI_REL_OFFSET rflags, EFLAGS-RIP+\offset*/
+ /*CFI_REL_OFFSET cs, CS-RIP+\offset*/
+ CFI_REL_OFFSET rip, RIP-RIP+\offset
.endm
/*
- * initial frame state for interrupts (and exceptions without error code)
- */
-#define INTR_FRAME _frame RIP
-/*
* initial frame state for exceptions with error code (and interrupts
* with vector already pushed)
*/
-#define XCPT_FRAME _frame ORIG_RAX
+ .macro XCPT_FRAME start=1 offset=0
+ INTR_FRAME \start, (RIP-ORIG_RAX)+\offset
+ /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
+ .endm
+
+/*
+ * frame that enables calling into C.
+ */
+ .macro PARTIAL_FRAME start=1 offset=0
+ XCPT_FRAME \start, (ORIG_RAX-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rdi, (RDI-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rsi, (RSI-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rdx, (RDX-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rcx, (RCX-ARGOFFSET)+\offset
+ CFI_REL_OFFSET rax, (RAX-ARGOFFSET)+\offset
+ CFI_REL_OFFSET r8, (R8-ARGOFFSET)+\offset
+ CFI_REL_OFFSET r9, (R9-ARGOFFSET)+\offset
+ CFI_REL_OFFSET r10, (R10-ARGOFFSET)+\offset
+ CFI_REL_OFFSET r11, (R11-ARGOFFSET)+\offset
+ .endm
+
+/*
+ * frame that enables passing a complete pt_regs to a C function.
+ */
+ .macro DEFAULT_FRAME start=1 offset=0
+ PARTIAL_FRAME \start, (R11-R15)+\offset
+ CFI_REL_OFFSET rbx, RBX+\offset
+ CFI_REL_OFFSET rbp, RBP+\offset
+ CFI_REL_OFFSET r12, R12+\offset
+ CFI_REL_OFFSET r13, R13+\offset
+ CFI_REL_OFFSET r14, R14+\offset
+ CFI_REL_OFFSET r15, R15+\offset
+ .endm
/* save partial stack frame */
ENTRY(save_args)
XCPT_FRAME
cld
- movq %rdi, 8*8+16(%rsp)
- CFI_REL_OFFSET rdi, 8*8+16
- movq %rsi, 7*8+16(%rsp)
- CFI_REL_OFFSET rsi, 7*8+16
- movq %rdx, 6*8+16(%rsp)
- CFI_REL_OFFSET rdx, 6*8+16
- movq %rcx, 5*8+16(%rsp)
- CFI_REL_OFFSET rcx, 5*8+16
- movq %rax, 4*8+16(%rsp)
- CFI_REL_OFFSET rax, 4*8+16
- movq %r8, 3*8+16(%rsp)
- CFI_REL_OFFSET r8, 3*8+16
- movq %r9, 2*8+16(%rsp)
- CFI_REL_OFFSET r9, 2*8+16
- movq %r10, 1*8+16(%rsp)
- CFI_REL_OFFSET r10, 1*8+16
- movq %r11, 0*8+16(%rsp)
- CFI_REL_OFFSET r11, 0*8+16
+ movq_cfi rdi, (RDI-ARGOFFSET)+16
+ movq_cfi rsi, (RSI-ARGOFFSET)+16
+ movq_cfi rdx, (RDX-ARGOFFSET)+16
+ movq_cfi rcx, (RCX-ARGOFFSET)+16
+ movq_cfi rax, (RAX-ARGOFFSET)+16
+ movq_cfi r8, (R8-ARGOFFSET)+16
+ movq_cfi r9, (R9-ARGOFFSET)+16
+ movq_cfi r10, (R10-ARGOFFSET)+16
+ movq_cfi r11, (R11-ARGOFFSET)+16
leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
- movq %rbp, 8(%rsp) /* push %rbp */
+ movq_cfi rbp, 8 /* push %rbp */
leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
testl $3, CS(%rdi)
je 1f
@@ -303,9 +320,10 @@ ENTRY(save_args)
*/
1: incl %gs:pda_irqcount
jne 2f
- pop %rax /* move return address... */
+ popq_cfi %rax /* move return address... */
mov %gs:pda_irqstackptr,%rsp
- push %rax /* ... to the new stack */
+ EMPTY_FRAME 0
+ pushq_cfi %rax /* ... to the new stack */
/*
* We entered an interrupt context - irqs are off:
*/
@@ -319,7 +337,7 @@ END(save_args)
*/
/* rdi: prev */
ENTRY(ret_from_fork)
- CFI_DEFAULT_STACK
+ DEFAULT_FRAME
push kernel_eflags(%rip)
CFI_ADJUST_CFA_OFFSET 8
popf # reset kernel eflags
@@ -732,6 +750,7 @@ END(interrupt)
subq $10*8, %rsp
CFI_ADJUST_CFA_OFFSET 10*8
call save_args
+ PARTIAL_FRAME 0
call \func
.endm
@@ -949,11 +968,11 @@ END(spurious_interrupt)
.macro zeroentry sym
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $15*8,%rsp
CFI_ADJUST_CFA_OFFSET 15*8
call error_entry
+ DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
call \sym
@@ -967,6 +986,7 @@ END(spurious_interrupt)
subq $15*8,%rsp
CFI_ADJUST_CFA_OFFSET 15*8
call error_entry
+ DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
@@ -1079,40 +1099,25 @@ paranoid_schedule\trace:
* returns in "no swapgs flag" in %ebx.
*/
KPROBE_ENTRY(error_entry)
- _frame RDI
+ XCPT_FRAME
CFI_ADJUST_CFA_OFFSET 15*8
/* oldrax contains error code */
cld
- movq %rdi,14*8+8(%rsp)
- CFI_REL_OFFSET rdi,RDI+8
- movq %rsi,13*8+8(%rsp)
- CFI_REL_OFFSET rsi,RSI+8
- movq %rdx,12*8+8(%rsp)
- CFI_REL_OFFSET rdx,RDX+8
- movq %rcx,11*8+8(%rsp)
- CFI_REL_OFFSET rcx,RCX+8
- movq %rax,10*8+8(%rsp)
- CFI_REL_OFFSET rax,RAX+8
- movq %r8, 9*8+8(%rsp)
- CFI_REL_OFFSET r8,R8+8
- movq %r9, 8*8+8(%rsp)
- CFI_REL_OFFSET r9,R9+8
- movq %r10,7*8+8(%rsp)
- CFI_REL_OFFSET r10,R10+8
- movq %r11,6*8+8(%rsp)
- CFI_REL_OFFSET r11,R11+8
- movq %rbx,5*8+8(%rsp)
- CFI_REL_OFFSET rbx,RBX+8
- movq %rbp,4*8+8(%rsp)
- CFI_REL_OFFSET rbp,RBP+8
- movq %r12,3*8+8(%rsp)
- CFI_REL_OFFSET r12,R12+8
- movq %r13,2*8+8(%rsp)
- CFI_REL_OFFSET r13,R13+8
- movq %r14,1*8+8(%rsp)
- CFI_REL_OFFSET r14,R14+8
- movq %r15,0*8+8(%rsp)
- CFI_REL_OFFSET r15,R15+8
+ movq_cfi rdi, RDI+8
+ movq_cfi rsi, RSI+8
+ movq_cfi rdx, RDX+8
+ movq_cfi rcx, RCX+8
+ movq_cfi rax, RAX+8
+ movq_cfi r8, R8+8
+ movq_cfi r9, R9+8
+ movq_cfi r10, R10+8
+ movq_cfi r11, R11+8
+ movq_cfi rbx, RBX+8
+ movq_cfi rbp, RBP+8
+ movq_cfi r12, R12+8
+ movq_cfi r13, R13+8
+ movq_cfi r14, R14+8
+ movq_cfi r15, R15+8
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1146,7 +1151,7 @@ KPROBE_END(error_entry)
/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
KPROBE_ENTRY(error_exit)
- _frame R15
+ DEFAULT_FRAME
movl %ebx,%eax
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1455,7 +1460,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
see the correct pointer to the pt_regs */
movq %rdi, %rsp # we don't return, adjust the stack frame
CFI_ENDPROC
- CFI_DEFAULT_STACK
+ DEFAULT_FRAME
11: incl %gs:pda_irqcount
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
@@ -1483,10 +1488,13 @@ END(do_hypervisor_callback)
# with its current contents: any discrepancy means we in category 1.
*/
ENTRY(xen_failsafe_callback)
- framesz = (RIP-0x30) /* workaround buggy gas */
- _frame framesz
- CFI_REL_OFFSET rcx, 0
- CFI_REL_OFFSET r11, 8
+ INTR_FRAME 1 (6*8)
+ /*CFI_REL_OFFSET gs,GS*/
+ /*CFI_REL_OFFSET fs,FS*/
+ /*CFI_REL_OFFSET es,ES*/
+ /*CFI_REL_OFFSET ds,DS*/
+ CFI_REL_OFFSET r11,8
+ CFI_REL_OFFSET rcx,0
movw %ds,%cx
cmpw %cx,0x10(%rsp)
CFI_REMEMBER_STATE
@@ -1507,12 +1515,9 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- pushq %rcx
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $0 /* RIP */
+ pushq_cfi %r11
+ pushq_cfi %rcx
jmp general_protection
CFI_RESTORE_STATE
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@ -1522,8 +1527,7 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
+ pushq_cfi $0
SAVE_ALL
jmp error_exit
CFI_ENDPROC
On Thu, Nov 20, 2008 at 04:04:12PM +0100, Ingo Molnar wrote:
>
> * Alexander van Heukelum <[email protected]> wrote:
>
> > This add-on patch to x86: move entry_64.S register saving out of the
> > macros visually cleans up the appearance of the code by introducing
> > some basic helper macros. It also adds some cfi annotations which
> > were missing.
> >
> > Signed-off-by: Alexander van Heukelum <[email protected]>
> > ---
> > arch/x86/kernel/entry_64.S | 220 ++++++++++++++++++++++----------------------
> > 1 files changed, 112 insertions(+), 108 deletions(-)
> >
> > Hello Ingo,
> >
> > This patch improves the CFI-situation in entry_64.S, but restricted
> > mostly to the areas touched by "x86: move entry_64.S register saving
> > out of the macros". I'm sure there will be some small errors
> > somewhere, but it compiles and runs fine.
>
> very nice cleanup! This is exactly what should be done. Applied to
> tip/x86/irq.
>
> Note, i did a small rename:
>
> cfi_pushq => pushq_cfi
> cfi_popq => popq_cfi
> cfi_store => movq_cfi
>
> as the goal is to have the actual source code read mostly as regular
> assembly code. The fact that the macro is equivalent to a
> default-annotated pushq/popq/movq instruction is much more important
> than the fact that it also does CFI annotations.
>
> Also, while cfi_store is correct as well, the usual x86 assembly term
> (and instruction used here) is movq.
Now I have a little problem with my next patch... I wanted to
introduce cfi_load. Guess what assembly instruction that maps
to ;).
Greetings,
Alexander
> Find below how the commit ended up looking.
>
> Thanks,
>
> Ingo
>
> ----------------->
--
Alexander van Heukelum
* Alexander van Heukelum <[email protected]> wrote:
> On Thu, Nov 20, 2008 at 04:04:12PM +0100, Ingo Molnar wrote:
> >
> > * Alexander van Heukelum <[email protected]> wrote:
> >
> > > This add-on patch to x86: move entry_64.S register saving out of the
> > > macros visually cleans up the appearance of the code by introducing
> > > some basic helper macros. It also adds some cfi annotations which
> > > were missing.
> > >
> > > Signed-off-by: Alexander van Heukelum <[email protected]>
> > > ---
> > > arch/x86/kernel/entry_64.S | 220 ++++++++++++++++++++++----------------------
> > > 1 files changed, 112 insertions(+), 108 deletions(-)
> > >
> > > Hello Ingo,
> > >
> > > This patch improves the CFI-situation in entry_64.S, but restricted
> > > mostly to the areas touched by "x86: move entry_64.S register saving
> > > out of the macros". I'm sure there will be some small errors
> > > somewhere, but it compiles and runs fine.
> >
> > very nice cleanup! This is exactly what should be done. Applied to
> > tip/x86/irq.
> >
> > Note, i did a small rename:
> >
> > cfi_pushq => pushq_cfi
> > cfi_popq => popq_cfi
> > cfi_store => movq_cfi
> >
> > as the goal is to have the actual source code read mostly as regular
> > assembly code. The fact that the macro is equivalent to a
> > default-annotated pushq/popq/movq instruction is much more important
> > than the fact that it also does CFI annotations.
> >
> > Also, while cfi_store is correct as well, the usual x86 assembly term
> > (and instruction used here) is movq.
>
> Now I have a little problem with my next patch... I wanted to
> introduce cfi_load. Guess what assembly instruction that maps to ;).
heh ;-)
the restore direction could be named movq_cfi_restore, and have the same
order of arguments as the regular movq that it replaces. I.e.:
movq 8(%rsp),%r11
CFI_RESTORE r11
would map to:
movq_cfi_restore 8, r11
or so.
cfi_store really has a bad name: it's confusing whether it's the CFI
info we are storing/registering (which we are), or a 'store' instruction
(which this is too).
If so, then we should name it movq_cfi_store or movq_cfi_register - but
that's too long.
movq_cfi for the frame construction direction and movq_cfi_restore for
the frame deconstruction phase sounds like a good naming compromise, hm?
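In sketch form the pair would then be:

.macro movq_cfi reg offset=0
movq %\reg, \offset(%rsp)
CFI_REL_OFFSET \reg, \offset
.endm

.macro movq_cfi_restore offset reg
movq \offset(%rsp), %\reg
CFI_RESTORE \reg
.endm

(the first being the existing macro under its new name, the second the
proposed restore direction.)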
Ingo
>>> Ingo Molnar <[email protected]> 20.11.08 16:39 >>>
>movq_cfi for the frame construction direction and movq_cfi_restore for
>the frame deconstruction phase sounds like a good naming compromise, hm?
How about movq_spill and movq_fill (loosely following some ia64 terminology)?
Jan
On Thu, Nov 20, 2008 at 04:39:54PM +0100, Ingo Molnar wrote:
>
> * Alexander van Heukelum <[email protected]> wrote:
>
> > On Thu, Nov 20, 2008 at 04:04:12PM +0100, Ingo Molnar wrote:
> > >
> > > * Alexander van Heukelum <[email protected]> wrote:
> > >
> > > > This add-on patch to x86: move entry_64.S register saving out of the
> > > > macros visually cleans up the appearance of the code by introducing
> > > > some basic helper macros. It also adds some cfi annotations which
> > > > were missing.
> > > >
> > > > Signed-off-by: Alexander van Heukelum <[email protected]>
> > > > ---
> > > > arch/x86/kernel/entry_64.S | 220 ++++++++++++++++++++++----------------------
> > > > 1 files changed, 112 insertions(+), 108 deletions(-)
> > > >
> > > > Hello Ingo,
> > > >
> > > > This patch improves the CFI-situation in entry_64.S, but restricted
> > > > mostly to the areas touched by "x86: move entry_64.S register saving
> > > > out of the macros". I'm sure there will be some small errors
> > > > somewhere, but it compiles and runs fine.
> > >
> > > very nice cleanup! This is exactly what should be done. Applied to
> > > tip/x86/irq.
> > >
> > > Note, i did a small rename:
> > >
> > > cfi_pushq => pushq_cfi
> > > cfi_popq => popq_cfi
> > > cfi_store => movq_cfi
Does not work... But if you are attached to the underscores, I
think we can force it to work by using CPP to convert it to
something the assembler does parse right:
#define pushq_cfi pushq.cfi
etc?
Or is that just too ugly?
Alexander
> > > as the goal is to have the actual source code read mostly as regular
> > > assembly code. The fact that the macro is equivalent to a
> > > default-annotated pushq/popq/movq instruction is much more important
> > > than the fact that it also does CFI annotations.
> > >
> > > Also, while cfi_store is correct as well, the usual x86 assembly term
> > > (and instruction used here) is movq.
> >
> > Now I have a little problem with my next patch... I wanted to
> > introduce cfi_load. Guess what assembly instruction that maps to ;).
>
> heh ;-)
>
> the restore direction could be named movq_cfi_restore, and have the same
> order of arguments as the regular movq that it replaces. I.e.:
>
> movq 8(%rsp),%r11
> CFI_RESTORE r11
>
> would map to:
>
> movq_cfi_restore 8, r11
>
> or so.
>
> cfi_store really has a bad name: it's confusing whether it's the CFI
> info we are storing/registering (which we are), or a 'store' instruction
> (which this is too).
>
> If so, then we should name it movq_cfi_store or movq_cfi_register - but
> that's too long.
>
> movq_cfi for the frame construction direction and movq_cfi_restore for
> the frame deconstruction phase sounds like a good naming compromise, hm?
>
> Ingo
--
Alexander van Heukelum
[Alexander van Heukelum - Thu, Nov 20, 2008 at 04:57:44PM +0100]
| On Thu, Nov 20, 2008 at 04:39:54PM +0100, Ingo Molnar wrote:
| >
| > * Alexander van Heukelum <[email protected]> wrote:
| >
| > > On Thu, Nov 20, 2008 at 04:04:12PM +0100, Ingo Molnar wrote:
| > > >
| > > > * Alexander van Heukelum <[email protected]> wrote:
| > > >
| > > > > This add-on patch to x86: move entry_64.S register saving out of the
| > > > > macros visually cleans up the appearance of the code by introducing
| > > > > some basic helper macros. It also adds some cfi annotations which
| > > > > were missing.
| > > > >
| > > > > Signed-off-by: Alexander van Heukelum <[email protected]>
| > > > > ---
| > > > > arch/x86/kernel/entry_64.S | 220 ++++++++++++++++++++++----------------------
| > > > > 1 files changed, 112 insertions(+), 108 deletions(-)
| > > > >
| > > > > Hello Ingo,
| > > > >
| > > > > This patch improves the CFI-situation in entry_64.S, but restricted
| > > > > mostly to the areas touched by "x86: move entry_64.S register saving
| > > > > out of the macros". I'm sure there will be some small errors
| > > > > somewhere, but it compiles and runs fine.
| > > >
| > > > very nice cleanup! This is exactly what should be done. Applied to
| > > > tip/x86/irq.
| > > >
| > > > Note, i did a small rename:
| > > >
| > > > cfi_pushq => pushq_cfi
| > > > cfi_popq => popq_cfi
| > > > cfi_store => movq_cfi
|
| Does not work... But if you are attached to the underscores, I
| think we can force it to work by using CPP to convert it to
| something the assembler does parse right:
|
| #define pushq_cfi pushq.cfi
|
| etc?
|
| Or is that just too ugly?
|
| Alexander
|
| > > > as the goal is to have the actual source code read mostly as regular
| > > > assembly code. The fact that the macro is equivalent to a
| > > > default-annotated pushq/popq/movq instruction is much more important
| > > > than the fact that it also does CFI annotations.
| > > >
| > > > Also, while cfi_store is correct as well, the usual x86 assembly term
| > > > (and instruction used here) is movq.
| > >
| > > Now I have a little problem with my next patch... I wanted to
| > > introduce cfi_load. Guess what assembly instruction that maps to ;).
| >
| > heh ;-)
| >
| > the restore direction could be named movq_cfi_restore, and have the same
| > order of arguments as the regular movq that it replaces. I.e.:
| >
| > movq 8(%rsp),%r11
| > CFI_RESTORE r11
| >
| > would map to:
| >
| > movq_cfi_restore 8, r11
| >
| > or so.
| >
| > cfi_store really has a bad name: it's confusing whether it's the CFI
| > info we are storing/registering (which we are), or a 'store' instruction
| > (which this is too).
| >
| > If so, then we should name it movq_cfi_store or movq_cfi_register - but
| > that's too long.
| >
| > movq_cfi for the frame construction direction and movq_cfi_restore for
| > the frame deconstruction phase sounds like a good naming compromise, hm?
| >
| > Ingo
|
| --
| Alexander van Heukelum
|
Hi,
I didn't check if it's possible, but maybe just put these CFI
annotations on the right side, like
movq 8(%rsp),%r11 ; CFI_RESTORE r11
Hmm? Yes, I know -- it will not make for 'auto' cfi annotations,
but *anyway*, if say I'm to write this kind of code I _have_
to know how CFI works (or at least read some specs about it),
I guess.
- Cyrill -
On Thu, Nov 20, 2008 at 04:57:44PM +0100, Alexander van Heukelum wrote:
> On Thu, Nov 20, 2008 at 04:39:54PM +0100, Ingo Molnar wrote:
> > > > Note, i did a small rename:
> > > >
> > > > cfi_pushq => pushq_cfi
> > > > cfi_popq => popq_cfi
> > > > cfi_store => movq_cfi
>
> Does not work... But if you are attached to the underscores, I
> think we can force it to work by using CPP to convert it to
> something the assembler does parse right:
Hi Ingo,
False alarm. It's happily building at the moment. Does it still
fail for you?
Greetings,
Alexander
> #define pushq_cfi pushq.cfi
>
> etc?
>
> Or is that just too ugly?
>
> Alexander
* Alexander van Heukelum <[email protected]> wrote:
> Does not work... But if you are attached to the underscores, I think
> we can force it to work by using CPP to convert it to something the
> assembler does parse right:
>
> #define pushq_cfi pushq.cfi
>
> etc?
>
> Or is that just too ugly?
changed it to pushq.cfi / popq.cfi / movq.cfi - does that read OK to
you? It looks fine here.
Ingo
The save_rest function completes a partial stack frame for use
by the PTREGSCALL macro. This also avoids the indirect call in
PTREGSCALLs.
This adds the macro movq_cfi_restore to hide the CFI_RESTORE
annotation when restoring a register from the stack frame.
Signed-off-by: Alexander van Heukelum <[email protected]>
---
arch/x86/kernel/entry_64.S | 91 +++++++++++++++++++++++++------------------
1 files changed, 53 insertions(+), 38 deletions(-)
Hello Ingo,
Here are three more patches cleaning up entry_64.S. They apply to
current tip/x86/irq. Thanks again for your help on the binutils
problems!
Greetings,
Alexander
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 92c5e18..ef95c45 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -78,6 +78,11 @@
CFI_REL_OFFSET \reg, \offset
.endm
+ .macro movq_cfi_restore offset reg
+ movq \offset(%rsp), %\reg
+ CFI_RESTORE \reg
+ .endm
+
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
@@ -186,21 +191,21 @@ ENTRY(native_usergs_sysret64)
*/
/* %rsp:at FRAMEEND */
- .macro FIXUP_TOP_OF_STACK tmp
- movq %gs:pda_oldrsp,\tmp
- movq \tmp,RSP(%rsp)
- movq $__USER_DS,SS(%rsp)
- movq $__USER_CS,CS(%rsp)
- movq $-1,RCX(%rsp)
- movq R11(%rsp),\tmp /* get eflags */
- movq \tmp,EFLAGS(%rsp)
+ .macro FIXUP_TOP_OF_STACK tmp offset=0
+ movq %gs:pda_oldrsp,\tmp
+ movq \tmp,RSP+\offset(%rsp)
+ movq $__USER_DS,SS+\offset(%rsp)
+ movq $__USER_CS,CS+\offset(%rsp)
+ movq $-1,RCX+\offset(%rsp)
+ movq R11+\offset(%rsp),\tmp /* get eflags */
+ movq \tmp,EFLAGS+\offset(%rsp)
.endm
- .macro RESTORE_TOP_OF_STACK tmp,offset=0
- movq RSP-\offset(%rsp),\tmp
- movq \tmp,%gs:pda_oldrsp
- movq EFLAGS-\offset(%rsp),\tmp
- movq \tmp,R11-\offset(%rsp)
+ .macro RESTORE_TOP_OF_STACK tmp offset=0
+ movq RSP+\offset(%rsp),\tmp
+ movq \tmp,%gs:pda_oldrsp
+ movq EFLAGS+\offset(%rsp),\tmp
+ movq \tmp,R11+\offset(%rsp)
.endm
.macro FAKE_STACK_FRAME child_rip
@@ -333,6 +338,21 @@ ENTRY(save_args)
CFI_ENDPROC
END(save_args)
+ENTRY(save_rest)
+ PARTIAL_FRAME 1 REST_SKIP+8
+ movq 5*8+16(%rsp), %r11 /* save return address */
+ movq_cfi rbx, RBX+16
+ movq_cfi rbp, RBP+16
+ movq_cfi r12, R12+16
+ movq_cfi r13, R13+16
+ movq_cfi r14, R14+16
+ movq_cfi r15, R15+16
+ movq %r11, 8(%rsp) /* return address */
+ FIXUP_TOP_OF_STACK %r11, 16
+ ret
+ CFI_ENDPROC
+END(save_rest)
+
/*
* A newly forked process directly context switches into this.
*/
@@ -353,7 +373,7 @@ rff_action:
je int_ret_from_sys_call
testl $_TIF_IA32,TI_flags(%rcx)
jnz int_ret_from_sys_call
- RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
+ RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call
rff_trace:
movq %rsp,%rdi
@@ -626,18 +646,20 @@ END(system_call)
/*
* Certain special system calls that need to save a complete full stack frame.
*/
-
.macro PTREGSCALL label,func,arg
- .globl \label
-\label:
- leaq \func(%rip),%rax
- leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
- jmp ptregscall_common
+ENTRY(\label)
+ PARTIAL_FRAME 1 8 /* offset 8: return address */
+ subq $REST_SKIP, %rsp
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ call save_rest
+ DEFAULT_FRAME 0 8 /* offset 8: return address */
+ leaq 8(%rsp), \arg /* pt_regs pointer */
+ call \func
+ jmp ptregscall_common
+ CFI_ENDPROC
END(\label)
.endm
- CFI_STARTPROC
-
PTREGSCALL stub_clone, sys_clone, %r8
PTREGSCALL stub_fork, sys_fork, %rdi
PTREGSCALL stub_vfork, sys_vfork, %rdi
@@ -645,22 +667,15 @@ END(\label)
PTREGSCALL stub_iopl, sys_iopl, %rsi
ENTRY(ptregscall_common)
- popq %r11
- CFI_ADJUST_CFA_OFFSET -8
- CFI_REGISTER rip, r11
- SAVE_REST
- movq %r11, %r15
- CFI_REGISTER rip, r15
- FIXUP_TOP_OF_STACK %r11
- call *%rax
- RESTORE_TOP_OF_STACK %r11
- movq %r15, %r11
- CFI_REGISTER rip, r11
- RESTORE_REST
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rip, 0
- ret
+ DEFAULT_FRAME 1 8 /* offset 8: return address */
+ RESTORE_TOP_OF_STACK %r11, 8
+ movq_cfi_restore R15+8, r15
+ movq_cfi_restore R14+8, r14
+ movq_cfi_restore R13+8, r13
+ movq_cfi_restore R12+8, r12
+ movq_cfi_restore RBP+8, rbp
+ movq_cfi_restore RBX+8, rbx
+ ret $REST_SKIP /* pop extended registers */
CFI_ENDPROC
END(ptregscall_common)
--
1.5.4.3
Also expand the paranoid_exit0 macro into nmi_exit inside the
nmi stub in the case of enabled irq-tracing.
This gives a few hundred bytes code size reduction.
Signed-off-by: Alexander van Heukelum <[email protected]>
---
arch/x86/kernel/entry_64.S | 151 +++++++++++++++++++++++++++++--------------
1 files changed, 102 insertions(+), 49 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ef95c45..fad777b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -353,6 +353,36 @@ ENTRY(save_rest)
CFI_ENDPROC
END(save_rest)
+/* save complete stack frame */
+ENTRY(save_paranoid)
+ XCPT_FRAME 1 RDI+8
+ cld
+ movq_cfi rdi, RDI+8
+ movq_cfi rsi, RSI+8
+ movq_cfi rdx, RDX+8
+ movq_cfi rcx, RCX+8
+ movq_cfi rax, RAX+8
+ movq_cfi r8, R8+8
+ movq_cfi r9, R9+8
+ movq_cfi r10, R10+8
+ movq_cfi r11, R11+8
+ movq_cfi rbx, RBX+8
+ movq_cfi rbp, RBP+8
+ movq_cfi r12, R12+8
+ movq_cfi r13, R13+8
+ movq_cfi r14, R14+8
+ movq_cfi r15, R15+8
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+ CFI_ENDPROC
+END(save_paranoid)
+
/*
* A newly forked process directly context switches into this.
*/
@@ -1012,24 +1042,15 @@ END(spurious_interrupt)
.endm
/* error code is on the stack already */
- /* handle NMI like exceptions that can happen everywhere */
- .macro paranoidentry sym, ist=0, irqtrace=1
- SAVE_ALL
- cld
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f
- SWAPGS
- xorl %ebx,%ebx
-1:
+ .macro paranoidentry sym ist=0
+ subq $15*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call save_paranoid
+ DEFAULT_FRAME 0
.if \ist
movq %gs:pda_data_offset, %rbp
.endif
- .if \irqtrace
TRACE_IRQS_OFF
- .endif
movq %rsp,%rdi
movq ORIG_RAX(%rsp),%rsi
movq $-1,ORIG_RAX(%rsp)
@@ -1041,9 +1062,7 @@ END(spurious_interrupt)
addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
.endif
DISABLE_INTERRUPTS(CLBR_NONE)
- .if \irqtrace
TRACE_IRQS_OFF
- .endif
.endm
/*
@@ -1058,57 +1077,48 @@ END(spurious_interrupt)
* is fundamentally NMI-unsafe. (we cannot change the soft and
* hard flags at once, atomically)
*/
- .macro paranoidexit trace=1
+
/* ebx: no swapgs flag */
-paranoid_exit\trace:
+KPROBE_ENTRY(paranoid_exit)
+ INTR_FRAME
testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore\trace
+ jnz paranoid_restore
testl $3,CS(%rsp)
- jnz paranoid_userspace\trace
-paranoid_swapgs\trace:
- .if \trace
+ jnz paranoid_userspace
+paranoid_swapgs:
TRACE_IRQS_IRETQ 0
- .endif
SWAPGS_UNSAFE_STACK
-paranoid_restore\trace:
+paranoid_restore:
RESTORE_ALL 8
jmp irq_return
-paranoid_userspace\trace:
+paranoid_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs\trace
+ jz paranoid_swapgs
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule\trace
+ jnz paranoid_schedule
movl %ebx,%edx /* arg3: thread flags */
- .if \trace
TRACE_IRQS_ON
- .endif
ENABLE_INTERRUPTS(CLBR_NONE)
xorl %esi,%esi /* arg2: oldset */
movq %rsp,%rdi /* arg1: &pt_regs */
call do_notify_resume
DISABLE_INTERRUPTS(CLBR_NONE)
- .if \trace
TRACE_IRQS_OFF
- .endif
- jmp paranoid_userspace\trace
-paranoid_schedule\trace:
- .if \trace
+ jmp paranoid_userspace
+paranoid_schedule:
TRACE_IRQS_ON
- .endif
ENABLE_INTERRUPTS(CLBR_ANY)
call schedule
DISABLE_INTERRUPTS(CLBR_ANY)
- .if \trace
TRACE_IRQS_OFF
- .endif
- jmp paranoid_userspace\trace
+ jmp paranoid_userspace
CFI_ENDPROC
- .endm
+END(paranoid_exit)
/*
* Exception entry point. This expects an error code/orig_rax on the stack.
@@ -1326,20 +1336,63 @@ KPROBE_ENTRY(debug)
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_debug, DEBUG_STACK
- paranoidexit
+ jmp paranoid_exit
+ CFI_ENDPROC
KPROBE_END(debug)
/* runs on exception stack */
KPROBE_ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $-1
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_nmi, 0, 0
+ pushq_cfi $-1
+ subq $15*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call save_paranoid
+ DEFAULT_FRAME 0
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ movq %rsp,%rdi
+ movq ORIG_RAX(%rsp),%rsi
+ movq $-1,ORIG_RAX(%rsp)
+ call do_nmi
+ DISABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_TRACE_IRQFLAGS
- paranoidexit 0
+ /* paranoidexit; without TRACE_IRQS_OFF */
+ /* ebx: no swapgs flag */
+nmi_exit:
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz nmi_restore
+ testl $3,CS(%rsp)
+ jnz nmi_userspace
+nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+nmi_restore:
+ RESTORE_ALL 8
+ jmp irq_return
+nmi_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+ jz nmi_swapgs
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz nmi_schedule
+ movl %ebx,%edx /* arg3: thread flags */
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ jmp nmi_userspace
+nmi_schedule:
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ jmp nmi_userspace
+ CFI_ENDPROC
#else
- jmp paranoid_exit1
+ jmp paranoid_exit
CFI_ENDPROC
#endif
KPROBE_END(nmi)
@@ -1350,7 +1403,7 @@ KPROBE_ENTRY(int3)
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_int3, DEBUG_STACK
- jmp paranoid_exit1
+ jmp paranoid_exit
CFI_ENDPROC
KPROBE_END(int3)
@@ -1375,7 +1428,7 @@ ENTRY(double_fault)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_double_fault
- jmp paranoid_exit1
+ jmp paranoid_exit
CFI_ENDPROC
END(double_fault)
@@ -1392,7 +1445,7 @@ ENTRY(stack_segment)
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
paranoidentry do_stack_segment
- jmp paranoid_exit1
+ jmp paranoid_exit
CFI_ENDPROC
END(stack_segment)
@@ -1420,7 +1473,7 @@ ENTRY(machine_check)
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_machine_check
- jmp paranoid_exit1
+ jmp paranoid_exit
CFI_ENDPROC
END(machine_check)
#endif
--
1.5.4.3
DISABLE_INTERRUPTS(CLBR_NONE)/TRACE_IRQS_OFF is now always
executed just before paranoid_exit. Move it there.
Split out paranoidzeroentry, paranoiderrorentry, and
paranoidzeroentry_ist to get more readable macros.
Signed-off-by: Alexander van Heukelum <[email protected]>
---
arch/x86/kernel/entry_64.S | 102 ++++++++++++++++++++++----------------------
1 files changed, 51 insertions(+), 51 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index fad777b..692c1da 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1026,6 +1026,39 @@ END(spurious_interrupt)
CFI_ENDPROC
.endm
+ .macro paranoidzeroentry sym
+ INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ CFI_ADJUST_CFA_OFFSET 8
+ subq $15*8, %rsp
+ call save_paranoid
+ TRACE_IRQS_OFF
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ call \sym
+ jmp paranoid_exit /* %ebx: no swapgs flag */
+ CFI_ENDPROC
+ .endm
+
+ .macro paranoidzeroentry_ist sym ist
+ INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ CFI_ADJUST_CFA_OFFSET 8
+ subq $15*8, %rsp
+ call save_paranoid
+ TRACE_IRQS_OFF
+ movq %rsp,%rdi /* pt_regs pointer */
+ xorl %esi,%esi /* no error code */
+ movq %gs:pda_data_offset, %rbp
+ subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ call \sym
+ addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+ jmp paranoid_exit /* %ebx: no swapgs flag */
+ CFI_ENDPROC
+ .endm
+
.macro errorentry sym
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1042,27 +1075,20 @@ END(spurious_interrupt)
.endm
/* error code is on the stack already */
- .macro paranoidentry sym ist=0
- subq $15*8, %rsp
+ .macro paranoiderrorentry sym
+ XCPT_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ subq $15*8,%rsp
CFI_ADJUST_CFA_OFFSET 15*8
call save_paranoid
DEFAULT_FRAME 0
- .if \ist
- movq %gs:pda_data_offset, %rbp
- .endif
TRACE_IRQS_OFF
- movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi
- movq $-1,ORIG_RAX(%rsp)
- .if \ist
- subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
+ movq %rsp,%rdi /* pt_regs pointer */
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
call \sym
- .if \ist
- addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- .endif
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
+ jmp paranoid_exit /* %ebx: no swapgs flag */
+ CFI_ENDPROC
.endm
/*
@@ -1081,6 +1107,8 @@ END(spurious_interrupt)
/* ebx: no swapgs flag */
KPROBE_ENTRY(paranoid_exit)
INTR_FRAME
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
testl $3,CS(%rsp)
@@ -1331,13 +1359,7 @@ END(device_not_available)
/* runs on exception stack */
KPROBE_ENTRY(debug)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_debug, DEBUG_STACK
- jmp paranoid_exit
- CFI_ENDPROC
+ paranoidzeroentry_ist do_debug, DEBUG_STACK
KPROBE_END(debug)
/* runs on exception stack */
@@ -1351,14 +1373,12 @@ KPROBE_ENTRY(nmi)
DEFAULT_FRAME 0
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
- movq ORIG_RAX(%rsp),%rsi
- movq $-1,ORIG_RAX(%rsp)
+ movq $-1,%rsi
call do_nmi
- DISABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_TRACE_IRQFLAGS
/* paranoidexit; without TRACE_IRQS_OFF */
/* ebx: no swapgs flag */
-nmi_exit:
+ DISABLE_INTERRUPTS(CLBR_NONE)
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
testl $3,CS(%rsp)
@@ -1398,13 +1418,7 @@ nmi_schedule:
KPROBE_END(nmi)
KPROBE_ENTRY(int3)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_int3, DEBUG_STACK
- jmp paranoid_exit
- CFI_ENDPROC
+ paranoidzeroentry_ist do_int3, DEBUG_STACK
KPROBE_END(int3)
ENTRY(overflow)
@@ -1425,11 +1439,7 @@ END(coprocessor_segment_overrun)
/* runs on exception stack */
ENTRY(double_fault)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- paranoidentry do_double_fault
- jmp paranoid_exit
- CFI_ENDPROC
+ paranoiderrorentry do_double_fault
END(double_fault)
ENTRY(invalid_TSS)
@@ -1442,11 +1452,7 @@ END(segment_not_present)
/* runs on exception stack */
ENTRY(stack_segment)
- XCPT_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- paranoidentry do_stack_segment
- jmp paranoid_exit
- CFI_ENDPROC
+ paranoiderrorentry do_stack_segment
END(stack_segment)
KPROBE_ENTRY(general_protection)
@@ -1468,13 +1474,7 @@ END(spurious_interrupt_bug)
#ifdef CONFIG_X86_MCE
/* runs on exception stack */
ENTRY(machine_check)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq $0
- CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_machine_check
- jmp paranoid_exit
- CFI_ENDPROC
+ paranoidzeroentry do_machine_check
END(machine_check)
#endif
--
1.5.4.3
* Alexander van Heukelum <[email protected]> wrote:
> /* runs on exception stack */
> ENTRY(stack_segment)
> - XCPT_FRAME
> - PARAVIRT_ADJUST_EXCEPTION_FRAME
> - paranoidentry do_stack_segment
> - jmp paranoid_exit
> - CFI_ENDPROC
> + paranoiderrorentry do_stack_segment
> END(stack_segment)
cool!
Could you please also collapse it into just a single macro? The
ENTRY/END sequence can be generated by the macro. (if you do it then
please do it in a followup patch, I'll look at and apply your current
series)
Something like:
PARANOID_ERROR_ENTRY(stack_segment)
Could do all the magic, right?
Ingo
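A minimal sketch of such a collapsed macro -- the name
PARANOID_ERROR_ENTRY is Ingo's suggestion, not code from the series; it
just wraps the existing paranoiderrorentry body in the ENTRY/END pair
that each use would otherwise need:

	.macro PARANOID_ERROR_ENTRY sym
ENTRY(\sym)
	paranoiderrorentry do_\sym
END(\sym)
	.endm

Invoked as "PARANOID_ERROR_ENTRY stack_segment", this would expand to
the same code as the open-coded ENTRY(stack_segment)/END(stack_segment)
pair above.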
Impact: cleanup of entry_64.S
Except for the order and the place of the functions, this
patch should not change the generated code.
Signed-off-by: Alexander van Heukelum <[email protected]>
---
arch/x86/kernel/entry_64.S | 259 +++++++++++++++++++-------------------------
1 files changed, 109 insertions(+), 150 deletions(-)
On Fri, Nov 21, 2008 at 05:06:29PM +0100, Ingo Molnar wrote:
>
> * Alexander van Heukelum <[email protected]> wrote:
>
> > /* runs on exception stack */
> > ENTRY(stack_segment)
> > - XCPT_FRAME
> > - PARAVIRT_ADJUST_EXCEPTION_FRAME
> > - paranoidentry do_stack_segment
> > - jmp paranoid_exit
> > - CFI_ENDPROC
> > + paranoiderrorentry do_stack_segment
> > END(stack_segment)
>
> cool!
>
> Could you please also collapse it into just a single macro? The
> ENTRY/END sequence can be generated by the macro. (if you do it then
> please do it in a followup patch, I'll look at and apply your current
> series)
>
> Something like:
>
> PARANOID_ERROR_ENTRY(stack_segment)
I chose to just reuse the existing names, but it's bikeshedding, so
change it if you like ;)
I now know why I did not hit the bug that was fixed by "x86: split
out some macro's and move common code to paranoid_exit, fix"...
*blush* I was doing my real-world testing using an i386 image of
Debian.
Greetings,
Alexander
> Could do all the magic, right?
>
> Ingo
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e5ddf57..ec0ed9c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -945,76 +945,70 @@ END(common_interrupt)
/*
* APIC interrupts.
*/
- .p2align 5
-
- .macro apicinterrupt num,func
+.macro apicinterrupt num sym do_sym
+ENTRY(\sym)
INTR_FRAME
pushq $~(\num)
CFI_ADJUST_CFA_OFFSET 8
- interrupt \func
+ interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
-ENTRY(thermal_interrupt)
- apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
-END(thermal_interrupt)
+#ifdef CONFIG_SMP
+apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
+ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+#endif
-ENTRY(threshold_interrupt)
- apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
-END(threshold_interrupt)
+apicinterrupt 220 \
+ uv_bau_message_intr1 uv_bau_message_interrupt
+apicinterrupt LOCAL_TIMER_VECTOR \
+ apic_timer_interrupt smp_apic_timer_interrupt
#ifdef CONFIG_SMP
-ENTRY(reschedule_interrupt)
- apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
-END(reschedule_interrupt)
-
- .macro INVALIDATE_ENTRY num
-ENTRY(invalidate_interrupt\num)
- apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
-END(invalidate_interrupt\num)
- .endm
-
- INVALIDATE_ENTRY 0
- INVALIDATE_ENTRY 1
- INVALIDATE_ENTRY 2
- INVALIDATE_ENTRY 3
- INVALIDATE_ENTRY 4
- INVALIDATE_ENTRY 5
- INVALIDATE_ENTRY 6
- INVALIDATE_ENTRY 7
-
-ENTRY(call_function_interrupt)
- apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
-END(call_function_interrupt)
-ENTRY(call_function_single_interrupt)
- apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
-END(call_function_single_interrupt)
-ENTRY(irq_move_cleanup_interrupt)
- apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
-END(irq_move_cleanup_interrupt)
+apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
+ invalidate_interrupt0 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
+ invalidate_interrupt1 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
+ invalidate_interrupt2 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
+ invalidate_interrupt3 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
+ invalidate_interrupt4 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
+ invalidate_interrupt5 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
+ invalidate_interrupt6 smp_invalidate_interrupt
+apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
+ invalidate_interrupt7 smp_invalidate_interrupt
#endif
-ENTRY(apic_timer_interrupt)
- apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
-END(apic_timer_interrupt)
-
-ENTRY(uv_bau_message_intr1)
- apicinterrupt 220,uv_bau_message_interrupt
-END(uv_bau_message_intr1)
+apicinterrupt THRESHOLD_APIC_VECTOR \
+ threshold_interrupt mce_threshold_interrupt
+apicinterrupt THERMAL_APIC_VECTOR \
+ thermal_interrupt smp_thermal_interrupt
-ENTRY(error_interrupt)
- apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
-END(error_interrupt)
+#ifdef CONFIG_SMP
+apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
+ call_function_single_interrupt smp_call_function_single_interrupt
+apicinterrupt CALL_FUNCTION_VECTOR \
+ call_function_interrupt smp_call_function_interrupt
+apicinterrupt RESCHEDULE_VECTOR \
+ reschedule_interrupt smp_reschedule_interrupt
+#endif
-ENTRY(spurious_interrupt)
- apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
-END(spurious_interrupt)
+apicinterrupt ERROR_APIC_VECTOR \
+ error_interrupt smp_error_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR \
+ spurious_interrupt smp_spurious_interrupt
/*
* Exception entry points.
*/
- .macro zeroentry sym
+.macro zeroentry sym do_sym
+ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
@@ -1024,12 +1018,14 @@ END(spurious_interrupt)
DEFAULT_FRAME 0
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
- call \sym
+ call \do_sym
jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+END(\sym)
+.endm
- .macro paranoidzeroentry sym
+.macro paranoidzeroentry sym do_sym
+KPROBE_ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -1039,12 +1035,14 @@ END(spurious_interrupt)
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
- call \sym
+ call \do_sym
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+KPROBE_END(\sym)
+.endm
- .macro paranoidzeroentry_ist sym ist
+.macro paranoidzeroentry_ist sym do_sym ist
+KPROBE_ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -1056,13 +1054,19 @@ END(spurious_interrupt)
xorl %esi,%esi /* no error code */
movq %gs:pda_data_offset, %rbp
subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
- call \sym
+ call \do_sym
addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+KPROBE_END(\sym)
+.endm
- .macro errorentry sym
+.macro errorentry sym do_sym entry=0
+.if \entry
+KPROBE_ENTRY(\sym)
+.else
+ENTRY(\sym)
+.endif
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
subq $15*8,%rsp
@@ -1072,13 +1076,23 @@ END(spurious_interrupt)
movq %rsp,%rdi /* pt_regs pointer */
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \sym
+ call \do_sym
jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+.if \entry
+KPROBE_END(\sym)
+.else
+END(\sym)
+.endif
+.endm
/* error code is on the stack already */
- .macro paranoiderrorentry sym
+.macro paranoiderrorentry sym do_sym entry=1
+.if \entry
+KPROBE_ENTRY(\sym)
+.else
+ENTRY(\sym)
+.endif
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
subq $15*8,%rsp
@@ -1089,10 +1103,37 @@ END(spurious_interrupt)
movq %rsp,%rdi /* pt_regs pointer */
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \sym
+ call \do_sym
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
- .endm
+.if \entry
+KPROBE_END(\sym)
+.else
+END(\sym)
+.endif
+.endm
+
+zeroentry divide_error do_divide_error
+paranoidzeroentry_ist debug do_debug DEBUG_STACK
+paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
+zeroentry overflow do_overflow
+zeroentry bounds do_bounds
+zeroentry invalid_op do_invalid_op
+zeroentry device_not_available do_device_not_available
+paranoiderrorentry double_fault do_double_fault 0
+zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
+errorentry invalid_TSS do_invalid_TSS
+errorentry segment_not_present do_segment_not_present
+paranoiderrorentry stack_segment do_stack_segment
+errorentry general_protection do_general_protection 1
+errorentry page_fault do_page_fault 1
+zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
+zeroentry coprocessor_error do_coprocessor_error
+errorentry alignment_check do_alignment_check
+#ifdef CONFIG_X86_MCE
+paranoidzeroentry machine_check do_machine_check
+#endif
+zeroentry simd_coprocessor_error do_simd_coprocessor_error
/*
* "Paranoid" exit path from exception stack.
@@ -1344,26 +1385,7 @@ ENTRY(kernel_execve)
CFI_ENDPROC
ENDPROC(kernel_execve)
-KPROBE_ENTRY(page_fault)
- errorentry do_page_fault
-KPROBE_END(page_fault)
-ENTRY(coprocessor_error)
- zeroentry do_coprocessor_error
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
- zeroentry do_simd_coprocessor_error
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
- zeroentry do_device_not_available
-END(device_not_available)
-
- /* runs on exception stack */
-KPROBE_ENTRY(debug)
- paranoidzeroentry_ist do_debug, DEBUG_STACK
-KPROBE_END(debug)
/* runs on exception stack */
KPROBE_ENTRY(nmi)
@@ -1420,67 +1442,6 @@ nmi_schedule:
#endif
KPROBE_END(nmi)
-KPROBE_ENTRY(int3)
- paranoidzeroentry_ist do_int3, DEBUG_STACK
-KPROBE_END(int3)
-
-ENTRY(overflow)
- zeroentry do_overflow
-END(overflow)
-
-ENTRY(bounds)
- zeroentry do_bounds
-END(bounds)
-
-ENTRY(invalid_op)
- zeroentry do_invalid_op
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
- zeroentry do_coprocessor_segment_overrun
-END(coprocessor_segment_overrun)
-
- /* runs on exception stack */
-ENTRY(double_fault)
- paranoiderrorentry do_double_fault
-END(double_fault)
-
-ENTRY(invalid_TSS)
- errorentry do_invalid_TSS
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
- errorentry do_segment_not_present
-END(segment_not_present)
-
- /* runs on exception stack */
-ENTRY(stack_segment)
- paranoiderrorentry do_stack_segment
-END(stack_segment)
-
-KPROBE_ENTRY(general_protection)
- errorentry do_general_protection
-KPROBE_END(general_protection)
-
-ENTRY(alignment_check)
- errorentry do_alignment_check
-END(alignment_check)
-
-ENTRY(divide_error)
- zeroentry do_divide_error
-END(divide_error)
-
-ENTRY(spurious_interrupt_bug)
- zeroentry do_spurious_interrupt_bug
-END(spurious_interrupt_bug)
-
-#ifdef CONFIG_X86_MCE
- /* runs on exception stack */
-ENTRY(machine_check)
- paranoidzeroentry do_machine_check
-END(machine_check)
-#endif
-
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
@@ -1509,9 +1470,7 @@ KPROBE_ENTRY(ignore_sysret)
ENDPROC(ignore_sysret)
#ifdef CONFIG_XEN
-ENTRY(xen_hypervisor_callback)
- zeroentry xen_do_hypervisor_callback
-END(xen_hypervisor_callback)
+zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
/*
# A note on the "critical region" in our callback handler.
--
1.5.4.3
Impact: moves some code out of .kprobes.text
KPROBE_ENTRY switches code generation to .kprobes.text, and KPROBE_END
uses .popsection to get back to the previous section (.text, normally).
Also replace ENDPROC by END, for consistency.
Signed-off-by: Alexander van Heukelum <[email protected]>
---
arch/x86/kernel/entry_64.S | 16 +++++++---------
1 files changed, 7 insertions(+), 9 deletions(-)
Hi Ingo,
One more small change for today. The xen-related functions
xen_do_hypervisor_callback and xen_failsafe_callback are put
in the .kprobes.text even in the current kernel: ignore_sysret
is enclosed in KPROBE_ENTRY / ENDPROC, instead of KPROBE_ENTRY /
KPROBE_END, but I guess the situation is harmless.
Greetings,
Alexander
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ec0ed9c..477a428 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1190,7 +1190,7 @@ paranoid_schedule:
TRACE_IRQS_OFF
jmp paranoid_userspace
CFI_ENDPROC
-END(paranoid_exit)
+KPROBE_END(paranoid_exit)
/*
* Exception entry point. This expects an error code/orig_rax on the stack.
@@ -1282,7 +1282,7 @@ gs_change:
CFI_ADJUST_CFA_OFFSET -8
ret
CFI_ENDPROC
-ENDPROC(native_load_gs_index)
+END(native_load_gs_index)
.section __ex_table,"a"
.align 8
@@ -1336,7 +1336,7 @@ ENTRY(kernel_thread)
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
-ENDPROC(kernel_thread)
+END(kernel_thread)
child_rip:
pushq $0 # fake return address
@@ -1352,7 +1352,7 @@ child_rip:
mov %eax, %edi
call do_exit
CFI_ENDPROC
-ENDPROC(child_rip)
+END(child_rip)
/*
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1383,9 +1383,7 @@ ENTRY(kernel_execve)
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
-ENDPROC(kernel_execve)
-
-
+END(kernel_execve)
/* runs on exception stack */
KPROBE_ENTRY(nmi)
@@ -1460,14 +1458,14 @@ ENTRY(call_softirq)
decl %gs:pda_irqcount
ret
CFI_ENDPROC
-ENDPROC(call_softirq)
+END(call_softirq)
KPROBE_ENTRY(ignore_sysret)
CFI_STARTPROC
mov $-ENOSYS,%eax
sysret
CFI_ENDPROC
-ENDPROC(ignore_sysret)
+KPROBE_END(ignore_sysret)
#ifdef CONFIG_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
--
1.5.4.3
[Alexander van Heukelum - Sun, Nov 23, 2008 at 10:08:28AM +0100]
| Impact: cleanup of entry_64.S
|
| Except for the order and the place of the functions, this
| patch should not change the generated code.
|
| Signed-off-by: Alexander van Heukelum <[email protected]>
|
| ---
| arch/x86/kernel/entry_64.S | 259 +++++++++++++++++++-------------------------
| 1 files changed, 109 insertions(+), 150 deletions(-)
|
Hi Alexander,
great! One moment is not obvious to me -- why did we
stop aligning the interrupt section to 32 bytes?
Did I miss anything?
- Cyrill -
On Sun, Nov 23, 2008 at 12:21:36PM +0300, Cyrill Gorcunov wrote:
> [Alexander van Heukelum - Sun, Nov 23, 2008 at 10:08:28AM +0100]
> | Impact: cleanup of entry_64.S
> |
> | Except for the order and the place of the functions, this
> | patch should not change the generated code.
> |
> | Signed-off-by: Alexander van Heukelum <[email protected]>
> |
> | ---
> | arch/x86/kernel/entry_64.S | 259 +++++++++++++++++++-------------------------
> | 1 files changed, 109 insertions(+), 150 deletions(-)
> |
>
> Hi Alexander,
>
> great! One moment is not obvious to me -- why did we
> stop aligning the interrupt section to 32 bytes?
> Did I miss anything?
I put a ".p2align 5" in earlier in the series which caused the
apicinterrupts to be 32-byte aligned. But it is a hack, really,
relying on the generated code per stub to be between 17 and 32
bytes, on the default alignment to be 16 bytes and all stubs
to be in the .text section.
I'm in favour of aligning all of the interrupt/exception stubs
to 32 bytes, but it should be implemented the right way ;),
which means that we need KPROBE_ENTRY_P5ALIGNED and so on :-/.
Greetings,
Alexander
> - Cyrill -
[Alexander van Heukelum - Sun, Nov 23, 2008 at 12:23:54PM +0100]
| On Sun, Nov 23, 2008 at 12:21:36PM +0300, Cyrill Gorcunov wrote:
| > [...]
| >
| > Hi Alexander,
| >
| > great! One moment is not obvious to me -- why did we
| > stop aligning the interrupt section to 32 bytes?
| > Did I miss anything?
|
| I put a ".p2align 5" in earlier in the series which caused the
| apicinterrupts to be 32-byte aligned. But it is a hack, really,
| relying on the generated code per stub to be between 17 and 32
| bytes, on the default alignment to be 16 bytes and all stubs
| to be in the .text section.
|
| I'm in favour of aligning all of the interrupt/exception stubs
| to 32 bytes, but it should be implemented the right way ;),
| which means that we need KPROBE_ENTRY_P5ALIGNED and so on :-/.
|
| Greetings,
| Alexander
|
| > - Cyrill -
|
ah. thanks for info.
- Cyrill -
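For the record, the "right way" Alexander alludes to would need an
alignment-aware variant of the linkage macros. A hypothetical sketch --
the name KPROBE_ENTRY_P5ALIGNED comes from his remark and is not part
of any posted patch; it mirrors KPROBE_ENTRY but forces 32-byte
alignment, and would still be closed by the usual KPROBE_END:

	#define KPROBE_ENTRY_P5ALIGNED(name) \
		.pushsection .kprobes.text, "ax"; \
		.p2align 5; \
		.globl name; \
		name: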
* Alexander van Heukelum <[email protected]> wrote:
> Impact: cleanup of entry_64.S
>
> Except for the order and the place of the functions, this
> patch should not change the generated code.
>
> Signed-off-by: Alexander van Heukelum <[email protected]>
>
> ---
> arch/x86/kernel/entry_64.S | 259 +++++++++++++++++++-------------------------
> 1 files changed, 109 insertions(+), 150 deletions(-)
applied to tip/x86/irq, thanks Alexander!
> > Something like:
> >
> > PARANOID_ERROR_ENTRY(stack_segment)
>
> I chose to just reuse the existing names, but it's bikeshedding, so
> change it if you like ;)
yeah. But such things, if they pile up long enough, can result in real
problems. entry_64.S is the result of such a degenerative process.
> I now know why I did not hit the bug that was fixed by "x86: split
> out some macro's and move common code to paranoid_exit, fix"...
> *blush* I was doing my real-world testing using an i386 image of
> Debian.
heh, that indeed explains it :)
Ingo
* Alexander van Heukelum <[email protected]> wrote:
> Impact: moves some code out of .kprobes.text
>
> KPROBE_ENTRY switches code generation to .kprobes.text, and KPROBE_END
> uses .popsection to get back to the previous section (.text, normally).
> Also replace ENDPROC by END, for consistency.
>
> Signed-off-by: Alexander van Heukelum <[email protected]>
applied to tip/x86/irq, thanks Alexander!
> One more small change for today. The xen-related functions
> xen_do_hypervisor_callback and xen_failsafe_callback are put
> in the .kprobes.text even in the current kernel: ignore_sysret
> is enclosed in KPROBE_ENTRY / ENDPROC, instead of KPROBE_ENTRY /
> KPROBE_END, but I guess the situation is harmless.
yeah. It narrows no-kprobes protection for that code, but it should
indeed be fine (and that's the intention as well).
Note that this is a recurring bug type, and rather long-lived. Can
you think of any way to get automated nesting protection of both the
.cfi_startproc/endproc macros and kprobes start/end? A poor man's
solution would be to grep the number of start and end methods and
enforce that they are equal.
Ingo
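A sketch of that poor man's check as a shell one-off -- the macro names
are the ones used in entry_64.S, and grep -c only counts lines, so this
catches imbalance but not misnesting:

	for f in arch/x86/kernel/entry_64.S; do
		for pair in "CFI_STARTPROC CFI_ENDPROC" "KPROBE_ENTRY KPROBE_END"; do
			set -- $pair
			open=$(grep -c "$1" "$f"); shut=$(grep -c "$2" "$f")
			[ "$open" -eq "$shut" ] || echo "$f: $open $1 vs $shut $2"
		done
	done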
[Ingo Molnar - Sun, Nov 23, 2008 at 02:27:52PM +0100]
|
| [...]
|
| Note that this is a recurring bug type, and rather long-lived. Can
| you think of any way to get automated nesting protection of both the
| .cfi_startproc/endproc macros and kprobes start/end? A poor man's
| solution would be to grep the number of start and end methods and
| enforce that they are equal.
|
| Ingo
|
I think we could play with preprocessor and check if ENTRY/END matches.
Looking now.
- Cyrill -
[Cyrill Gorcunov - Sun, Nov 23, 2008 at 04:51:34PM +0300]
| [...]
|
| I think we could play with preprocessor and check if ENTRY/END matches.
| Looking now.
|
| - Cyrill -
Here is what I've done
1) Add some macros like:
.macro __set_entry
.set _ENTRY_IN, 1
.endm
.macro __unset_entry
.set _ENTRY_IN, 0
.endm
.macro __check_entry
.ifeq _ENTRY_IN
.error "END should be used"
.abort
.endif
.endm
So the code
ENTRY(mcount)
__unset_entry
retq
__check_entry
END(mcount)
will fail like
cyrill@lenovo linux-2.6.git $ make arch/x86/kernel/entry_64.o
CHK include/linux/version.h
CHK include/linux/utsrelease.h
SYMLINK include/asm -> include/asm-x86
CALL scripts/checksyscalls.sh
AS arch/x86/kernel/entry_64.o
arch/x86/kernel/entry_64.S: Assembler messages:
arch/x86/kernel/entry_64.S:84: Error: END should be used
arch/x86/kernel/entry_64.S:84: Fatal error: .abort detected. Abandoning ship.
make[1]: *** [arch/x86/kernel/entry_64.o] Error 1
make: *** [arch/x86/kernel/entry_64.o] Error 2
cyrill@lenovo linux-2.6.git $
So if such an approach is acceptable (in general) -- I could take a
deeper look. Then every ENTRY would check whether another ENTRY/KPROBE
is active and report that.
- Cyrill -
* Cyrill Gorcunov <[email protected]> wrote:
> [Cyrill Gorcunov - Sun, Nov 23, 2008 at 04:51:34PM +0300]
> [...]
>
> Here is what I've done
>
> 1) Add some macros like:
>
> .macro __set_entry
> .set _ENTRY_IN, 1
> .endm
>
> .macro __unset_entry
> .set _ENTRY_IN, 0
> .endm
>
> .macro __check_entry
> .ifeq _ENTRY_IN
> .error "END should be used"
> .abort
> .endif
> .endm
>
> So the code
>
> ENTRY(mcount)
> __unset_entry
> retq
> __check_entry
> END(mcount)
>
> will fail like
>
> cyrill@lenovo linux-2.6.git $ make arch/x86/kernel/entry_64.o
> CHK include/linux/version.h
> CHK include/linux/utsrelease.h
> SYMLINK include/asm -> include/asm-x86
> CALL scripts/checksyscalls.sh
> AS arch/x86/kernel/entry_64.o
> arch/x86/kernel/entry_64.S: Assembler messages:
> arch/x86/kernel/entry_64.S:84: Error: END should be used
> arch/x86/kernel/entry_64.S:84: Fatal error: .abort detected. Abandoning ship.
> make[1]: *** [arch/x86/kernel/entry_64.o] Error 1
> make: *** [arch/x86/kernel/entry_64.o] Error 2
> cyrill@lenovo linux-2.6.git $
>
> So if such an approach is acceptable (in general) -- I could take a
> deeper look. Then every ENTRY would check whether another ENTRY/KPROBE
> is active and report that.
looks good!
Can we somehow detect a missing .cfi_endproc? That's another pattern
I've seen.
Ingo
[Ingo Molnar - Sun, Nov 23, 2008 at 03:55:54PM +0100]
|
| * Cyrill Gorcunov <[email protected]> wrote:
|
| > [...]
|
| looks good!
|
| Can we somehow detect a missing .cfi_endproc? That's another pattern
| I've seen.
|
| Ingo
|
As soon as I'm finished with this pattern, we can add anything we need :)
I'll keep you posted.
- Cyrill -
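One way the same .set trick could extend to the CFI side -- a sketch,
not from the thread; it assumes CFI_STARTPROC/CFI_ENDPROC are thin
wrappers around .cfi_startproc/.cfi_endproc, and errors out on a second
start without an intervening end:

	.macro CFI_STARTPROC_CHECKED
	.ifdef _CFI_IN
	.ifne _CFI_IN
	.error "missing CFI_ENDPROC before CFI_STARTPROC"
	.abort
	.endif
	.endif
	.set _CFI_IN, 1
	.cfi_startproc
	.endm

	.macro CFI_ENDPROC_CHECKED
	.set _CFI_IN, 0
	.cfi_endproc
	.endm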
On Sun, Nov 23, 2008 at 05:12:37PM +0300, Cyrill Gorcunov wrote:
> [...]
>
> Here is what I've done
>
> 1) Add some macros like:
>
> .macro __set_entry
> .set _ENTRY_IN, 1
> .endm
>
> .macro __unset_entry
> .set _ENTRY_IN, 0
> .endm
>
> .macro __check_entry
> .ifeq _ENTRY_IN
> .error "END should be used"
> .abort
> .endif
> .endm
>
> So the code
>
> ENTRY(mcount)
> __unset_entry
> retq
> __check_entry
> END(mcount)
Looks like a good approach to me. But I assume the ENTRY cppmacro
will include magic?
Greetings,
Alexander
[Alexander van Heukelum - Sun, Nov 23, 2008 at 04:04:18PM +0100]
| On Sun, Nov 23, 2008 at 05:12:37PM +0300, Cyrill Gorcunov wrote:
| > [...]
|
| Looks like a good approach to me. But I assume the ENTRY cppmacro
| will include magic?
|
| Greetings,
| Alexander
|
yes, but now I have doubts, since we have this definition in the common
linkage.h. I don't know if such an approach would be usable on other
platforms.
- Cyrill -
* Cyrill Gorcunov <[email protected]> wrote:
> [Alexander van Heukelum - Sun, Nov 23, 2008 at 04:04:18PM +0100]
> | [...]
> |
> | Looks like a good approach to me. But I assume the ENTRY cppmacro
> | will include magic?
> |
> | Greetings,
> | Alexander
> |
>
> yes, but now I have doubts, since we have this definition in the
> common linkage.h. I don't know if such an approach would be usable
> on other platforms.
I'd suggest introducing another entry macro name for that, for the
time being. If other architectures want to pick up the method, they
can generalize and test it.
( but this is assembly magic, so I'm doubtful - while the features used
are generic GAS features that should work everywhere, binutils
variants tend to be rather fragile. So let's go with some other name
like X86_ENTRY()/X86_END() or so - or maybe ENTRY_CFI()/END_CFI(). )
Ingo
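A sketch of what the x86-only variant could look like -- the names
X86_ENTRY/X86_END are Ingo's proposal above; the body combines the
plain linkage.h definitions with the .set-based check, and errors out
if a previous X86_ENTRY was never closed:

	#define X86_ENTRY(name) \
		.ifdef _ENTRY_IN; \
		.ifne _ENTRY_IN; \
		.error "previous X86_ENTRY not closed by X86_END"; \
		.abort; \
		.endif; \
		.endif; \
		.set _ENTRY_IN, 1; \
		.globl name; \
		ALIGN; \
		name:

	#define X86_END(name) \
		.set _ENTRY_IN, 0; \
		.size name, .-name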
[Alexander van Heukelum - Sun, Nov 23, 2008 at 04:04:18PM +0100]
| [...]
|
| Looks like a good approach to me. But I assume the ENTRY cppmacro
| will include magic?
|
| Greetings,
| Alexander
|
|
OK, the patch (below) found the first problem. The patch is still quite
rough and not ready for general use, but it found that we have an
unbalanced ENTRY for ENTRY(native_usergs_sysret64).
And the message is not fully correct -- it's not a mix-up between ENTRY
and KPROBE, but just an unbalanced ENTRY -- i.e., one not closed by END.
---
cyrill@lenovo linux-2.6.git $ make arch/x86/kernel/entry_64.o
CHK include/linux/version.h
CHK include/linux/utsrelease.h
SYMLINK include/asm -> include/asm-x86
CALL scripts/checksyscalls.sh
AS arch/x86/kernel/entry_64.o
arch/x86/kernel/entry_64.S: Assembler messages:
arch/x86/kernel/entry_64.S:284: Error: Do not mess regular ENTRY and KPROBE!
arch/x86/kernel/entry_64.S:284: Fatal error: .abort detected. Abandoning
ship.
make[1]: *** [arch/x86/kernel/entry_64.o] Error 1
make: *** [arch/x86/kernel/entry_64.o] Error 2
cyrill@lenovo linux-2.6.git $
---
Not sure if the generic linkage.h is a good place for such macros.
Anyway, the patch is attached below for a first glance.
- Cyrill -
---
include/linux/linkage.h | 43 +++++++++++++++++++++++++++++++++++++++----
1 file changed, 39 insertions(+), 4 deletions(-)
Index: linux-2.6.git/include/linux/linkage.h
===================================================================
--- linux-2.6.git.orig/include/linux/linkage.h
+++ linux-2.6.git/include/linux/linkage.h
@@ -51,8 +51,35 @@
#define ALIGN __ALIGN
#define ALIGN_STR __ALIGN_STR
+#define __set_entry .set _ENTRY_IN, 0
+#define __unset_entry .set _ENTRY_IN, 1
+#define __set_kprobe .set _KPROBE_IN, 0
+#define __unset_kprobe .set _KPROBE_IN, 1
+
+#define __check_entry \
+ .ifdef _ENTRY_IN; \
+ .ifeq _ENTRY_IN; \
+ .error "Do not mess regular ENTRY and KPROBE!"; \
+ .abort; \
+ .endif; \
+ .endif
+
+#define __check_kprobe \
+ .ifdef _KPROBE_IN; \
+ .ifeq _KPROBE_IN; \
+ .error "Do not mess regular ENTRY and KPROBE!"; \
+ .abort; \
+ .endif; \
+ .endif
+
+#define __check_entry_kprobe \
+ __check_entry; \
+ __check_kprobe
+
#ifndef ENTRY
#define ENTRY(name) \
+ __check_entry_kprobe; \
+ __set_entry; \
.globl name; \
ALIGN; \
name:
@@ -65,16 +92,24 @@
#endif
#define KPROBE_ENTRY(name) \
+ __check_entry_kprobe; \
+ __set_kprobe; \
.pushsection .kprobes.text, "ax"; \
- ENTRY(name)
+ .globl name; \
+ ALIGN; \
+ name:
#define KPROBE_END(name) \
- END(name); \
- .popsection
+ __unset_kprobe; \
+ __check_entry_kprobe; \
+ .size name, .-name; \
+ .popsection
#ifndef END
#define END(name) \
- .size name, .-name
+ __unset_entry; \
+ __check_entry_kprobe; \
+ .size name, .-name
#endif
/* If symbol 'name' is treated as a subroutine (gets called, and returns)
[Ingo Molnar - Sun, Nov 23, 2008 at 04:31:40PM +0100]
|
| * Cyrill Gorcunov <[email protected]> wrote:
|
| > [Alexander van Heukelum - Sun, Nov 23, 2008 at 04:04:18PM +0100]
| > | On Sun, Nov 23, 2008 at 05:12:37PM +0300, Cyrill Gorcunov wrote:
| > | > [Cyrill Gorcunov - Sun, Nov 23, 2008 at 04:51:34PM +0300]
| > | > | [Ingo Molnar - Sun, Nov 23, 2008 at 02:27:52PM +0100]
| > | > | |
| > | > | | * Alexander van Heukelum <[email protected]> wrote:
| > | > | |
| > | > | | > Impact: moves some code out of .kprobes.text
| > | > | | >
| > | > | | > KPROBE_ENTRY switches code generation to .kprobes.text, and KPROBE_END
| > | > | | > uses .popsection to get back to the previous section (.text, normally).
| > | > | | > Also replace ENDPROC by END, for consistency.
| > | > | | >
| > | > | | > Signed-off-by: Alexander van Heukelum <[email protected]>
| > | > | |
| > | > | | applied to tip/x86/irq, thanks Alexander!
| > | > | |
| > | > | | > One more small change for today. The xen-related functions
| > | > | | > xen_do_hypervisor_callback and xen_failsafe_callback are put
| > | > | | > in the .kprobes.text even in the current kernel: ignore_sysret
| > | > | | > is enclosed in KPROBE_ENTRY / ENDPROC, instead of KPROBE_ENTRY /
| > | > | | > KPROBE_END, but I guess the situation is harmless.
| > | > | |
| > | > | | yeah. It narrows no-kprobes protection for that code, but it should
| > | > | | indeed be fine (and that's the intention as well).
| > | > | |
| | Note that this is a recurring bug type, and rather long-lived. Can
| > | > | | you think of any way to get automated nesting protection of both the
| > | > | | .cfi_startproc/endproc macros and kprobes start/end? A poor man's
| > | > | | solution would be to grep the number of start and end methods and
| > | > | | enforce that they are equal.
| > | > | |
| > | > | | Ingo
| > | > | |
| > | > |
| > | > | I think we could play with the preprocessor and check if ENTRY/END match.
| > | > | Looking now.
| > | > |
| > | > | - Cyrill -
| > | >
| > | > Here is what I've done
| > | >
| > | > 1) Add some macros like:
| > | >
| > | > .macro __set_entry
| > | > .set _ENTRY_IN, 1
| > | > .endm
| > | >
| > | > .macro __unset_entry
| > | > .set _ENTRY_IN, 0
| > | > .endm
| > | >
| > | > .macro __check_entry
| > | > .ifeq _ENTRY_IN
| > | > .error "END should be used"
| > | > .abort
| > | > .endif
| > | > .endm
| > | >
| > | > So the code
| > | >
| > | > ENTRY(mcount)
| > | > __unset_entry
| > | > retq
| > | > __check_entry
| > | > END(mcount)
| > |
| > | Looks like a good approach to me. But I assume the ENTRY cpp macro
| > | will include magic?
| > |
| > | Greetings,
| > | Alexander
| > |
| >
| > yes, but now I'm in doubt: since we have this definition in the common
| > linkage.h, I don't know if such an approach would be usable on other
| > platforms.
|
| i'd suggest to introduce another entry macro name for that, for the
| time being. If other architectures want to pick up the method, they
| can generalize and test it.
|
| ( but this is assembly magic so i'm doubtful - while the features used
| are generic GAS features that should work everywhere, binutils
| variants tend to be rather fragile. So lets go with some other name
| like X86_ENTRY()/X86_END() or so - or maybe ENTRY_CFI()/END_CFI(). )
|
| Ingo
|
Indeed! I've just sent a quite raw version. Maybe we should introduce
an asmmacro.h as ia64 did?
- Cyrill -
* Cyrill Gorcunov <[email protected]> wrote:
> OK, the patch (below) found the first problem. The patch is still
> quite rough and not good for general use, but it found that we
> have an unbalanced ENTRY for ENTRY(native_usergs_sysret64).
very nice! Looks like the way to go.
Ingo
Alexander van Heukelum wrote:
>
> I put a ".p2align 5" in earlier in the series which caused the
> apicinterrupts to be 32-byte aligned. But it is a hack, really,
> relying on the generated code per stub to be between 17 and 32
> bytes, on the default alignment to be 16 bytes and all stubs
> to be in the .text section.
>
> I'm in favour of aligning all of the interrupt/exception stubs
> to 32 bytes, but it should be implemented the right way ;),
> which means that we need KPROBE_ENTRY_P5ALIGNED and so on :-/.
>
I'm sorry, I really don't follow that logic at all. Why the heck would
KPROBE_ENTRY_P5ALIGNED be better than .p2align 5?
For the record, I think we already have way too much macro crappage. It
makes the code painful to read and hard to figure out what the real
constraints on it are.
-hpa
>>> Alexander van Heukelum <[email protected]> 23.11.08 10:15 >>>
>@@ -1282,7 +1282,7 @@ gs_change:
> CFI_ADJUST_CFA_OFFSET -8
> ret
> CFI_ENDPROC
>-ENDPROC(native_load_gs_index)
>+END(native_load_gs_index)
>
> .section __ex_table,"a"
> .align 8
I disagree with this and similar changes in this patch: Why do we need to
get rid of the ENDPROC() here? It's a procedure that's being ended, and
using ENDPROC() is the only (existing) way to mark something as a
procedure in assembly code.
And btw, while described as such in the patch comment, this change has nothing
to do with the subject of the patch.
Jan
On Sun, Nov 23, 2008 at 12:13:18PM -0800, H. Peter Anvin wrote:
> Alexander van Heukelum wrote:
> >
> > I put a ".p2align 5" in earlier in the series which caused the
> > apicinterrupts to be 32-byte aligned. But it is a hack, really,
> > relying on the generated code per stub to be between 17 and 32
> > bytes, on the default alignment to be 16 bytes and all stubs
> > to be in the .text section.
> >
> > I'm in favour of aligning all of the interrupt/exception stubs
> > to 32 bytes, but it should be implemented the right way ;),
> > which means that we need KPROBE_ENTRY_P5ALIGNED and so on :-/.
> >
>
> I'm sorry, I really don't follow that logic at all. Why the heck would
> KPROBE_ENTRY_P5ALIGNED be better than .p2align 5?
Hello hpa,
KPROBE_ENTRY_P5ALIGNED isn't better, that's the point.
It's needed, because we have this problem:
.p2align 5
KPROBE_ENTRY(something)
somecode
KPROBE_END(something)
This does not align "something", because something is not in the
same section as the ".p2align 5".
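With the section change done explicitly, the alignment behaves as
expected; roughly:

	.pushsection .kprobes.text, "ax"
	.p2align 5
	ENTRY(something)
	somecode
	END(something)
	.popsection

Here the ".p2align 5" and "something" end up in the same section, so
the label really is 32-byte aligned.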
> For the record, I think we already have way too much macro crappage. It
> makes the code painful to read and hard to figure out what the real
> constraints on it are.
I agree with that to some extent. But even without the macros, the
code is hard to read. As an alternative to more macros, what do you
think of Cyrill's suggestion of putting CFI annotations next to the
instructions, like:
pushq %rax ; CFI_ADJUST_CFA_OFFSET 8
movq %rax, RAX+8(%rsp) ; CFI_REL_OFFSET rax, RAX+8
instead? That still leaves the problem that the instruction and the
annotation might not match, but it keeps the code more readable for
someone who is used to reading x86 assembly.
Greetings,
Alexander
> -hpa
On Mon, Nov 24, 2008 at 09:17:29AM +0000, Jan Beulich wrote:
> >>> Alexander van Heukelum <[email protected]> 23.11.08 10:15 >>>
> >@@ -1282,7 +1282,7 @@ gs_change:
> > CFI_ADJUST_CFA_OFFSET -8
> > ret
> > CFI_ENDPROC
> >-ENDPROC(native_load_gs_index)
> >+END(native_load_gs_index)
> >
> > .section __ex_table,"a"
> > .align 8
>
> I disagree with this and similar changes in this patch: Why do we need to
> get rid of the ENDPROC() here? It's a procedure that's being ended, and
> using ENDPROC() is the only (existing) way to mark something as a
> procedure in assembly code.
Hallo Jan Beulich,
You are right. ENDPROC(name) adds ".type name, @function;" as compared
to END(name). So I agree that using ENDPROC is in fact better.
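For reference, the two differ only in that .type directive; roughly
(cf. include/linux/linkage.h):

	#define END(name) \
		.size name, .-name

	#define ENDPROC(name) \
		.type name, @function; \
		END(name)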
> And btw, while described as such in the patch comment, this change has nothing
> to do with the subject of the patch.
Right. I thought of END and ENDPROC as equivalent, so I added the change
to this patch as a small cleanup only. But if we want this .type
annotation, what about KPROBE_END? Should it include one there too?
I'm getting the feeling we would be better off removing KPROBE_ENTRY and
KPROBE_END in favour of explicitly changing sections in the .S files,
and using the ENDPROC annotation for all procedures?
Greetings,
Alexander
> Jan
>Right. I thought of END and ENDPROC as equivalent, so I added the change
>to this patch as a small cleanup only. But if we want this .type
>annotation, what about KPROBE_END? Should it include one there too?
Yes, it always bothered me that there's no KPROBE_ENDPROC() (or,
alternatively, that the macro doesn't add the annotation by default,
given that its content being code is implied).
>I'm getting the feeling we would be better off removing KPROBE_ENTRY and
>KPROBE_END in favour of explicitly changing sections in the .S files,
>and using the ENDPROC annotation for all procedures?
It got explicitly added a while back, so there must have been a reason to
*not* do the section adjustments explicitly. And given the current discussion
I'd also assume that more hiding of code in macros is the preferred route.
Jan
entry_64.S is the only user of KPROBE_ENTRY / KPROBE_END on
x86_64. This patch reorders entry_64.S and explicitly generates
a separate section for functions that need the protection. The
generated code before and after the patch is equal.
Implicitly changing sections in assembly files makes it more
difficult to follow why the assembler is doing certain things.
For example,
.p2align 5
KPROBE_ENTRY(...)
was not doing what you would expect. Other section changes
(__ex_table, .fixup, .init.rodata) are done explicitly already.
Signed-off-by: Alexander van Heukelum <[email protected]>
Cc: Jan Beulich <[email protected]>
---
arch/x86/kernel/entry_64.S | 444 ++++++++++++++++++++++----------------------
1 files changed, 220 insertions(+), 224 deletions(-)
On Mon, Nov 24, 2008 at 10:35:53AM +0000, Jan Beulich wrote:
> [...]
>
> >I'm getting the feeling we would be better off removing KPROBE_ENTRY and
> >KPROBE_END in favour of explicitly changing sections in the .S files,
> >and using the ENDPROC annotation for all procedures?
>
> It got explicitly added a while back, so there must have been a reason to
> *not* do the section adjustments explicitly. And given the current discussion
> I'd also assume that more hiding of code in macros is the preferred route.
>
> Jan
Hallo Jan Beulich,
Let's see if this catches on. On i386 there is also only one user
of KPROBE_ENTRY / KPROBE_END: entry_32.S. If we remove those
entries in a similar way, we can get rid of them tree-wide.
Greetings,
Alexander
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f2d546e..38fcd05 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1002,7 +1002,7 @@ END(\sym)
.endm
.macro paranoidzeroentry sym do_sym
-KPROBE_ENTRY(\sym)
+ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -1015,11 +1015,11 @@ KPROBE_ENTRY(\sym)
call \do_sym
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
-KPROBE_END(\sym)
+END(\sym)
.endm
.macro paranoidzeroentry_ist sym do_sym ist
-KPROBE_ENTRY(\sym)
+ENTRY(\sym)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -1035,15 +1035,11 @@ KPROBE_ENTRY(\sym)
addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
-KPROBE_END(\sym)
+END(\sym)
.endm
-.macro errorentry sym do_sym entry=0
-.if \entry
-KPROBE_ENTRY(\sym)
-.else
+.macro errorentry sym do_sym
ENTRY(\sym)
-.endif
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
subq $15*8,%rsp
@@ -1056,20 +1052,12 @@ ENTRY(\sym)
call \do_sym
jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
-.if \entry
-KPROBE_END(\sym)
-.else
END(\sym)
-.endif
.endm
/* error code is on the stack already */
-.macro paranoiderrorentry sym do_sym entry=1
-.if \entry
-KPROBE_ENTRY(\sym)
-.else
+.macro paranoiderrorentry sym do_sym
ENTRY(\sym)
-.endif
XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
subq $15*8,%rsp
@@ -1083,166 +1071,23 @@ ENTRY(\sym)
call \do_sym
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
-.if \entry
-KPROBE_END(\sym)
-.else
END(\sym)
-.endif
.endm
zeroentry divide_error do_divide_error
-paranoidzeroentry_ist debug do_debug DEBUG_STACK
-paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
zeroentry overflow do_overflow
zeroentry bounds do_bounds
zeroentry invalid_op do_invalid_op
zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault 0
+paranoiderrorentry double_fault do_double_fault
zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
errorentry invalid_TSS do_invalid_TSS
errorentry segment_not_present do_segment_not_present
-paranoiderrorentry stack_segment do_stack_segment
-errorentry general_protection do_general_protection 1
-errorentry page_fault do_page_fault 1
zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
zeroentry coprocessor_error do_coprocessor_error
errorentry alignment_check do_alignment_check
-#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check do_machine_check
-#endif
zeroentry simd_coprocessor_error do_simd_coprocessor_error
- /*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
- *
- * "trace" is 0 for the NMI handler only, because irq-tracing
- * is fundamentally NMI-unsafe. (we cannot change the soft and
- * hard flags at once, atomically)
- */
-
- /* ebx: no swapgs flag */
-KPROBE_ENTRY(paranoid_exit)
- INTR_FRAME
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
- TRACE_IRQS_IRETQ 0
- SWAPGS_UNSAFE_STACK
-paranoid_restore:
- RESTORE_ALL 8
- jmp irq_return
-paranoid_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule
- movl %ebx,%edx /* arg3: thread flags */
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
-paranoid_schedule:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
- CFI_ENDPROC
-KPROBE_END(paranoid_exit)
-
-/*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * returns in "no swapgs flag" in %ebx.
- */
-KPROBE_ENTRY(error_entry)
- XCPT_FRAME
- CFI_ADJUST_CFA_OFFSET 15*8
- /* oldrax contains error code */
- cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
- movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
- xorl %ebx,%ebx
- testl $3,CS+8(%rsp)
- je error_kernelspace
-error_swapgs:
- SWAPGS
-error_sti:
- TRACE_IRQS_OFF
- ret
- CFI_ENDPROC
-
-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. The exception handlers after iret run with
- * kernel gs again, so don't set the user space flag. B stepping K8s
- * sometimes report an truncated RIP for IRET exceptions returning to
- * compat mode. Check for these here too.
- */
-error_kernelspace:
- incl %ebx
- leaq irq_return(%rip),%rcx
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
- movl %ecx,%ecx /* zero extend */
- cmpq %rcx,RIP+8(%rsp)
- je error_swapgs
- cmpq $gs_change,RIP+8(%rsp)
- je error_swapgs
- jmp error_sti
-KPROBE_END(error_entry)
-
-
-/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
-KPROBE_ENTRY(error_exit)
- DEFAULT_FRAME
- movl %ebx,%eax
- RESTORE_REST
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- testl %eax,%eax
- jne retint_kernel
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
- jmp retint_swapgs
- CFI_ENDPROC
-KPROBE_END(error_exit)
-
/* Reload gs selector with exception handling */
/* edi: new selector */
ENTRY(native_load_gs_index)
@@ -1362,61 +1207,6 @@ ENTRY(kernel_execve)
CFI_ENDPROC
END(kernel_execve)
- /* runs on exception stack */
-KPROBE_ENTRY(nmi)
- INTR_FRAME
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1
- subq $15*8, %rsp
- CFI_ADJUST_CFA_OFFSET 15*8
- call save_paranoid
- DEFAULT_FRAME 0
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
- movq %rsp,%rdi
- movq $-1,%rsi
- call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* paranoidexit; without TRACE_IRQS_OFF */
- /* ebx: no swapgs flag */
- DISABLE_INTERRUPTS(CLBR_NONE)
- testl %ebx,%ebx /* swapgs needed? */
- jnz nmi_restore
- testl $3,CS(%rsp)
- jnz nmi_userspace
-nmi_swapgs:
- SWAPGS_UNSAFE_STACK
-nmi_restore:
- RESTORE_ALL 8
- jmp irq_return
-nmi_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz nmi_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz nmi_schedule
- movl %ebx,%edx /* arg3: thread flags */
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- jmp nmi_userspace
-nmi_schedule:
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- jmp nmi_userspace
- CFI_ENDPROC
-#else
- jmp paranoid_exit
- CFI_ENDPROC
-#endif
-KPROBE_END(nmi)
-
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
@@ -1437,13 +1227,6 @@ ENTRY(call_softirq)
CFI_ENDPROC
END(call_softirq)
-KPROBE_ENTRY(ignore_sysret)
- CFI_STARTPROC
- mov $-ENOSYS,%eax
- sysret
- CFI_ENDPROC
-KPROBE_END(ignore_sysret)
-
#ifdef CONFIG_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
@@ -1540,3 +1323,216 @@ ENTRY(xen_failsafe_callback)
END(xen_failsafe_callback)
#endif /* CONFIG_XEN */
+
+/*
+ * Some functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+
+paranoidzeroentry_ist debug do_debug DEBUG_STACK
+paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
+paranoiderrorentry stack_segment do_stack_segment
+errorentry general_protection do_general_protection
+errorentry page_fault do_page_fault
+#ifdef CONFIG_X86_MCE
+paranoidzeroentry machine_check do_machine_check
+#endif
+
+ /*
+ * "Paranoid" exit path from exception stack.
+ * Paranoid because this is used by NMIs and cannot take
+ * any kernel state for granted.
+ * We don't do kernel preemption checks here, because only
+ * NMI should be common and it does not enable IRQs and
+ * cannot get reschedule ticks.
+ *
+ * "trace" is 0 for the NMI handler only, because irq-tracing
+ * is fundamentally NMI-unsafe. (we cannot change the soft and
+ * hard flags at once, atomically)
+ */
+
+ /* ebx: no swapgs flag */
+ENTRY(paranoid_exit)
+ INTR_FRAME
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_restore
+ testl $3,CS(%rsp)
+ jnz paranoid_userspace
+paranoid_swapgs:
+ TRACE_IRQS_IRETQ 0
+ SWAPGS_UNSAFE_STACK
+paranoid_restore:
+ RESTORE_ALL 8
+ jmp irq_return
+paranoid_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+ jz paranoid_swapgs
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz paranoid_schedule
+ movl %ebx,%edx /* arg3: thread flags */
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace
+paranoid_schedule:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ jmp paranoid_userspace
+ CFI_ENDPROC
+END(paranoid_exit)
+
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack.
+ * returns in "no swapgs flag" in %ebx.
+ */
+ENTRY(error_entry)
+ XCPT_FRAME
+ CFI_ADJUST_CFA_OFFSET 15*8
+ /* oldrax contains error code */
+ cld
+ movq_cfi rdi, RDI+8
+ movq_cfi rsi, RSI+8
+ movq_cfi rdx, RDX+8
+ movq_cfi rcx, RCX+8
+ movq_cfi rax, RAX+8
+ movq_cfi r8, R8+8
+ movq_cfi r9, R9+8
+ movq_cfi r10, R10+8
+ movq_cfi r11, R11+8
+ movq_cfi rbx, RBX+8
+ movq_cfi rbp, RBP+8
+ movq_cfi r12, R12+8
+ movq_cfi r13, R13+8
+ movq_cfi r14, R14+8
+ movq_cfi r15, R15+8
+ xorl %ebx,%ebx
+ testl $3,CS+8(%rsp)
+ je error_kernelspace
+error_swapgs:
+ SWAPGS
+error_sti:
+ TRACE_IRQS_OFF
+ ret
+ CFI_ENDPROC
+
+/*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. The exception handlers after iret run with
+ * kernel gs again, so don't set the user space flag. B stepping K8s
+ * sometimes report an truncated RIP for IRET exceptions returning to
+ * compat mode. Check for these here too.
+ */
+error_kernelspace:
+ incl %ebx
+ leaq irq_return(%rip),%rcx
+ cmpq %rcx,RIP+8(%rsp)
+ je error_swapgs
+ movl %ecx,%ecx /* zero extend */
+ cmpq %rcx,RIP+8(%rsp)
+ je error_swapgs
+ cmpq $gs_change,RIP+8(%rsp)
+ je error_swapgs
+ jmp error_sti
+END(error_entry)
+
+
+/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
+ENTRY(error_exit)
+ DEFAULT_FRAME
+ movl %ebx,%eax
+ RESTORE_REST
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ GET_THREAD_INFO(%rcx)
+ testl %eax,%eax
+ jne retint_kernel
+ LOCKDEP_SYS_EXIT_IRQ
+ movl TI_flags(%rcx),%edx
+ movl $_TIF_WORK_MASK,%edi
+ andl %edi,%edx
+ jnz retint_careful
+ jmp retint_swapgs
+ CFI_ENDPROC
+END(error_exit)
+
+
+ /* runs on exception stack */
+ENTRY(nmi)
+ INTR_FRAME
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ pushq_cfi $-1
+ subq $15*8, %rsp
+ CFI_ADJUST_CFA_OFFSET 15*8
+ call save_paranoid
+ DEFAULT_FRAME 0
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ movq %rsp,%rdi
+ movq $-1,%rsi
+ call do_nmi
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /* paranoidexit; without TRACE_IRQS_OFF */
+ /* ebx: no swapgs flag */
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz nmi_restore
+ testl $3,CS(%rsp)
+ jnz nmi_userspace
+nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+nmi_restore:
+ RESTORE_ALL 8
+ jmp irq_return
+nmi_userspace:
+ GET_THREAD_INFO(%rcx)
+ movl TI_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+ jz nmi_swapgs
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz nmi_schedule
+ movl %ebx,%edx /* arg3: thread flags */
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ jmp nmi_userspace
+nmi_schedule:
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ call schedule
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ jmp nmi_userspace
+ CFI_ENDPROC
+#else
+ jmp paranoid_exit
+ CFI_ENDPROC
+#endif
+END(nmi)
+
+ENTRY(ignore_sysret)
+ CFI_STARTPROC
+ mov $-ENOSYS,%eax
+ sysret
+ CFI_ENDPROC
+END(ignore_sysret)
+
+/*
+ * End of kprobes section
+ */
+ .popsection
--
1.5.4.3
I like things being done this way (including the goal of doing away with
KPROBE_{ENTRY,END} altogether). Thanks a lot! Jan
entry_32.S is now the only user of KPROBE_ENTRY / KPROBE_END,
treewide. This patch reorders entry_32.S and explicitly generates
a separate section for functions that need the protection. The
generated code before and after the patch is equal.
The KPROBE_ENTRY and KPROBE_END macros are removed too.
Signed-off-by: Alexander van Heukelum <[email protected]>
---
arch/x86/kernel/entry_32.S | 438 ++++++++++++++++++++++---------------------
include/linux/linkage.h | 8 -
2 files changed, 224 insertions(+), 222 deletions(-)
On Mon, Nov 24, 2008 at 01:33:49PM +0000, Jan Beulich wrote:
> I like things being done this way (including the goal of doing away with
> KPROBE_{ENTRY,END} altogether). Thanks a lot! Jan
Thanks for the comment. Here is the i386 side of the patch, including
the removal of the KPROBE_ENTRY and KPROBE_END macros.
Greetings,
Alexander
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index bd02ec7..6e96028 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -688,65 +688,6 @@ ENDPROC(name)
/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
-KPROBE_ENTRY(page_fault)
- RING0_EC_FRAME
- pushl $do_page_fault
- CFI_ADJUST_CFA_OFFSET 4
- ALIGN
-error_code:
- /* the function address is in %fs's slot on the stack */
- pushl %es
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET es, 0*/
- pushl %ds
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET ds, 0*/
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET eax, 0
- pushl %ebp
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebp, 0
- pushl %edi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edi, 0
- pushl %esi
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET esi, 0
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx, 0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- pushl %ebx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ebx, 0
- cld
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_REGISTER es, ecx*/
- movl PT_FS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_FS(%esp)
- /*CFI_REL_OFFSET fs, ES*/
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
- movl %esp,%eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(page_fault)
-
ENTRY(coprocessor_error)
RING0_INT_FRAME
pushl $0
@@ -777,140 +718,6 @@ ENTRY(device_not_available)
CFI_ENDPROC
END(device_not_available)
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-#define FIX_STACK(offset, ok, label) \
- cmpw $__KERNEL_CS,4(%esp); \
- jne ok; \
-label: \
- movl TSS_sysenter_sp0+offset(%esp),%esp; \
- CFI_DEF_CFA esp, 0; \
- CFI_UNDEFINED eip; \
- pushfl; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $__KERNEL_CS; \
- CFI_ADJUST_CFA_OFFSET 4; \
- pushl $sysenter_past_esp; \
- CFI_ADJUST_CFA_OFFSET 4; \
- CFI_REL_OFFSET eip, 0
-
-KPROBE_ENTRY(debug)
- RING0_INT_FRAME
- cmpl $ia32_sysenter_target,(%esp)
- jne debug_stack_correct
- FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
-debug_stack_correct:
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # error code 0
- movl %esp,%eax # pt_regs pointer
- call do_debug
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-KPROBE_ENTRY(nmi)
- RING0_INT_FRAME
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- je nmi_espfix_stack
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl %esp,%eax
- /* Do not access memory above the end of our stack page,
- * it might not exist.
- */
- andl $(THREAD_SIZE-1),%eax
- cmpl $(THREAD_SIZE-20),%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- jae nmi_stack_correct
- cmpl $ia32_sysenter_target,12(%esp)
- je nmi_debug_stack_check
-nmi_stack_correct:
- /* We have a RING0_INT_FRAME here */
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- jmp restore_nocheck_notrace
- CFI_ENDPROC
-
-nmi_stack_fixup:
- RING0_INT_FRAME
- FIX_STACK(12,nmi_stack_correct, 1)
- jmp nmi_stack_correct
-
-nmi_debug_stack_check:
- /* We have a RING0_INT_FRAME here */
- cmpw $__KERNEL_CS,16(%esp)
- jne nmi_stack_correct
- cmpl $debug,(%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn,(%esp)
- ja nmi_stack_correct
- FIX_STACK(24,nmi_stack_correct, 1)
- jmp nmi_stack_correct
-
-nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
- * create the pointer to lss back
- */
- pushl %ss
- CFI_ADJUST_CFA_OFFSET 4
- pushl %esp
- CFI_ADJUST_CFA_OFFSET 4
- addw $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- CFI_ADJUST_CFA_OFFSET 4
- .endr
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
- CFI_ENDPROC
-KPROBE_END(nmi)
-
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iret
@@ -926,19 +733,6 @@ ENTRY(native_irq_enable_sysexit)
END(native_irq_enable_sysexit)
#endif
-KPROBE_ENTRY(int3)
- RING0_INT_FRAME
- pushl $-1 # mark this as an int
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
- CFI_ENDPROC
-KPROBE_END(int3)
-
ENTRY(overflow)
RING0_INT_FRAME
pushl $0
@@ -1003,14 +797,6 @@ ENTRY(stack_segment)
CFI_ENDPROC
END(stack_segment)
-KPROBE_ENTRY(general_protection)
- RING0_EC_FRAME
- pushl $do_general_protection
- CFI_ADJUST_CFA_OFFSET 4
- jmp error_code
- CFI_ENDPROC
-KPROBE_END(general_protection)
-
ENTRY(alignment_check)
RING0_EC_FRAME
pushl $do_alignment_check
@@ -1220,3 +1006,227 @@ END(mcount)
#include "syscall_table_32.S"
syscall_table_size=(.-sys_call_table)
+
+/*
+ * Some functions should be protected against kprobes
+ */
+ .pushsection .kprobes.text, "ax"
+
+ENTRY(page_fault)
+ RING0_EC_FRAME
+ pushl $do_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
+ ALIGN
+error_code:
+ /* the function address is in %fs's slot on the stack */
+ pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0*/
+ pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0*/
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ cld
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
+ movl $(__KERNEL_PERCPU), %ecx
+ movl %ecx, %fs
+ UNWIND_ESPFIX_STACK
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_REGISTER es, ecx*/
+ movl PT_FS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ mov %ecx, PT_FS(%esp)
+ /*CFI_REL_OFFSET fs, ES*/
+ movl $(__USER_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+ TRACE_IRQS_OFF
+ movl %esp,%eax # pt_regs pointer
+ call *%edi
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(page_fault)
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+#define FIX_STACK(offset, ok, label) \
+ cmpw $__KERNEL_CS,4(%esp); \
+ jne ok; \
+label: \
+ movl TSS_sysenter_sp0+offset(%esp),%esp; \
+ CFI_DEF_CFA esp, 0; \
+ CFI_UNDEFINED eip; \
+ pushfl; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $__KERNEL_CS; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $sysenter_past_esp; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ CFI_REL_OFFSET eip, 0
+
+ENTRY(debug)
+ RING0_INT_FRAME
+ cmpl $ia32_sysenter_target,(%esp)
+ jne debug_stack_correct
+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+debug_stack_correct:
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # error code 0
+ movl %esp,%eax # pt_regs pointer
+ call do_debug
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+ RING0_INT_FRAME
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ je nmi_espfix_stack
+ cmpl $ia32_sysenter_target,(%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl %esp,%eax
+ /* Do not access memory above the end of our stack page,
+ * it might not exist.
+ */
+ andl $(THREAD_SIZE-1),%eax
+ cmpl $(THREAD_SIZE-20),%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ jae nmi_stack_correct
+ cmpl $ia32_sysenter_target,12(%esp)
+ je nmi_debug_stack_check
+nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_nmi
+ jmp restore_nocheck_notrace
+ CFI_ENDPROC
+
+nmi_stack_fixup:
+ RING0_INT_FRAME
+ FIX_STACK(12,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+ /* We have a RING0_INT_FRAME here */
+ cmpw $__KERNEL_CS,16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug,(%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn,(%esp)
+ ja nmi_stack_correct
+ FIX_STACK(24,nmi_stack_correct, 1)
+ jmp nmi_stack_correct
+
+nmi_espfix_stack:
+ /* We have a RING0_INT_FRAME here.
+ *
+ * create the pointer to lss back
+ */
+ pushl %ss
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %esp
+ CFI_ADJUST_CFA_OFFSET 4
+ addw $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ .endr
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx,%edx # zero error code
+ call do_nmi
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ CFI_ADJUST_CFA_OFFSET -24
+ jmp irq_return
+ CFI_ENDPROC
+END(nmi)
+
+ENTRY(int3)
+ RING0_INT_FRAME
+ pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx,%edx # zero error code
+ movl %esp,%eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+ CFI_ENDPROC
+END(int3)
+
+ENTRY(general_protection)
+ RING0_EC_FRAME
+ pushl $do_general_protection
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(general_protection)
+
+/*
+ * End of kprobes section
+ */
+ .popsection
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 9fd1f85..fee9e59 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -64,14 +64,6 @@
name:
#endif
-#define KPROBE_ENTRY(name) \
- .pushsection .kprobes.text, "ax"; \
- ENTRY(name)
-
-#define KPROBE_END(name) \
- END(name); \
- .popsection
-
#ifndef END
#define END(name) \
.size name, .-name
--
1.5.4.3
Alexander van Heukelum wrote:
>
> Hello hpa,
>
> KPROBE_ENTRY_P5ALIGNED isn't better, that's the point.
> It's needed, because we have this problem:
>
> .p2align 5
> KPROBE_ENTRY(something)
> somecode
> KPROBE_END(something)
>
> This does not align "something", because something is not in the
> same section as the ".p2align 5".
>
I think that is braindamaged. If KPROBE_ENTRY() changes the section
behind the programmer's back, that's completely and totally stupid in an
utterly reckless sort of way. The author of a macro like that should be
taken out and shot, because it makes the programmer think the code does
something completely different than what the code actually does.
This isn't merely ugly, it is downright dangerous.
>> For the record, I think we already have way too much macro crappage. It
>> makes the code painful to read and hard to figure out what the real
>> constraints on it are.
>
> I agree with that to some extent. But even without the macros, the
> code is hard to read. As an alternative to more macros, what do you
> think of Cyrill's suggestion of putting CFI annotations next to the
> instructions, like:
>
> pushq %rax ; CFI_ADJUST_CFA_OFFSET 8
> movq %rax, RAX+8(%rsp) ; CFI_REL_OFFSET rax, RAX+8
>
> instead? That still leaves the problem that the instruction and the
> annotation might not match, but it keeps the code more readable for
> someone who is used to reading x86 assembly.
I think using the macros for opcodes makes sense, as the CFI annotation is
tied to the operation. That is a good way to use macros.
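For example, the movq_cfi helper used in the 64-bit entry code could be
defined roughly as:

	.macro movq_cfi reg offset=0
	movq %\reg, \offset(%rsp)
	CFI_REL_OFFSET \reg, \offset
	.endm

That way the save and its annotation can never drift apart.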
-hpa