2015-04-03 13:06:03

by Denys Vlasenko

Subject: [PATCH] x86/asm/entry/64: change interrupt entry macro to function

The macro is largish, and we have 24 instances of it.

By moving almost its entire body into a function,
1500..3000 bytes are saved, depending on .config
(3152 bytes with the one below):

   text    data     bss     dec     hex  filename
  12546       0       0   12546    3102  entry_64.o.before
   9394       0       0    9394    24b2  entry_64.o
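
In sketch form, the change is the following (simplified; the real
code below also carries the CFI unwind annotations):

    /* Before: each of the 24 entry points expands the body inline */
    .macro interrupt func
            /* save C-clobbered regs, SWAPGS check, irq-stack switch */
            call \func
    .endm

    /* After: the shared body is emitted once as a callable function */
    .macro save_regs_and_call_intr_handler func
            ALLOC_PT_GPREGS_ON_STACK -RBP   /* must precede the call */
            call interrupt_entry    /* shared saves and stack switch */
            TRACE_IRQS_OFF
            call \func              /* %rdi -> pt_regs */
    .endm

With this config that is roughly 3152/24 ~= 130 bytes saved per
call site.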

Run-tested.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
CC: [email protected]
---
arch/x86/kernel/entry_64.S | 130 +++++++++++++++++++++++++--------------------
1 file changed, 71 insertions(+), 59 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b00ca22..1d33816 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -606,6 +606,71 @@ ENTRY(ret_from_fork)
CFI_ENDPROC
END(ret_from_fork)

+
+/*
+ * Common code for all interrupt entry points:
+ * save C-clobbered registers, maybe swapgs, maybe switch stacks.
+ * On return, %rdi points to pt_regs and the old %rsp is on the stack.
+ */
+ENTRY(interrupt_entry)
+ /*
+ * Since nothing in interrupt handling code touches r12...r15 members
+ * of "struct pt_regs", and since interrupts can nest, we can save
+ * four stack slots and simultaneously provide
+ * an unwind-friendly stack layout by saving "truncated" pt_regs
+ * exactly up to rbp slot, without these members.
+ * rbx slot is not populated. rbp slot is, but not used for restore.
+ */
+ XCPT_FRAME 1 15*8-RBP
+ ASM_CLAC
+ cld
+ /* ALLOC_PT_GPREGS_ON_STACK -RBP -- must be done by caller */
+ SAVE_C_REGS -RBP+8
+ /* Store %rbp to 8(%rsp) for unwinder, not for saving it per se */
+ SAVE_EXTRA_REGS_RBP -RBP+8
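+
+ /*
+ * Rough sketch of the stack layout at this point (higher
+ * addresses first; offsets per struct pt_regs):
+ *
+ * ss, rsp, eflags, cs, rip -- hardware-pushed frame
+ * orig_ax -- ~vector, pushed by the stub
+ * rdi, rsi, rdx, rcx, rax,
+ * r8, r9, r10, r11 -- saved by SAVE_C_REGS
+ * rbx (unpopulated hole), rbp -- rbp stored for the unwinder
+ * return address -- pushed by "call interrupt_entry"
+ *
+ * The "+8" in the offsets above accounts for that return address.
+ */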
+
+ /* Caller expects us to return pointer to pt_regs in %rdi */
+ leaq -RBP+8(%rsp), %rdi
+
+ testb $3, CS-RBP+8(%rsp)
+ jz 1f
+ SWAPGS
+1:
+ /*
+ * Optionally switch to interrupt stack. Push previous stack pointer.
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count,
+ * it is a little cheaper to use a separate per-CPU counter (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
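+ /*
+ * We were reached via CALL, so the return address sits on top of
+ * the stack: pop it, maybe switch %rsp, then re-push the old
+ * stack pointer and the return address so that the RET below
+ * resumes our caller on the (possibly new) stack. irq_count
+ * starts at -1, so INCL sets ZF only for the outermost interrupt,
+ * which is exactly when CMOVZQ loads the interrupt stack pointer.
+ */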
+ popq %rax /* get return address */
+ movq %rsp, %rsi
+ incl PER_CPU_VAR(irq_count)
+ cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
+ CFI_DEF_CFA_REGISTER rsi
+
+ pushq %rsi
+ pushq %rax
+ ret
+ CFI_ENDPROC
+END(interrupt_entry)
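+
+/*
+ * Per-entry-point wrapper: allocate the truncated pt_regs, let
+ * interrupt_entry do the shared work, then call the C handler
+ * with %rdi pointing to pt_regs, e.g.:
+ * save_regs_and_call_intr_handler do_IRQ
+ */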
+.macro save_regs_and_call_intr_handler func
+ ALLOC_PT_GPREGS_ON_STACK -RBP
+ call interrupt_entry
+ /*
+ * For debugger:
+ * "CFA (Current Frame Address) is the value on stack + offset"
+ */
+ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
+ 0x77 /* DW_OP_breg7 (rsp) */, 0, \
+ 0x06 /* DW_OP_deref */, \
+ 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
+ 0x22 /* DW_OP_plus */
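+ /*
+ * Decoded: DW_OP_breg7 0 pushes the current %rsp; DW_OP_deref
+ * replaces it with the old stack pointer saved at 0(%rsp);
+ * adding SIZEOF_PTREGS-RBP (the size of the truncated frame)
+ * yields the CFA.
+ */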
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+ call \func /* expects %rdi -> pt_regs */
+.endm
+
/*
* Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a
@@ -685,69 +750,17 @@ END(interrupt)
/*
* Interrupt entry/exit.
*
- * Interrupt entry points save only callee clobbered registers in fast path.
- *
* Entry runs with interrupts off.
+ *
+ * The interrupt stubs push (~vector+0x80) onto the stack and
+ * then jump to common_interrupt.
+ * Interrupt entry points save only callee clobbered registers in fast path.
*/
-
-/* 0(%rsp): ~(interrupt number) */
- .macro interrupt func
- cld
- /*
- * Since nothing in interrupt handling code touches r12...r15 members
- * of "struct pt_regs", and since interrupts can nest, we can save
- * four stack slots and simultaneously provide
- * an unwind-friendly stack layout by saving "truncated" pt_regs
- * exactly up to rbp slot, without these members.
- */
- ALLOC_PT_GPREGS_ON_STACK -RBP
- SAVE_C_REGS -RBP
- /* this goes to 0(%rsp) for unwinder, not for saving the value: */
- SAVE_EXTRA_REGS_RBP -RBP
-
- leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
-
- testl $3, CS-RBP(%rsp)
- je 1f
- SWAPGS
-1:
- /*
- * Save previous stack pointer, optionally switch to interrupt stack.
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
- movq %rsp, %rsi
- incl PER_CPU_VAR(irq_count)
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- CFI_DEF_CFA_REGISTER rsi
- pushq %rsi
- /*
- * For debugger:
- * "CFA (Current Frame Address) is the value on stack + offset"
- */
- CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 (rsp) */, 0, \
- 0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
- 0x22 /* DW_OP_plus */
- /* We entered an interrupt context - irqs are off: */
- TRACE_IRQS_OFF
-
- call \func
- .endm
-
- /*
- * The interrupt stubs push (~vector+0x80) onto the stack and
- * then jump to common_interrupt.
- */
.align ALIGN_common_interrupt
common_interrupt:
XCPT_FRAME
- ASM_CLAC
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
- interrupt do_IRQ
+ save_regs_and_call_intr_handler do_IRQ
/* 0(%rsp): old RSP */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -902,10 +915,9 @@ END(common_interrupt)
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
INTR_FRAME
- ASM_CLAC
pushq_cfi $~(\num)
.Lcommon_\sym:
- interrupt \do_sym
+ save_regs_and_call_intr_handler \do_sym
jmp ret_from_intr
CFI_ENDPROC
END(\sym)
--
1.8.1.4


2015-04-06 07:20:40

by Ingo Molnar

Subject: Re: [PATCH] x86/asm/entry/64: change interrupt entry macro to function


* Denys Vlasenko <[email protected]> wrote:

> The macro is largish, and we have 24 instances of it.
>
> By moving almost its entire body into a function,
> 1500..3000 bytes are saved, depending on .config
> (3152 bytes with the one below):
>
>    text    data     bss     dec     hex  filename
>   12546       0       0   12546    3102  entry_64.o.before
>    9394       0       0    9394    24b2  entry_64.o
>
> Run-tested.

Please list costs and benefits in the changelog. (i.e. increased cache
locality vs. extra instruction[s] executed.)

Thanks,

Ingo