2020-05-21 20:35:20

by Thomas Gleixner

[permalink] [raw]
Subject: [patch V9 21/39] x86/irq: Convey vector as argument and not in ptregs

From: Thomas Gleixner <[email protected]>

Device interrupts which go through do_IRQ() or the spurious interrupt
handler have their separate entry code on 64 bit for no good reason.

Both 32 and 64 bit transport the vector number through ORIG_[RE]AX in
pt_regs. Further the vector number is forced to fit into an u8 and is
complemented and offset by 0x80 so it's in the signed character
range. Otherwise GAS would expand the pushq to a 5 byte instruction for any
vector > 0x7F.

Treat the vector number like an error code and hand it to the C function as
argument. This allows to get rid of the extra entry code in a later step.

Simplify the error code push magic by implementing the pushq imm8 via a
'.byte 0x6a, vector' sequence so GAS is not able to screw it up. As the
pushq imm8 is sign extending the resulting error code needs to be truncated
to 8 bits in C code.

Originally-by: Andy Lutomirski <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
---
V9: Add protection against GAS emitting more than 8 bytes per vector
---
arch/x86/entry/calling.h | 5 +++-
arch/x86/entry/entry_32.S | 33 +++-------------------------
arch/x86/entry/entry_64.S | 40 ++++++----------------------------
arch/x86/include/asm/entry_arch.h | 2 -
arch/x86/include/asm/hw_irq.h | 1
arch/x86/include/asm/idtentry.h | 44 ++++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/irq.h | 2 -
arch/x86/include/asm/traps.h | 3 +-
arch/x86/kernel/apic/apic.c | 31 +++++++++++++++++++++-----
arch/x86/kernel/idt.c | 2 -
arch/x86/kernel/irq.c | 14 +++++++-----
11 files changed, 99 insertions(+), 78 deletions(-)

--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -341,7 +341,10 @@ For 32-bit we have the following convent
#endif
.endm

-#endif /* CONFIG_X86_64 */
+#else /* CONFIG_X86_64 */
+# undef UNWIND_HINT_IRET_REGS
+# define UNWIND_HINT_IRET_REGS
+#endif /* !CONFIG_X86_64 */

.macro STACKLEAK_ERASE
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1215,40 +1215,15 @@ SYM_FUNC_END(entry_INT80_32)
#endif
.endm

-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_interrupt
- .align 8
- .endr
-SYM_CODE_END(irq_entries_start)
-
#ifdef CONFIG_X86_LOCAL_APIC
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_spurious
- .align 8
- .endr
-SYM_CODE_END(spurious_entries_start)
-
SYM_CODE_START_LOCAL(common_spurious)
ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
SAVE_ALL switch_stacks=1
ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
call smp_spurious_interrupt
jmp ret_from_intr
SYM_CODE_END(common_spurious)
@@ -1261,12 +1236,12 @@ SYM_CODE_END(common_spurious)
.p2align CONFIG_X86_L1_CACHE_SHIFT
SYM_CODE_START_LOCAL(common_interrupt)
ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
-
SAVE_ALL switch_stacks=1
ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
call do_IRQ
jmp ret_from_intr
SYM_CODE_END(common_interrupt)
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -358,34 +358,6 @@ SYM_CODE_START(ret_from_fork)
SYM_CODE_END(ret_from_fork)
.popsection

-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_interrupt
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(irq_entries_start)
-
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_spurious
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(spurious_entries_start)
-
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
pushq %rax
@@ -755,13 +727,14 @@ SYM_CODE_END(interrupt_entry)
/* Interrupt entry/exit. */

/*
- * The interrupt stubs push (~vector+0x80) onto the stack and
+ * The interrupt stubs push vector onto the stack and
* then jump to common_spurious/interrupt.
*/
SYM_CODE_START_LOCAL(common_spurious)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
call interrupt_entry
UNWIND_HINT_REGS indirect=1
+ movq ORIG_RAX(%rdi), %rsi /* get vector from stack */
+ movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */
call smp_spurious_interrupt /* rdi points to pt_regs */
jmp ret_from_intr
SYM_CODE_END(common_spurious)
@@ -770,10 +743,11 @@ SYM_CODE_END(common_spurious)
/* common_interrupt is a hotpath. Align it */
.p2align CONFIG_X86_L1_CACHE_SHIFT
SYM_CODE_START_LOCAL(common_interrupt)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
call interrupt_entry
UNWIND_HINT_REGS indirect=1
- call do_IRQ /* rdi points to pt_regs */
+ movq ORIG_RAX(%rdi), %rsi /* get vector from stack */
+ movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */
+ call do_IRQ /* rdi points to pt_regs */
/* 0(%rsp): old RSP */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -1022,7 +996,7 @@ apicinterrupt RESCHEDULE_VECTOR resche
#endif

apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR spurious_apic_interrupt smp_spurious_apic_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -35,7 +35,7 @@ BUILD_INTERRUPT(kvm_posted_intr_nested_i

BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
-BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+BUILD_INTERRUPT(spurious_apic_interrupt,SPURIOUS_APIC_VECTOR)
BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)

#ifdef CONFIG_IRQ_WORK
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -39,6 +39,7 @@ extern asmlinkage void irq_work_interrup
extern asmlinkage void uv_bau_message_intr1(void);

extern asmlinkage void spurious_interrupt(void);
+extern asmlinkage void spurious_apic_interrupt(void);
extern asmlinkage void thermal_interrupt(void);
extern asmlinkage void reschedule_interrupt(void);

--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -326,6 +326,50 @@ static __always_inline void __##func(str
#define DECLARE_IDTENTRY_XEN(vector, func) \
idtentry vector asm_exc_xen##func exc_##func has_error_code=0

+/*
+ * ASM code to emit the common vector entry stubs where each stub is
+ * packed into 8 bytes.
+ *
+ * Note, that the 'pushq imm8' is emitted via '.byte 0x6a, vector' because
+ * GCC treats the local vector variable as unsigned int and would expand
+ * all vectors above 0x7F to a 5 byte push. The original code did an
+ * adjustment of the vector number to be in the signed byte range to avoid
+ * this. While clever it's mindboggling counterintuitive and requires the
+ * odd conversion back to a real vector number in the C entry points. Using
+ * .byte achieves the same thing and the only fixup needed in the C entry
+ * point is to mask off the bits above bit 7 because the push is sign
+ * extending.
+ */
+ .align 8
+SYM_CODE_START(irq_entries_start)
+ vector=FIRST_EXTERNAL_VECTOR
+ pos = .
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ .byte 0x6a, vector
+ jmp common_interrupt
+ .nops (pos + 8) - .
+ pos=pos+8
+ vector=vector+1
+ .endr
+SYM_CODE_END(irq_entries_start)
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .align 8
+SYM_CODE_START(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ pos = .
+ .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ .byte 0x6a, vector
+ jmp common_spurious
+ .nops (pos + 8) - .
+ pos=pos+8
+ vector=vector+1
+ .endr
+SYM_CODE_END(spurious_entries_start)
+#endif
+
#endif /* __ASSEMBLY__ */

/*
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,7 +36,7 @@ extern void native_init_IRQ(void);

extern void handle_irq(struct irq_desc *desc, struct pt_regs *regs);

-extern __visible void do_IRQ(struct pt_regs *regs);
+extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector);

extern void init_ISA_irqs(void);

--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -41,8 +41,9 @@ asmlinkage void smp_deferred_error_inter
#endif

void smp_apic_timer_interrupt(struct pt_regs *regs);
-void smp_spurious_interrupt(struct pt_regs *regs);
void smp_error_interrupt(struct pt_regs *regs);
+void smp_spurious_apic_interrupt(struct pt_regs *regs);
+void smp_spurious_interrupt(struct pt_regs *regs, unsigned long vector);
asmlinkage void smp_irq_move_cleanup_interrupt(void);

#ifdef CONFIG_VMAP_STACK
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2153,15 +2153,29 @@ void __init register_lapic_address(unsig
* Local APIC interrupts
*/

-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
+/**
+ * smp_spurious_interrupt - Catch all for interrupts raised on unused vectors
+ * @regs: Pointer to pt_regs on stack
+ * @error_code: The vector number is in the lower 8 bits
+ *
+ * This is invoked from ASM entry code to catch all interrupts which
+ * trigger on an entry which is routed to the common_spurious idtentry
+ * point.
+ *
+ * Also called from smp_spurious_apic_interrupt().
*/
-__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs,
+ unsigned long vector)
{
- u8 vector = ~regs->orig_ax;
u32 v;

entering_irq();
+ /*
+ * The push in the entry ASM code which stores the vector number on
+ * the stack in the error code slot is sign expanding. Just use the
+ * lower 8 bits.
+ */
+ vector &= 0xFF;
trace_spurious_apic_entry(vector);

inc_irq_stat(irq_spurious_count);
@@ -2182,11 +2196,11 @@ void __init register_lapic_address(unsig
*/
v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
if (v & (1 << (vector & 0x1f))) {
- pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n",
+ pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Acked\n",
vector, smp_processor_id());
ack_APIC_irq();
} else {
- pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n",
+ pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Not pending!\n",
vector, smp_processor_id());
}
out:
@@ -2194,6 +2208,11 @@ void __init register_lapic_address(unsig
exiting_irq();
}

+__visible void smp_spurious_apic_interrupt(struct pt_regs *regs)
+{
+ smp_spurious_interrupt(regs, SPURIOUS_APIC_VECTOR);
+}
+
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -145,7 +145,7 @@ static const __initconst struct idt_data
#ifdef CONFIG_X86_UV
INTG(UV_BAU_MESSAGE, uv_bau_message_intr1),
#endif
- INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
+ INTG(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt),
INTG(ERROR_APIC_VECTOR, error_interrupt),
#endif
};
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -227,14 +227,18 @@ u64 arch_irq_stat(void)
* SMP cross-CPU interrupts have their own specific
* handlers).
*/
-__visible void __irq_entry do_IRQ(struct pt_regs *regs)
+__visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc * desc;
- /* high bit used in ret_from_ code */
- unsigned vector = ~regs->orig_ax;
+ struct irq_desc *desc;

entering_irq();
+ /*
+ * The push in the entry ASM code which stores the vector number on
+ * the stack in the error code slot is sign expanding. Just use the
+ * lower 8 bits.
+ */
+ vector &= 0xFF;

/* entering_irq() tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
@@ -249,7 +253,7 @@ u64 arch_irq_stat(void)
ack_APIC_irq();

if (desc == VECTOR_UNUSED) {
- pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
+ pr_emerg_ratelimited("%s: %d.%lu No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
} else {


2020-05-22 19:37:18

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch V9 21/39] x86/irq: Convey vector as argument and not in ptregs

Thomas Gleixner <[email protected]> writes:
> + .align 8
> +SYM_CODE_START(irq_entries_start)
> + vector=FIRST_EXTERNAL_VECTOR
> + pos = .
> + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
> + UNWIND_HINT_IRET_REGS
> + .byte 0x6a, vector
> + jmp common_interrupt
> + .nops (pos + 8) - .

Boris pointed out that .nops requires binutils >=2.31 ... Sigh!

The below delta patch works for the purpose.

Thanks,

tglx

--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -475,7 +475,9 @@ SYM_CODE_START(irq_entries_start)
UNWIND_HINT_IRET_REGS
.byte 0x6a, vector
jmp asm_common_interrupt
- .nops (pos + 8) - .
+ nop
+ /* Ensure that the above is 8 bytes max */
+ . = pos + 8
pos=pos+8
vector=vector+1
.endr
@@ -490,7 +492,9 @@ SYM_CODE_START(spurious_entries_start)
UNWIND_HINT_IRET_REGS
.byte 0x6a, vector
jmp asm_spurious_interrupt
- .nops (pos + 8) - .
+ nop
+ /* Ensure that the above is 8 bytes max */
+ . = pos + 8
pos=pos+8
vector=vector+1
.endr


Subject: [tip: x86/entry] x86/irq: Convey vector as argument and not in ptregs

The following commit has been merged into the x86/entry branch of tip:

Commit-ID: 2f83c60d35e049112189fb9e30c346c1b97c70e4
Gitweb: https://git.kernel.org/tip/2f83c60d35e049112189fb9e30c346c1b97c70e4
Author: Thomas Gleixner <[email protected]>
AuthorDate: Thu, 21 May 2020 22:05:34 +02:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Tue, 26 May 2020 19:06:28 +02:00

x86/irq: Convey vector as argument and not in ptregs

Device interrupts which go through do_IRQ() or the spurious interrupt
handler have their separate entry code on 64 bit for no good reason.

Both 32 and 64 bit transport the vector number through ORIG_[RE]AX in
pt_regs. Further the vector number is forced to fit into an u8 and is
complemented and offset by 0x80 so it's in the signed character
range. Otherwise GAS would expand the pushq to a 5 byte instruction for any
vector > 0x7F.

Treat the vector number like an error code and hand it to the C function as
argument. This allows us to get rid of the extra entry code in a later step.

Simplify the error code push magic by implementing the pushq imm8 via a
'.byte 0x6a, vector' sequence so GAS is not able to screw it up. As the
pushq imm8 is sign extending the resulting error code needs to be truncated
to 8 bits in C code.

Originally-by: Andy Lutomirski <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Acked-by: Andy Lutomirski <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/entry/calling.h | 5 ++-
arch/x86/entry/entry_32.S | 33 ++-------------------
arch/x86/entry/entry_64.S | 40 ++++---------------------
arch/x86/include/asm/entry_arch.h | 2 +-
arch/x86/include/asm/hw_irq.h | 1 +-
arch/x86/include/asm/idtentry.h | 48 ++++++++++++++++++++++++++++++-
arch/x86/include/asm/irq.h | 2 +-
arch/x86/include/asm/traps.h | 3 +-
arch/x86/kernel/apic/apic.c | 31 +++++++++++++++----
arch/x86/kernel/idt.c | 2 +-
arch/x86/kernel/irq.c | 14 +++++----
11 files changed, 103 insertions(+), 78 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 1c7f13b..98da0d3 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -341,7 +341,10 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm

-#endif /* CONFIG_X86_64 */
+#else /* CONFIG_X86_64 */
+# undef UNWIND_HINT_IRET_REGS
+# define UNWIND_HINT_IRET_REGS
+#endif /* !CONFIG_X86_64 */

.macro STACKLEAK_ERASE
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 1bf2dcb..53a6447 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1215,40 +1215,15 @@ SYM_FUNC_END(entry_INT80_32)
#endif
.endm

-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_interrupt
- .align 8
- .endr
-SYM_CODE_END(irq_entries_start)
-
#ifdef CONFIG_X86_LOCAL_APIC
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_spurious
- .align 8
- .endr
-SYM_CODE_END(spurious_entries_start)
-
SYM_CODE_START_LOCAL(common_spurious)
ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
SAVE_ALL switch_stacks=1
ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
call smp_spurious_interrupt
jmp ret_from_intr
SYM_CODE_END(common_spurious)
@@ -1261,12 +1236,12 @@ SYM_CODE_END(common_spurious)
.p2align CONFIG_X86_L1_CACHE_SHIFT
SYM_CODE_START_LOCAL(common_interrupt)
ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
-
SAVE_ALL switch_stacks=1
ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
call do_IRQ
jmp ret_from_intr
SYM_CODE_END(common_interrupt)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 7699359..e7434cd 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -358,34 +358,6 @@ SYM_CODE_START(ret_from_fork)
SYM_CODE_END(ret_from_fork)
.popsection

-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_interrupt
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(irq_entries_start)
-
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_spurious
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(spurious_entries_start)
-
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
pushq %rax
@@ -755,13 +727,14 @@ _ASM_NOKPROBE(interrupt_entry)
/* Interrupt entry/exit. */

/*
- * The interrupt stubs push (~vector+0x80) onto the stack and
+ * The interrupt stubs push vector onto the stack and
* then jump to common_spurious/interrupt.
*/
SYM_CODE_START_LOCAL(common_spurious)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
call interrupt_entry
UNWIND_HINT_REGS indirect=1
+ movq ORIG_RAX(%rdi), %rsi /* get vector from stack */
+ movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */
call smp_spurious_interrupt /* rdi points to pt_regs */
jmp ret_from_intr
SYM_CODE_END(common_spurious)
@@ -770,10 +743,11 @@ _ASM_NOKPROBE(common_spurious)
/* common_interrupt is a hotpath. Align it */
.p2align CONFIG_X86_L1_CACHE_SHIFT
SYM_CODE_START_LOCAL(common_interrupt)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
call interrupt_entry
UNWIND_HINT_REGS indirect=1
- call do_IRQ /* rdi points to pt_regs */
+ movq ORIG_RAX(%rdi), %rsi /* get vector from stack */
+ movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */
+ call do_IRQ /* rdi points to pt_regs */
/* 0(%rsp): old RSP */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -1022,7 +996,7 @@ apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR spurious_apic_interrupt smp_spurious_apic_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 4164227..cd57ce6 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -35,7 +35,7 @@ BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR)

BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
-BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+BUILD_INTERRUPT(spurious_apic_interrupt,SPURIOUS_APIC_VECTOR)
BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)

#ifdef CONFIG_IRQ_WORK
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 4154bc5..0ffe807 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -39,6 +39,7 @@ extern asmlinkage void irq_work_interrupt(void);
extern asmlinkage void uv_bau_message_intr1(void);

extern asmlinkage void spurious_interrupt(void);
+extern asmlinkage void spurious_apic_interrupt(void);
extern asmlinkage void thermal_interrupt(void);
extern asmlinkage void reschedule_interrupt(void);

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 53e7f14..b089997 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -326,6 +326,54 @@ __visible noinstr void func(struct pt_regs *regs, \
#define DECLARE_IDTENTRY_XEN(vector, func) \
idtentry vector asm_exc_xen##func exc_##func has_error_code=0

+/*
+ * ASM code to emit the common vector entry stubs where each stub is
+ * packed into 8 bytes.
+ *
+ * Note, that the 'pushq imm8' is emitted via '.byte 0x6a, vector' because
+ * GCC treats the local vector variable as unsigned int and would expand
+ * all vectors above 0x7F to a 5 byte push. The original code did an
+ * adjustment of the vector number to be in the signed byte range to avoid
+ * this. While clever it's mindboggling counterintuitive and requires the
+ * odd conversion back to a real vector number in the C entry points. Using
+ * .byte achieves the same thing and the only fixup needed in the C entry
+ * point is to mask off the bits above bit 7 because the push is sign
+ * extending.
+ */
+ .align 8
+SYM_CODE_START(irq_entries_start)
+ vector=FIRST_EXTERNAL_VECTOR
+ pos = .
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ .byte 0x6a, vector
+ jmp common_interrupt
+ nop
+ /* Ensure that the above is 8 bytes max */
+ . = pos + 8
+ pos=pos+8
+ vector=vector+1
+ .endr
+SYM_CODE_END(irq_entries_start)
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .align 8
+SYM_CODE_START(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ pos = .
+ .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ .byte 0x6a, vector
+ jmp common_spurious
+ nop
+ /* Ensure that the above is 8 bytes max */
+ . = pos + 8
+ pos=pos+8
+ vector=vector+1
+ .endr
+SYM_CODE_END(spurious_entries_start)
+#endif
+
#endif /* __ASSEMBLY__ */

/*
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 72fba0e..74690a3 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,7 +36,7 @@ extern void native_init_IRQ(void);

extern void handle_irq(struct irq_desc *desc, struct pt_regs *regs);

-extern __visible void do_IRQ(struct pt_regs *regs);
+extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector);

extern void init_ISA_irqs(void);

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index d7de360..32b2bec 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -41,8 +41,9 @@ asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs);
#endif

void smp_apic_timer_interrupt(struct pt_regs *regs);
-void smp_spurious_interrupt(struct pt_regs *regs);
void smp_error_interrupt(struct pt_regs *regs);
+void smp_spurious_apic_interrupt(struct pt_regs *regs);
+void smp_spurious_interrupt(struct pt_regs *regs, unsigned long vector);
asmlinkage void smp_irq_move_cleanup_interrupt(void);

#ifdef CONFIG_VMAP_STACK
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e53dda2..a4218a3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2153,15 +2153,29 @@ void __init register_lapic_address(unsigned long address)
* Local APIC interrupts
*/

-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
+/**
+ * smp_spurious_interrupt - Catch all for interrupts raised on unused vectors
+ * @regs: Pointer to pt_regs on stack
+ * @error_code: The vector number is in the lower 8 bits
+ *
+ * This is invoked from ASM entry code to catch all interrupts which
+ * trigger on an entry which is routed to the common_spurious idtentry
+ * point.
+ *
+ * Also called from smp_spurious_apic_interrupt().
*/
-__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs,
+ unsigned long vector)
{
- u8 vector = ~regs->orig_ax;
u32 v;

entering_irq();
+ /*
+ * The push in the entry ASM code which stores the vector number on
+ * the stack in the error code slot is sign expanding. Just use the
+ * lower 8 bits.
+ */
+ vector &= 0xFF;
trace_spurious_apic_entry(vector);

inc_irq_stat(irq_spurious_count);
@@ -2182,11 +2196,11 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
*/
v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
if (v & (1 << (vector & 0x1f))) {
- pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n",
+ pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Acked\n",
vector, smp_processor_id());
ack_APIC_irq();
} else {
- pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n",
+ pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Not pending!\n",
vector, smp_processor_id());
}
out:
@@ -2194,6 +2208,11 @@ out:
exiting_irq();
}

+__visible void smp_spurious_apic_interrupt(struct pt_regs *regs)
+{
+ smp_spurious_interrupt(regs, SPURIOUS_APIC_VECTOR);
+}
+
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index ddb1115..20408e3 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -145,7 +145,7 @@ static const __initconst struct idt_data apic_idts[] = {
#ifdef CONFIG_X86_UV
INTG(UV_BAU_MESSAGE, uv_bau_message_intr1),
#endif
- INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
+ INTG(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt),
INTG(ERROR_APIC_VECTOR, error_interrupt),
#endif
};
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 252065d..c766936 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -227,14 +227,18 @@ u64 arch_irq_stat(void)
* SMP cross-CPU interrupts have their own specific
* handlers).
*/
-__visible void __irq_entry do_IRQ(struct pt_regs *regs)
+__visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc * desc;
- /* high bit used in ret_from_ code */
- unsigned vector = ~regs->orig_ax;
+ struct irq_desc *desc;

entering_irq();
+ /*
+ * The push in the entry ASM code which stores the vector number on
+ * the stack in the error code slot is sign expanding. Just use the
+ * lower 8 bits.
+ */
+ vector &= 0xFF;

/* entering_irq() tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
@@ -249,7 +253,7 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs)
ack_APIC_irq();

if (desc == VECTOR_UNUSED) {
- pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
+ pr_emerg_ratelimited("%s: %d.%lu No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
} else {

2020-08-24 17:32:02

by Alexander Graf

[permalink] [raw]
Subject: Re: [patch V9 21/39] x86/irq: Convey vector as argument and not in ptregs

Hi Thomas,

On 21.05.20 22:05, Thomas Gleixner wrote:
> From: Thomas Gleixner <[email protected]>
>
> Device interrupts which go through do_IRQ() or the spurious interrupt
> handler have their separate entry code on 64 bit for no good reason.
>
> Both 32 and 64 bit transport the vector number through ORIG_[RE]AX in
> pt_regs. Further the vector number is forced to fit into an u8 and is
> complemented and offset by 0x80 so it's in the signed character
> range. Otherwise GAS would expand the pushq to a 5 byte instruction for any
> vector > 0x7F.
>
> Treat the vector number like an error code and hand it to the C function as
> argument. This allows to get rid of the extra entry code in a later step.
>
> Simplify the error code push magic by implementing the pushq imm8 via a
> '.byte 0x6a, vector' sequence so GAS is not able to screw it up. As the
> pushq imm8 is sign extending the resulting error code needs to be truncated
> to 8 bits in C code.
>
> Originally-by: Andy Lutomirski <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Acked-by: Andy Lutomirski <[email protected]>

I'm currently trying to understand a performance regression with
ScyllaDB on i3en.3xlarge (KVM based VM on Skylake) which we reliably
bisected down to this commit:

https://github.com/scylladb/scylla/issues/7036

What we're seeing is that syscalls such as membarrier() take forever
(0-10 µs would be normal):

% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
53.26 12.458881 185953 67 membarrier
15.79 3.693651 17843 207 49 read
11.17 2.613350 67008 39 io_pgetevents
10.89 2.547772 11795 216 timerfd_settime
6.91 1.616802 11073 146 rt_sigprocmask
1.39 0.325955 3542 92 timer_settime
0.36 0.083691 526 159 io_submit
0.22 0.051399 535 96 write
0.00 0.000783 37 21 sendmsg
0.00 0.000057 3 18 9 ioctl
------ ----------- ----------- --------- --------- ----------------
100.00 23.392341 1061 58 total

That again seems to stem from a vastly slowed down
smp_call_function_many_cond():

Samples: 218K of event 'cpu-clock', 4000 Hz
Overhead Shared Object Symbol
94.51% [kernel] [k] smp_call_function_many_cond
0.76% [kernel] [k] __do_softirq
0.32% [kernel] [k] native_queued_spin_lock_slowpath
[...]

which is stuck in

│ csd_lock_wait():
│ smp_cond_load_acquire(&csd->flags, !(VAL &
0.00 │ mov 0x8(%rcx),%edx
0.00 │ and $0x1,%edx
│ ↓ je 2b9
│ rep_nop():
0.70 │2af: pause
│ csd_lock_wait():
92.82 │ mov 0x8(%rcx),%edx
6.48 │ and $0x1,%edx
0.00 │ ↑ jne 2af
0.00 │2b9: ↑ jmp 282


Given the patch at hand I was expecting lost IPIs, but I can't quite see
anything getting lost.

Do you have any further pointers I could look at?


Thanks,

Alex



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879


2020-08-25 10:29:55

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch V9 21/39] x86/irq: Convey vector as argument and not in ptregs

Alex,

On Mon, Aug 24 2020 at 19:29, Alexander Graf wrote:
> I'm currently trying to understand a performance regression with
> ScyllaDB on i3en.3xlarge (KVM based VM on Skylake) which we reliably
> bisected down to this commit:
>
> https://github.com/scylladb/scylla/issues/7036
>
> What we're seeing is that syscalls such as membarrier() take forever
> (0-10 µs would be normal):
...
> That again seems to stem from a vastly slowed down
> smp_call_function_many_cond():
>
> Samples: 218K of event 'cpu-clock', 4000 Hz
> Overhead Shared Object Symbol
> 94.51% [kernel] [k] smp_call_function_many_cond
> 0.76% [kernel] [k] __do_softirq
> 0.32% [kernel] [k] native_queued_spin_lock_slowpath
> [...]
>
> which is stuck in
>
> │ csd_lock_wait():
> │ smp_cond_load_acquire(&csd->flags, !(VAL &
> 0.00 │ mov 0x8(%rcx),%edx
> 0.00 │ and $0x1,%edx
> │ ↓ je 2b9
> │ rep_nop():
> 0.70 │2af: pause
> │ csd_lock_wait():
> 92.82 │ mov 0x8(%rcx),%edx
> 6.48 │ and $0x1,%edx
> 0.00 │ ↑ jne 2af
> 0.00 │2b9: ↑ jmp 282
>
>
> Given the patch at hand I was expecting lost IPIs, but I can't quite see
> anything getting lost.

I have no idea how that patch should be related to IPI and smp function
calls. It's changing the way how regular device interrupts and their
spurious counterpart are handled and not the way how IPIs are
handled. They are handled by direct vectors and do not go through
do_IRQ() at all.

Aside of that the commit just changes the way how the interrupt vector
of a regular device interrupt is stored and conveyed. The extra read and
write on the cache hot stack is hardly related to anything IPI.

Thanks,

tglx


2020-08-25 23:20:07

by Alexander Graf

[permalink] [raw]
Subject: Re: [patch V9 21/39] x86/irq: Convey vector as argument and not in ptregs

Hi Thomas,

On 25.08.20 12:28, Thomas Gleixner wrote:
>
> Alex,
>
> On Mon, Aug 24 2020 at 19:29, Alexander Graf wrote:
>> I'm currently trying to understand a performance regression with
>> ScyllaDB on i3en.3xlarge (KVM based VM on Skylake) which we reliably
>> bisected down to this commit:
>>
>> https://github.com/scylladb/scylla/issues/7036
>>
>> What we're seeing is that syscalls such as membarrier() take forever
>> (0-10 µs would be normal):
> ...
>> That again seems to stem from a vastly slowed down
>> smp_call_function_many_cond():
>>
>> Samples: 218K of event 'cpu-clock', 4000 Hz
>> Overhead Shared Object Symbol
>> 94.51% [kernel] [k] smp_call_function_many_cond
>> 0.76% [kernel] [k] __do_softirq
>> 0.32% [kernel] [k] native_queued_spin_lock_slowpath
>> [...]
>>
>> which is stuck in
>>
>> │ csd_lock_wait():
>> │ smp_cond_load_acquire(&csd->flags, !(VAL &
>> 0.00 │ mov 0x8(%rcx),%edx
>> 0.00 │ and $0x1,%edx
>> │ ↓ je 2b9
>> │ rep_nop():
>> 0.70 │2af: pause
>> │ csd_lock_wait():
>> 92.82 │ mov 0x8(%rcx),%edx
>> 6.48 │ and $0x1,%edx
>> 0.00 │ ↑ jne 2af
>> 0.00 │2b9: ↑ jmp 282
>>
>>
>> Given the patch at hand I was expecting lost IPIs, but I can't quite see
>> anything getting lost.
>
> I have no idea how that patch should be related to IPI and smp function
> calls. It's changing the way how regular device interrupts and their
> spurious counterpart are handled and not the way how IPIs are
> handled. They are handled by direct vectors and do not go through
> do_IRQ() at all.
>
> Aside of that the commit just changes the way how the interrupt vector
> of a regular device interrupt is stored and conveyed. The extra read and
> write on the cache hot stack is hardly related to anything IPI.

I am as puzzled as you are, but the bisect is very clear: 79b9c183021e
works fast and 633260fa1 (as well as mainline) shows the weird behavior
above.

It gets even better. This small (demonstrative only, mangled) patch on
top of 633260fa1 also resolves the performance issue:

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c766936..7e91e9a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -239,6 +239,7 @@ __visible void __irq_entry do_IRQ(struct pt_regs
*regs, unsigned long vector)
* lower 8 bits.
*/
vector &= 0xFF;
+ regs->orig_ax = ~vector;

/* entering_irq() tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");


To me that sounds like some irq exit code somewhere must still be
looking at orig_ax to decide on something - and that something is wrong
now that we removed the negation of the vector. It also seems to have an
impact on remote function calls.

I'll have a deeper look tomorrow again if I can find any such place, but
I wouldn't mind if anyone could point me into the right direction
earlier :).


Alex



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879


2020-08-25 23:45:40

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [patch V9 21/39] x86/irq: Convey vector as argument and not in ptregs

On Tue, Aug 25, 2020 at 4:18 PM Alexander Graf <[email protected]> wrote:
>
> Hi Thomas,
>
> On 25.08.20 12:28, Thomas Gleixner wrote:
> > void irq_complete_move(struct irq_cfg *cfg)
> > {
> > __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
> > }

> > Alex,
> >
> > On Mon, Aug 24 2020 at 19:29, Alexander Graf wrote:
> >> I'm currently trying to understand a performance regression with
> >> ScyllaDB on i3en.3xlarge (KVM based VM on Skylake) which we reliably
> >> bisected down to this commit:
> >>
> >> https://github.com/scylladb/scylla/issues/7036
> >>
> >> What we're seeing is that syscalls such as membarrier() take forever
> >> (0-10 µs would be normal):
> > ...
> >> That again seems to stem from a vastly slowed down
> >> smp_call_function_many_cond():
> >>
> >> Samples: 218K of event 'cpu-clock', 4000 Hz
> >> Overhead Shared Object Symbol
> >> 94.51% [kernel] [k] smp_call_function_many_cond
> >> 0.76% [kernel] [k] __do_softirq
> >> 0.32% [kernel] [k] native_queued_spin_lock_slowpath
> >> [...]
> >>
> >> which is stuck in
> >>
> >> │ csd_lock_wait():
> >> │ smp_cond_load_acquire(&csd->flags, !(VAL &
> >> 0.00 │ mov 0x8(%rcx),%edx
> >> 0.00 │ and $0x1,%edx
> >> │ ↓ je 2b9
> >> │ rep_nop():
> >> 0.70 │2af: pause
> >> │ csd_lock_wait():
> >> 92.82 │ mov 0x8(%rcx),%edx
> >> 6.48 │ and $0x1,%edx
> >> 0.00 │ ↑ jne 2af
> >> 0.00 │2b9: ↑ jmp 282
> >>
> >>
> >> Given the patch at hand I was expecting lost IPIs, but I can't quite see
> >> anything getting lost.
> >
> > I have no idea how that patch should be related to IPI and smp function
> > calls. It's changing the way how regular device interrupts and their
> > spurious counterpart are handled and not the way how IPIs are
> > handled. They are handled by direct vectors and do not go through
> > do_IRQ() at all.
> >
> > Aside of that the commit just changes the way how the interrupt vector
> > of a regular device interrupt is stored and conveyed. The extra read and
> > write on the cache hot stack is hardly related to anything IPI.
>
> I am as puzzled as you are, but the bisect is very clear: 79b9c183021e
> works fast and 633260fa1 (as well as mainline) shows the weird behavior
> above.
>
> It gets even better. This small (demonstrative only, mangled) patch on
> top of 633260fa1 also resolves the performance issue:
>
> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> index c766936..7e91e9a 100644
> --- a/arch/x86/kernel/irq.c
> +++ b/arch/x86/kernel/irq.c
> @@ -239,6 +239,7 @@ __visible void __irq_entry do_IRQ(struct pt_regs
> *regs, unsigned long vector)
> * lower 8 bits.
> */
> vector &= 0xFF;
> + regs->orig_ax = ~vector;
>
> /* entering_irq() tells RCU that we're not quiescent. Check it. */
> RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
>
>
> To me that sounds like some irq exit code somewhere must still be
> looking at orig_ax to decide on something - and that something is wrong
> now that we removed the negation of the vector. It also seems to have an
> impact on remote function calls.
>
> I'll have a deeper look tomorrow again if I can find any such place, but
> I wouldn't mind if anyone could point me into the right direction
> earlier :).

How about this:

void irq_complete_move(struct irq_cfg *cfg)
{
__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
}

in arch/x86/kernel/apic/vector.c.

2020-08-26 00:08:51

by Alexander Graf

[permalink] [raw]
Subject: Re: [patch V9 21/39] x86/irq: Convey vector as argument and not in ptregs

Hi Andy,

On 26.08.20 01:41, Andy Lutomirski wrote:
>
> On Tue, Aug 25, 2020 at 4:18 PM Alexander Graf <[email protected]> wrote:
>>
>> Hi Thomas,
>>
>> On 25.08.20 12:28, Thomas Gleixner wrote:
>>> void irq_complete_move(struct irq_cfg *cfg)
> {
> __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
> }
>
>>> Alex,
>>>
>>> On Mon, Aug 24 2020 at 19:29, Alexander Graf wrote:
>>>> I'm currently trying to understand a performance regression with
>>>> ScyllaDB on i3en.3xlarge (KVM based VM on Skylake) which we reliably
>>>> bisected down to this commit:
>>>>
>>>> https://github.com/scylladb/scylla/issues/7036
>>>>
>>>> What we're seeing is that syscalls such as membarrier() take forever
>>>> (0-10 µs would be normal):
>>> ...
>>>> That again seems to stem from a vastly slowed down
>>>> smp_call_function_many_cond():
>>>>
>>>> Samples: 218K of event 'cpu-clock', 4000 Hz
>>>> Overhead Shared Object Symbol
>>>> 94.51% [kernel] [k] smp_call_function_many_cond
>>>> 0.76% [kernel] [k] __do_softirq
>>>> 0.32% [kernel] [k] native_queued_spin_lock_slowpath
>>>> [...]
>>>>
>>>> which is stuck in
>>>>
>>>> │ csd_lock_wait():
>>>> │ smp_cond_load_acquire(&csd->flags, !(VAL &
>>>> 0.00 │ mov 0x8(%rcx),%edx
>>>> 0.00 │ and $0x1,%edx
>>>> │ ↓ je 2b9
>>>> │ rep_nop():
>>>> 0.70 │2af: pause
>>>> │ csd_lock_wait():
>>>> 92.82 │ mov 0x8(%rcx),%edx
>>>> 6.48 │ and $0x1,%edx
>>>> 0.00 │ ↑ jne 2af
>>>> 0.00 │2b9: ↑ jmp 282
>>>>
>>>>
>>>> Given the patch at hand I was expecting lost IPIs, but I can't quite see
>>>> anything getting lost.
>>>
>>> I have no idea how that patch should be related to IPI and smp function
>>> calls. It's changing the way how regular device interrupts and their
>>> spurious counterpart are handled and not the way how IPIs are
>>> handled. They are handled by direct vectors and do not go through
>>> do_IRQ() at all.
>>>
>>> Aside of that the commit just changes the way how the interrupt vector
>>> of a regular device interrupt is stored and conveyed. The extra read and
>>> write on the cache hot stack is hardly related to anything IPI.
>>
>> I am as puzzled as you are, but the bisect is very clear: 79b9c183021e
>> works fast and 633260fa1 (as well as mainline) shows the weird behavior
>> above.
>>
>> It gets even better. This small (demonstrative only, mangled) patch on
>> top of 633260fa1 also resolves the performance issue:
>>
>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
>> index c766936..7e91e9a 100644
>> --- a/arch/x86/kernel/irq.c
>> +++ b/arch/x86/kernel/irq.c
>> @@ -239,6 +239,7 @@ __visible void __irq_entry do_IRQ(struct pt_regs
>> *regs, unsigned long vector)
>> * lower 8 bits.
>> */
>> vector &= 0xFF;
>> + regs->orig_ax = ~vector;
>>
>> /* entering_irq() tells RCU that we're not quiescent. Check it. */
>> RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
>>
>>
>> To me that sounds like some irq exit code somewhere must still be
>> looking at orig_ax to decide on something - and that something is wrong
>> now that we removed the negation of the vector. It also seems to have an
>> impact on remote function calls.
>>
>> I'll have a deeper look tomorrow again if I can find any such place, but
>> I wouldn't mind if anyone could point me into the right direction
>> earlier :).
>
> How about this:
>
> void irq_complete_move(struct irq_cfg *cfg)
> {
> __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
> }
>
> in arch/x86/kernel/apic/vector.c.
>

Thanks a lot, I stumbled over the same thing just after I sent the email
as well and had been trying to see if I can quickly patch it up before I
fall asleep :).

The code path above is used by the APIC vector move (irqbalance) logic,
which explains why not everyone was seeing issues.

So with 633260fa1 applied, we never get out of moving state for our IRQ
because orig_ax is always -1. That means we send an IPI to the cleanup
vector on every single device interrupt, completely occupying the poor
CPU that we moved the IRQ from.

I've confirmed that the patch below fixes the issue and will send a
proper, more complete patch on top of mainline with fancy description
and stable tag tomorrow.


Alex



diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e7434cd..a474e6e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -734,7 +734,6 @@ SYM_CODE_START_LOCAL(common_spurious)
call interrupt_entry
UNWIND_HINT_REGS indirect=1
movq ORIG_RAX(%rdi), %rsi /* get vector from stack */
- movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */
call smp_spurious_interrupt /* rdi points to pt_regs */
jmp ret_from_intr
SYM_CODE_END(common_spurious)
@@ -746,7 +745,6 @@ SYM_CODE_START_LOCAL(common_interrupt)
call interrupt_entry
UNWIND_HINT_REGS indirect=1
movq ORIG_RAX(%rdi), %rsi /* get vector from stack */
- movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */
call do_IRQ /* rdi points to pt_regs */
/* 0(%rsp): old RSP */
ret_from_intr:
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 67768e5443..5b6f74e 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -934,7 +934,7 @@ static void __irq_complete_move(struct irq_cfg *cfg,
unsigned vector)

void irq_complete_move(struct irq_cfg *cfg)
{
- __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
+ __irq_complete_move(cfg, get_irq_regs()->orig_ax);
}

/*



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879


2020-08-26 01:12:57

by Brian Gerst

[permalink] [raw]
Subject: Re: [patch V9 21/39] x86/irq: Convey vector as argument and not in ptregs

On Tue, Aug 25, 2020 at 8:04 PM Alexander Graf <[email protected]> wrote:
>
> Hi Andy,
>
> On 26.08.20 01:41, Andy Lutomirski wrote:
> >
> > On Tue, Aug 25, 2020 at 4:18 PM Alexander Graf <[email protected]> wrote:
> >>
> >> Hi Thomas,
> >>
> >> On 25.08.20 12:28, Thomas Gleixner wrote:
> >>> void irq_complete_move(struct irq_cfg *cfg)
> > {
> > __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
> > }
> >
> >>> Alex,
> >>>
> >>> On Mon, Aug 24 2020 at 19:29, Alexander Graf wrote:
> >>>> I'm currently trying to understand a performance regression with
> >>>> ScyllaDB on i3en.3xlarge (KVM based VM on Skylake) which we reliably
> >>>> bisected down to this commit:
> >>>>
> >>>> https://github.com/scylladb/scylla/issues/7036
> >>>>
> >>>> What we're seeing is that syscalls such as membarrier() take forever
> >>>> (0-10 µs would be normal):
> >>> ...
> >>>> That again seems to stem from a vastly slowed down
> >>>> smp_call_function_many_cond():
> >>>>
> >>>> Samples: 218K of event 'cpu-clock', 4000 Hz
> >>>> Overhead Shared Object Symbol
> >>>> 94.51% [kernel] [k] smp_call_function_many_cond
> >>>> 0.76% [kernel] [k] __do_softirq
> >>>> 0.32% [kernel] [k] native_queued_spin_lock_slowpath
> >>>> [...]
> >>>>
> >>>> which is stuck in
> >>>>
> >>>> │ csd_lock_wait():
> >>>> │ smp_cond_load_acquire(&csd->flags, !(VAL &
> >>>> 0.00 │ mov 0x8(%rcx),%edx
> >>>> 0.00 │ and $0x1,%edx
> >>>> │ ↓ je 2b9
> >>>> │ rep_nop():
> >>>> 0.70 │2af: pause
> >>>> │ csd_lock_wait():
> >>>> 92.82 │ mov 0x8(%rcx),%edx
> >>>> 6.48 │ and $0x1,%edx
> >>>> 0.00 │ ↑ jne 2af
> >>>> 0.00 │2b9: ↑ jmp 282
> >>>>
> >>>>
> >>>> Given the patch at hand I was expecting lost IPIs, but I can't quite see
> >>>> anything getting lost.
> >>>
> >>> I have no idea how that patch should be related to IPI and smp function
> >>> calls. It's changing the way how regular device interrupts and their
> >>> spurious counterpart are handled and not the way how IPIs are
> >>> handled. They are handled by direct vectors and do not go through
> >>> do_IRQ() at all.
> >>>
> >>> Aside of that the commit just changes the way how the interrupt vector
> >>> of a regular device interrupt is stored and conveyed. The extra read and
> >>> write on the cache hot stack is hardly related to anything IPI.
> >>
> >> I am as puzzled as you are, but the bisect is very clear: 79b9c183021e
> >> works fast and 633260fa1 (as well as mainline) shows the weird behavior
> >> above.
> >>
> >> It gets even better. This small (demonstrative only, mangled) patch on
> >> top of 633260fa1 also resolves the performance issue:
> >>
> >> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> >> index c766936..7e91e9a 100644
> >> --- a/arch/x86/kernel/irq.c
> >> +++ b/arch/x86/kernel/irq.c
> >> @@ -239,6 +239,7 @@ __visible void __irq_entry do_IRQ(struct pt_regs
> >> *regs, unsigned long vector)
> >> * lower 8 bits.
> >> */
> >> vector &= 0xFF;
> >> + regs->orig_ax = ~vector;
> >>
> >> /* entering_irq() tells RCU that we're not quiescent. Check it. */
> >> RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
> >>
> >>
> >> To me that sounds like some irq exit code somewhere must still be
> >> looking at orig_ax to decide on something - and that something is wrong
> >> now that we removed the negation of the vector. It also seems to have an
> >> impact on remote function calls.
> >>
> >> I'll have a deeper look tomorrow again if I can find any such place, but
> >> I wouldn't mind if anyone could point me into the right direction
> >> earlier :).
> >
> > How about this:
> >
> > void irq_complete_move(struct irq_cfg *cfg)
> > {
> > __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
> > }
> >
> > in arch/x86/kernel/apic/vector.c.
> >
>
> Thanks a lot, I stumbled over the same thing just after I sent the email
> as well and had been trying to see if I can quickly patch it up before I
> fall asleep :).
>
> The code path above is used by the APIC vector move (irqbalance) logic,
> which explains why not everyone was seeing issues.
>
> So with 633260fa1 applied, we never get out of moving state for our IRQ
> because orig_ax is always -1. That means we send an IPI to the cleanup
> vector on every single device interrupt, completely occupying the poor
> CPU that we moved the IRQ from.
>
> I've confirmed that the patch below fixes the issue and will send a
> proper, more complete patch on top of mainline with fancy description
> and stable tag tomorrow.
>
>
> Alex
>
>
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index e7434cd..a474e6e 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -734,7 +734,6 @@ SYM_CODE_START_LOCAL(common_spurious)
> call interrupt_entry
> UNWIND_HINT_REGS indirect=1
> movq ORIG_RAX(%rdi), %rsi /* get vector from stack */
> - movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */
> call smp_spurious_interrupt /* rdi points to pt_regs */
> jmp ret_from_intr
> SYM_CODE_END(common_spurious)
> @@ -746,7 +745,6 @@ SYM_CODE_START_LOCAL(common_interrupt)
> call interrupt_entry
> UNWIND_HINT_REGS indirect=1
> movq ORIG_RAX(%rdi), %rsi /* get vector from stack */
> - movq $-1, ORIG_RAX(%rdi) /* no syscall to restart */
> call do_IRQ /* rdi points to pt_regs */
> /* 0(%rsp): old RSP */
> ret_from_intr:
> diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
> index 67768e5443..5b6f74e 100644
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -934,7 +934,7 @@ static void __irq_complete_move(struct irq_cfg *cfg,
> unsigned vector)
>
> void irq_complete_move(struct irq_cfg *cfg)
> {
> - __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
> + __irq_complete_move(cfg, get_irq_regs()->orig_ax);
> }

I think you need to also truncate the vector to 8-bits, since it now
gets sign-extended when pushed into the orig_ax slot.

--
Brian Gerst

2020-08-26 01:15:37

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch V9 21/39] x86/irq: Convey vector as argument and not in ptregs

On Tue, Aug 25 2020 at 16:41, Andy Lutomirski wrote:
> How about this:
>
> void irq_complete_move(struct irq_cfg *cfg)
> {
> __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
> }
>
> in arch/x86/kernel/apic/vector.c.

Duh....