Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752944AbdHBLIr (ORCPT ); Wed, 2 Aug 2017 07:08:47 -0400 Received: from mx2.suse.de ([195.135.220.15]:42263 "EHLO mx1.suse.de" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S1751166AbdHBLIq (ORCPT ); Wed, 2 Aug 2017 07:08:46 -0400 Subject: Re: [PATCH v2] xen: get rid of paravirt op adjust_exception_frame To: Andy Lutomirski Cc: "linux-kernel@vger.kernel.org" , "xen-devel@lists.xenproject.org" , X86 ML , Boris Ostrovsky , "H. Peter Anvin" , Thomas Gleixner , Ingo Molnar References: <20170801103954.23904-1-jgross@suse.com> From: Juergen Gross Message-ID: <18bd920a-8b68-37a9-771e-b486b3801739@suse.com> Date: Wed, 2 Aug 2017 13:08:42 +0200 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Thunderbird/52.2.1 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset=utf-8 Content-Language: en-US Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 11611 Lines: 287 On 01/08/17 21:45, Andy Lutomirski wrote: > On Tue, Aug 1, 2017 at 3:39 AM, Juergen Gross wrote: >> When running as Xen pv-guest the exception frame on the stack contains >> %r11 and %rcx additional to the other data pushed by the processor. >> >> Instead of having a paravirt op being called for each exception type >> prepend the Xen specific code to each exception entry. When running as >> Xen pv-guest just use the exception entry with prepended instructions, >> otherwise use the entry without the Xen specific code. >> >> Signed-off-by: Juergen Gross >> --- >> arch/x86/entry/entry_64.S | 22 ++------------ >> arch/x86/entry/entry_64_compat.S | 1 - >> arch/x86/include/asm/desc.h | 16 ++++++++++ >> arch/x86/include/asm/paravirt.h | 5 --- >> arch/x86/include/asm/paravirt_types.h | 4 --- >> arch/x86/include/asm/proto.h | 3 ++ >> arch/x86/include/asm/traps.h | 51 +++++++++++++++++++++++++++++-- >> arch/x86/kernel/asm-offsets_64.c | 1 - >> arch/x86/kernel/paravirt.c | 3 -- >> arch/x86/kernel/traps.c | 57 ++++++++++++++++++----------------- >> arch/x86/xen/enlighten_pv.c | 16 +++++----- >> arch/x86/xen/irq.c | 3 -- >> arch/x86/xen/xen-asm_64.S | 45 ++++++++++++++++++++++++--- >> arch/x86/xen/xen-ops.h | 1 - >> 14 files changed, 147 insertions(+), 81 deletions(-) >> >> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S >> index a9a8027a6c0e..602bcf68a32c 100644 >> --- a/arch/x86/entry/entry_64.S >> +++ b/arch/x86/entry/entry_64.S >> @@ -745,7 +745,6 @@ ENTRY(\sym) >> .endif >> >> ASM_CLAC >> - PARAVIRT_ADJUST_EXCEPTION_FRAME >> >> .ifeq \has_error_code >> pushq $-1 /* ORIG_RAX: no syscall to restart */ >> @@ -901,7 +900,7 @@ ENTRY(do_softirq_own_stack) >> END(do_softirq_own_stack) >> >> #ifdef CONFIG_XEN >> -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 >> +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 >> >> /* >> * A note on the "critical region" in our callback handler. >> @@ -967,8 +966,6 @@ ENTRY(xen_failsafe_callback) >> movq 8(%rsp), %r11 >> addq $0x30, %rsp >> pushq $0 /* RIP */ >> - pushq %r11 >> - pushq %rcx >> jmp general_protection >> 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ >> movq (%rsp), %rcx >> @@ -997,9 +994,8 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK >> idtentry stack_segment do_stack_segment has_error_code=1 >> >> #ifdef CONFIG_XEN >> -idtentry xen_debug do_debug has_error_code=0 >> -idtentry xen_int3 do_int3 has_error_code=0 >> -idtentry xen_stack_segment do_stack_segment has_error_code=1 >> +idtentry xendebug do_debug has_error_code=0 >> +idtentry xenint3 do_int3 has_error_code=0 >> #endif >> >> idtentry general_protection do_general_protection has_error_code=1 >> @@ -1161,18 +1157,6 @@ END(error_exit) >> /* Runs on exception stack */ >> ENTRY(nmi) >> /* >> - * Fix up the exception frame if we're on Xen. >> - * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most >> - * one value to the stack on native, so it may clobber the rdx >> - * scratch slot, but it won't clobber any of the important >> - * slots past it. >> - * >> - * Xen is a different story, because the Xen frame itself overlaps >> - * the "NMI executing" variable. >> - */ > > Based on Andrew Cooper's email, it sounds like this function is just > straight-up broken on Xen PV. Maybe change the comment to "XXX: > broken on Xen PV" or similar. Fine with me. > >> +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) >> +#define pv_trap_entry(name) (void *)(xen_pv_domain() ? xen_ ## name : name) >> +#else >> +#define pv_trap_entry(name) (void *)(name) >> +#endif >> + > > Seems reasonable to me. > >> #ifdef CONFIG_X86_64 >> >> static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, >> @@ -482,6 +490,14 @@ static inline void _set_gate(int gate, unsigned type, void *addr, >> 0, 0, __KERNEL_CS); \ >> } while (0) >> >> +#define set_intr_gate_pv(n, addr) \ >> + do { \ >> + set_intr_gate_notrace(n, pv_trap_entry(addr)); \ >> + _trace_set_gate(n, GATE_INTERRUPT, \ >> + pv_trap_entry(trace_##addr), \ >> + 0, 0, __KERNEL_CS); \ >> + } while (0) > > Any reason this can't be set_intr_gate((n), pv_trap_entry(addr))? Or > does that get preprocessed wrong? trace_##addr won't look like anything the compiler will accept with addr being "(void *)(xen_pv_domain() ? xen_foo : foo)". :-) >> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h >> index 01fd0a7f48cd..e2ae5f3b9c2c 100644 >> --- a/arch/x86/include/asm/traps.h >> +++ b/arch/x86/include/asm/traps.h >> @@ -13,9 +13,8 @@ asmlinkage void divide_error(void); >> asmlinkage void debug(void); >> asmlinkage void nmi(void); >> asmlinkage void int3(void); >> -asmlinkage void xen_debug(void); >> -asmlinkage void xen_int3(void); >> -asmlinkage void xen_stack_segment(void); >> +asmlinkage void xendebug(void); >> +asmlinkage void xenint3(void); > > What are xendebug and xenint3 for? Are they because they don't use IST on Xen? I believe so. >> __visible struct pv_cpu_ops pv_cpu_ops = { >> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c >> index bf54309b85da..a79a97a46a59 100644 >> --- a/arch/x86/kernel/traps.c >> +++ b/arch/x86/kernel/traps.c >> @@ -946,15 +946,15 @@ void __init early_trap_init(void) >> * early stage. DEBUG_STACK will be equipped after cpu_init() in >> * trap_init(). >> * >> - * We don't need to set trace_idt_table like set_intr_gate(), >> + * We don't need to set trace_idt_table like set_intr_gate_pv(), >> * since we don't have trace_debug and it will be reset to >> * 'debug' in trap_init() by set_intr_gate_ist(). >> */ >> set_intr_gate_notrace(X86_TRAP_DB, debug); > > This is confusing IMO. Maybe just drop the comment change? But how > does this work at all on Xen? TBH: I don't whether it works on Xen. The changes I did here were just mechanical ones. >> - set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); >> - set_intr_gate(X86_TRAP_TS, invalid_TSS); >> - set_intr_gate(X86_TRAP_NP, segment_not_present); >> - set_intr_gate(X86_TRAP_SS, stack_segment); >> - set_intr_gate(X86_TRAP_GP, general_protection); >> - set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); >> - set_intr_gate(X86_TRAP_MF, coprocessor_error); >> - set_intr_gate(X86_TRAP_AC, alignment_check); >> + set_intr_gate_pv(X86_TRAP_OLD_MF, coprocessor_segment_overrun); >> + set_intr_gate_pv(X86_TRAP_TS, invalid_TSS); >> + set_intr_gate_pv(X86_TRAP_NP, segment_not_present); >> + set_intr_gate_pv(X86_TRAP_SS, stack_segment); >> + set_intr_gate_pv(X86_TRAP_GP, general_protection); >> + set_intr_gate_pv(X86_TRAP_SPURIOUS, spurious_interrupt_bug); >> + set_intr_gate_pv(X86_TRAP_MF, coprocessor_error); >> + set_intr_gate_pv(X86_TRAP_AC, alignment_check); > > Hmm. I'm okay with this, but I'm wondering whether it might be nice > to try to have a pv op that changes what IDT writes do and remaps the > function pointer. Like... > >> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c >> index 811e4ddb3f37..7e107142bc4f 100644 >> --- a/arch/x86/xen/enlighten_pv.c >> +++ b/arch/x86/xen/enlighten_pv.c >> @@ -598,24 +598,22 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, >> * so we should never see them. Warn if >> * there's an unexpected IST-using fault handler. >> */ >> - if (addr == (unsigned long)debug) >> - addr = (unsigned long)xen_debug; >> - else if (addr == (unsigned long)int3) >> - addr = (unsigned long)xen_int3; >> - else if (addr == (unsigned long)stack_segment) >> - addr = (unsigned long)xen_stack_segment; >> - else if (addr == (unsigned long)double_fault) { >> + if (addr == (unsigned long)xen_debug) >> + addr = (unsigned long)xen_xendebug; >> + else if (addr == (unsigned long)xen_int3) >> + addr = (unsigned long)xen_xenint3; >> + else if (addr == (unsigned long)xen_double_fault) { >> /* Don't need to handle these */ >> return 0; > > ...this? Can't this list just be extended to handle all of the pv > entries instead of using the macro magic? I'll have a try. > >> diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S >> index c3df43141e70..f72ff71cc897 100644 >> --- a/arch/x86/xen/xen-asm_64.S >> +++ b/arch/x86/xen/xen-asm_64.S >> @@ -22,11 +22,46 @@ >> >> #include "xen-asm.h" >> >> -ENTRY(xen_adjust_exception_frame) >> - mov 8+0(%rsp), %rcx >> - mov 8+8(%rsp), %r11 >> - ret $16 >> -ENDPROC(xen_adjust_exception_frame) >> +.macro xen_pv_trap name >> +ENTRY(xen_\name) >> + pop %rcx >> + pop %r11 >> + jmp \name >> +END(xen_\name) >> +.endm >> + >> +xen_pv_trap divide_error >> +xen_pv_trap debug >> +xen_pv_trap xendebug >> +xen_pv_trap int3 >> +xen_pv_trap xenint3 >> +xen_pv_trap nmi >> +xen_pv_trap overflow >> +xen_pv_trap bounds >> +xen_pv_trap invalid_op >> +xen_pv_trap device_not_available >> +xen_pv_trap double_fault >> +xen_pv_trap coprocessor_segment_overrun >> +xen_pv_trap invalid_TSS >> +xen_pv_trap segment_not_present >> +xen_pv_trap stack_segment >> +xen_pv_trap general_protection >> +xen_pv_trap page_fault >> +xen_pv_trap async_page_fault >> +xen_pv_trap spurious_interrupt_bug >> +xen_pv_trap coprocessor_error >> +xen_pv_trap alignment_check >> +#ifdef CONFIG_X86_MCE >> +xen_pv_trap machine_check >> +#endif /* CONFIG_X86_MCE */ >> +xen_pv_trap simd_coprocessor_error >> +#ifdef CONFIG_IA32_EMULATION >> +xen_pv_trap entry_INT80_compat >> +#endif >> +#ifdef CONFIG_TRACING >> +xen_pv_trap trace_page_fault >> +#endif >> +xen_pv_trap hypervisor_callback > > I like this. > > Also, IMO it would be nice to fully finish the job. Remaining steps are: > > 1. Unsuck the SYSCALL entries on Xen PV. > 2. Unsuck the SYENTER entry on Xen PV. > 3. Make a xen_nmi that's actually correct (should be trivial) > > #1 is here: > > https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86/entry_syscall&id=14fee348e3e86c994400d68085217d1232a637d6 > > Can you test it and, if you like it, either add it to your series or > ack it? If you have extra spare cycles, you could try to do #2 and > #3, too :) For #2, it might actually make sense to rig up the Xen > sysenter entry to redirect to entry_SYSCALL_compat or even an entirely > new entry point -- SYSENTER is really weird. I'll look into it. Juergen