From: Andy Lutomirski
To: Andi Kleen
Cc: Brian Gerst, Thomas Gleixner, the arch/x86 maintainers,
 Linux Kernel Mailing List, Linus Torvalds, David Woodhouse,
 Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky,
 Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina, Andi Kleen
Subject: Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
Date: Wed, 10 Jan 2018 16:55:27 -0800
Message-Id: <550472D4-8A2E-4219-B1ED-71EA5597E3A2@amacapital.net>
In-Reply-To: <20180111001626.m5cgtngkmoskuhyh@two.firstfloor.org>
References: <20180110010328.22163-1-andi@firstfloor.org>
 <20180110010328.22163-2-andi@firstfloor.org>
 <20180111001626.m5cgtngkmoskuhyh@two.firstfloor.org>

> On Jan 10, 2018, at 4:16 PM, Andi Kleen wrote:
>
>> On Tue, Jan 09, 2018 at 09:46:16PM -0500, Brian Gerst wrote:
>>> On Tue, Jan 9, 2018 at 8:03 PM, Andi Kleen wrote:
>>> From: Andi Kleen
>>>
>>> Remove the partial stack frame in the 64-bit syscall fast path.
>>> The next patch wants to clear the extra registers, which requires
>>> always saving all registers, so drop the partial stack frame in
>>> the syscall fast path and save everything up front.
>>>
>>> This actually simplifies the code because the ptregs stubs
>>> are no longer needed.
>>>
>>> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------------------------
>>> arch/x86/entry/syscall_64.c |  2 +-
>>>
>>> Signed-off-by: Andi Kleen
>>> ---
>>> arch/x86/entry/entry_64.S   | 57 ++++-----------------------------------
>>> arch/x86/entry/syscall_64.c |  2 +-
>>> 2 files changed, 5 insertions(+), 54 deletions(-)
>>>
>>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>>> index 58dbf7a12a05..bbdfbdd817d6 100644
>>> --- a/arch/x86/entry/entry_64.S
>>> +++ b/arch/x86/entry/entry_64.S
>>> @@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>>>  	pushq	%r9			/* pt_regs->r9 */
>>>  	pushq	%r10			/* pt_regs->r10 */
>>>  	pushq	%r11			/* pt_regs->r11 */
>>> -	sub	$(6*8), %rsp		/* pt_regs->bp, bx, r12-15 not saved */
>>> +	sub	$(6*8), %rsp
>>> +	SAVE_EXTRA_REGS
>>> +
>>
>> Continue using pushes here

(A sketch of what that would look like is at the bottom of this mail.)

>>>  	UNWIND_HINT_REGS extra=0
>>>
>>>  	/*
>>> @@ -262,11 +264,6 @@ entry_SYSCALL_64_fastpath:
>>>  	ja	1f			/* return -ENOSYS (already in pt_regs->ax) */
>>>  	movq	%r10, %rcx
>>>
>>> -	/*
>>> -	 * This call instruction is handled specially in stub_ptregs_64.
>>> -	 * It might end up jumping to the slow path. If it jumps, RAX
>>> -	 * and all argument registers are clobbered.
>>> -	 */
>>>  #ifdef CONFIG_RETPOLINE
>>>  	movq	sys_call_table(, %rax, 8), %rax
>>>  	call	__x86_indirect_thunk_rax
>>> @@ -293,9 +290,7 @@ entry_SYSCALL_64_fastpath:
>>>  	TRACE_IRQS_ON			/* user mode is traced as IRQs on */
>>>  	movq	RIP(%rsp), %rcx
>>>  	movq	EFLAGS(%rsp), %r11
>>> -	addq	$6*8, %rsp		/* skip extra regs -- they were preserved */
>>> -	UNWIND_HINT_EMPTY
>>> -	jmp	.Lpop_c_regs_except_rcx_r11_and_sysret
>>> +	jmp	syscall_return_via_sysret
>>>
>>>  1:
>>>  	/*
>>> @@ -305,14 +300,12 @@ entry_SYSCALL_64_fastpath:
>>>  	 */
>>>  	TRACE_IRQS_ON
>>>  	ENABLE_INTERRUPTS(CLBR_ANY)
>>> -	SAVE_EXTRA_REGS
>>>  	movq	%rsp, %rdi
>>>  	call	syscall_return_slowpath	/* returns with IRQs disabled */
>>>  	jmp	return_from_SYSCALL_64
>>>
>>>  entry_SYSCALL64_slow_path:
>>>  	/* IRQs are off. */
>>> -	SAVE_EXTRA_REGS
>>>  	movq	%rsp, %rdi
>>>  	call	do_syscall_64		/* returns with IRQs disabled */
>>>
>>> @@ -389,7 +382,6 @@ syscall_return_via_sysret:
>>>  	/* rcx and r11 are already restored (see code above) */
>>>  	UNWIND_HINT_EMPTY
>>>  	POP_EXTRA_REGS
>>> -.Lpop_c_regs_except_rcx_r11_and_sysret:
>>>  	popq	%rsi			/* skip r11 */
>>>  	popq	%r10
>>>  	popq	%r9
>>> @@ -420,47 +412,6 @@ syscall_return_via_sysret:
>>>  	USERGS_SYSRET64
>>>  END(entry_SYSCALL_64)
>>>
>>> -ENTRY(stub_ptregs_64)
>>> -	/*
>>> -	 * Syscalls marked as needing ptregs land here.
>>> -	 * If we are on the fast path, we need to save the extra regs,
>>> -	 * which we achieve by trying again on the slow path. If we are on
>>> -	 * the slow path, the extra regs are already saved.
>>> -	 *
>>> -	 * RAX stores a pointer to the C function implementing the syscall.
>>> -	 * IRQs are on.
>>> -	 */
>>> -	cmpq	$.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
>>> -	jne	1f
>>> -
>>> -	/*
>>> -	 * Called from fast path -- disable IRQs again, pop return address
>>> -	 * and jump to slow path
>>> -	 */
>>> -	DISABLE_INTERRUPTS(CLBR_ANY)
>>> -	TRACE_IRQS_OFF
>>> -	popq	%rax
>>> -	UNWIND_HINT_REGS extra=0
>>> -	jmp	entry_SYSCALL64_slow_path
>>> -
>>> -1:
>>> -	JMP_NOSPEC %rax			/* Called from C */
>>> -END(stub_ptregs_64)
>>> -
>>> -.macro ptregs_stub func
>>> -ENTRY(ptregs_\func)
>>> -	UNWIND_HINT_FUNC
>>> -	leaq	\func(%rip), %rax
>>> -	jmp	stub_ptregs_64
>>> -END(ptregs_\func)
>>> -.endm
>>> -
>>> -/* Instantiate ptregs_stub for each ptregs-using syscall */
>>> -#define __SYSCALL_64_QUAL_(sym)
>>> -#define __SYSCALL_64_QUAL_ptregs(sym)	ptregs_stub sym
>>> -#define __SYSCALL_64(nr, sym, qual)	__SYSCALL_64_QUAL_##qual(sym)
>>> -#include <asm/syscalls_64.h>
>>> -
>>
>> You can't just blindly remove this.  We need to make sure that
>> syscalls that modify registers take the slow path exit, because they
>> may change the registers to be incompatible with SYSRET.
>
> That's a good point. I checked the ptregs calls:
>
> iopl: should be fine, we will be restoring the correct IOPL through
> SYSRET
>
> clone/fork: fine too, the original return is fine and ret_from_fork
> takes care of the child
>
> execve et al.: we will be leaking r11 (rflags) and rcx (the original
> return address) into the new process, but that seems acceptable
>
> rt_sigreturn: that's the only one that has problems. I added a new
> TIF_FULL_RESTORE flag to force it into the slow path.

So your series removes the old declarative annotation and then adds a
new TI flag to make it work again?

This whole thing seems to be at the wrong end of the cost-benefit curve.
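To spell out why the ptregs syscalls matter here: the fast-path exit
skips the opportunistic SYSRET checks entirely, so it is only safe for
syscalls that leave pt_regs alone.  Roughly, abbreviated from memory
of the slow-path return in entry_64.S (the real code also has
canonical-RIP, CS, and SS checks):

	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11
	cmpq	%rcx, %r11		/* SYSRET requires RCX == RIP */
	jne	swapgs_restore_regs_and_return_to_usermode
	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)	/* ... and R11 == RFLAGS */
	jne	swapgs_restore_regs_and_return_to_usermode
	/* canonical-RIP, CS, and SS checks elided here */

rt_sigreturn rewrites essentially all of pt_regs, so whatever forces
it through the slow path -- the old declarative stubs or your new TI
flag -- is the load-bearing part of the whole scheme.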
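And for reference, the sketch of Brian's "continue using pushes"
suggestion upthread: instead of the sub $(6*8), %rsp plus
SAVE_EXTRA_REGS pair, build the rest of pt_regs with explicit pushes
in the order the struct expects (untested):

	pushq	%rbx			/* pt_regs->bx */
	pushq	%rbp			/* pt_regs->bp */
	pushq	%r12			/* pt_regs->r12 */
	pushq	%r13			/* pt_regs->r13 */
	pushq	%r14			/* pt_regs->r14 */
	pushq	%r15			/* pt_regs->r15 */
	UNWIND_HINT_REGS

This avoids the separate %rsp adjustment and leaves r15 at the lowest
address, matching the struct pt_regs layout, so the frame stays
consistent with the POP_EXTRA_REGS exit path.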