Subject: Re: [PATCH v1 1/8] x86/entry/clearregs: Remove partial stack frame in fast system call
From: Andy Lutomirski
Date: Wed, 10 Jan 2018 17:22:45 -0800
To: Brian Gerst
Cc: Andi Kleen, Thomas Gleixner, the arch/x86 maintainers, Linux Kernel Mailing List, Linus Torvalds, David Woodhouse, Paul Turner, Andy Lutomirski, Peter Zijlstra, Tom Lendacky, Tim Chen, Greg Kroah-Hartman, Dave Hansen, Jiri Kosina, Andi Kleen
Message-Id: <4E2A660B-720F-43B2-A08F-F48CC8D91E05@amacapital.net>
References: <20180110010328.22163-1-andi@firstfloor.org> <20180110010328.22163-2-andi@firstfloor.org> <20180111001626.m5cgtngkmoskuhyh@two.firstfloor.org> <550472D4-8A2E-4219-B1ED-71EA5597E3A2@amacapital.net>

> On Jan 10, 2018, at 5:01 PM, Brian Gerst wrote:
>
>> On Wed, Jan 10, 2018 at 7:55 PM, Andy Lutomirski wrote:
>>
>>
>>>> On Jan 10, 2018, at 4:16 PM, Andi Kleen wrote:
>>>>
>>>>> On Tue, Jan 09, 2018 at 09:46:16PM -0500, Brian Gerst wrote:
>>>>> On Tue, Jan 9, 2018 at 8:03 PM, Andi Kleen wrote:
>>>>> From: Andi Kleen
>>>>>
>>>>> Remove the partial stack frame in the 64bit syscall fast path.
>>>>> In the next patch we want to clear the extra registers, which requires
>>>>> to always save all registers. So remove the partial stack frame
>>>>> in the syscall fast path and always save everything.
>>>>>
>>>>> This actually simplifies the code because the ptregs stubs
>>>>> are not needed anymore.
>>>>>
>>>>> arch/x86/entry/entry_64.S | 57 ++++-----------------------------------------------------
>>>>> arch/x86/entry/syscall_64.c | 2 +-
>>>>>
>>>>> Signed-off-by: Andi Kleen
>>>>> ---
>>>>> arch/x86/entry/entry_64.S | 57 ++++-----------------------------------
>>>>> arch/x86/entry/syscall_64.c | 2 +-
>>>>> 2 files changed, 5 insertions(+), 54 deletions(-)
>>>>>
>>>>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>>>>> index 58dbf7a12a05..bbdfbdd817d6 100644
>>>>> --- a/arch/x86/entry/entry_64.S
>>>>> +++ b/arch/x86/entry/entry_64.S
>>>>> @@ -234,7 +234,9 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>>>>> pushq %r9 /* pt_regs->r9 */
>>>>> pushq %r10 /* pt_regs->r10 */
>>>>> pushq %r11 /* pt_regs->r11 */
>>>>> - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
>>>>> + sub $(6*8), %rsp
>>>>> + SAVE_EXTRA_REGS
>>>>> +
>>>>
>>>> Continue using pushes here
>>>>
>>>>> UNWIND_HINT_REGS extra=0
>>>>>
>>>>> /*
>>>>> @@ -262,11 +264,6 @@ entry_SYSCALL_64_fastpath:
>>>>> ja 1f /* return -ENOSYS (already in pt_regs->ax) */
>>>>> movq %r10, %rcx
>>>>>
>>>>> - /*
>>>>> - * This call instruction is handled specially in stub_ptregs_64.
>>>>> - * It might end up jumping to the slow path. If it jumps, RAX
>>>>> - * and all argument registers are clobbered.
>>>>> - */
>>>>> #ifdef CONFIG_RETPOLINE
>>>>> movq sys_call_table(, %rax, 8), %rax
>>>>> call __x86_indirect_thunk_rax
>>>>> @@ -293,9 +290,7 @@ entry_SYSCALL_64_fastpath:
>>>>> TRACE_IRQS_ON /* user mode is traced as IRQs on */
>>>>> movq RIP(%rsp), %rcx
>>>>> movq EFLAGS(%rsp), %r11
>>>>> - addq $6*8, %rsp /* skip extra regs -- they were preserved */
>>>>> - UNWIND_HINT_EMPTY
>>>>> - jmp .Lpop_c_regs_except_rcx_r11_and_sysret
>>>>> + jmp syscall_return_via_sysret
>>>>>
>>>>> 1:
>>>>> /*
>>>>> @@ -305,14 +300,12 @@ entry_SYSCALL_64_fastpath:
>>>>> */
>>>>> TRACE_IRQS_ON
>>>>> ENABLE_INTERRUPTS(CLBR_ANY)
>>>>> - SAVE_EXTRA_REGS
>>>>> movq %rsp, %rdi
>>>>> call syscall_return_slowpath /* returns with IRQs disabled */
>>>>> jmp return_from_SYSCALL_64
>>>>>
>>>>> entry_SYSCALL64_slow_path:
>>>>> /* IRQs are off. */
>>>>> - SAVE_EXTRA_REGS
>>>>> movq %rsp, %rdi
>>>>> call do_syscall_64 /* returns with IRQs disabled */
>>>>>
>>>>> @@ -389,7 +382,6 @@ syscall_return_via_sysret:
>>>>> /* rcx and r11 are already restored (see code above) */
>>>>> UNWIND_HINT_EMPTY
>>>>> POP_EXTRA_REGS
>>>>> -.Lpop_c_regs_except_rcx_r11_and_sysret:
>>>>> popq %rsi /* skip r11 */
>>>>> popq %r10
>>>>> popq %r9
>>>>> @@ -420,47 +412,6 @@ syscall_return_via_sysret:
>>>>> USERGS_SYSRET64
>>>>> END(entry_SYSCALL_64)
>>>>>
>>>>> -ENTRY(stub_ptregs_64)
>>>>> - /*
>>>>> - * Syscalls marked as needing ptregs land here.
>>>>> - * If we are on the fast path, we need to save the extra regs,
>>>>> - * which we achieve by trying again on the slow path. If we are on
>>>>> - * the slow path, the extra regs are already saved.
>>>>> - *
>>>>> - * RAX stores a pointer to the C function implementing the syscall.
>>>>> - * IRQs are on.
>>>>> - */
>>>>> - cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
>>>>> - jne 1f
>>>>> -
>>>>> - /*
>>>>> - * Called from fast path -- disable IRQs again, pop return address
>>>>> - * and jump to slow path
>>>>> - */
>>>>> - DISABLE_INTERRUPTS(CLBR_ANY)
>>>>> - TRACE_IRQS_OFF
>>>>> - popq %rax
>>>>> - UNWIND_HINT_REGS extra=0
>>>>> - jmp entry_SYSCALL64_slow_path
>>>>> -
>>>>> -1:
>>>>> - JMP_NOSPEC %rax /* Called from C */
>>>>> -END(stub_ptregs_64)
>>>>> -
>>>>> -.macro ptregs_stub func
>>>>> -ENTRY(ptregs_\func)
>>>>> - UNWIND_HINT_FUNC
>>>>> - leaq \func(%rip), %rax
>>>>> - jmp stub_ptregs_64
>>>>> -END(ptregs_\func)
>>>>> -.endm
>>>>> -
>>>>> -/* Instantiate ptregs_stub for each ptregs-using syscall */
>>>>> -#define __SYSCALL_64_QUAL_(sym)
>>>>> -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
>>>>> -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
>>>>> -#include
>>>>> -
>>>>
>>>> You can't just blindly remove this. We need to make sure that
>>>> syscalls that modify registers take the slow path exit, because they
>>>> may change the registers to be incompatible with SYSRET.
>>>
>>> That's a good point. I checked the ptregs calls:
>>>
>>> iopl: should be fine, we will be restoring the correct IOPL through
>>> SYSRET
>>>
>>> clone/fork: fine too, the original return is fine and ret_from_fork
>>> takes care of the child
>>>
>>> execve et.al.: we will be leaking r11(rflags), rcx(orig return) into
>>> the new process. but that seems acceptable.
>>>
>>> rt_sigreturn: that's the only one who has problems. I added a new
>>> TIF_FULL_RESTORE to force it into the slow path.
>>>
>>
>> So your series removes the old declarative annotation and then will add a new TI flag to make it work again?
>>
>> This whole thing seems to be at the wrong end of the cost benefit curve.
>
> We already check TIF flags after the syscall on the fast path. Adding
> another bit to the mask costs nothing.
>

What I mean is: this whole series is almost certainly a performance
regression, it has no off switch, and it doesn't obviously solve any
problem. And no one has benchmarked it. I think we should seriously
consider just not applying it.
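
(For illustration only, here is a rough C sketch of the exit-work check being
discussed; it is not the actual entry_64.S code, which does this test in
assembly. TIF_FULL_RESTORE is the bit Andi proposes above, _TIF_ALLWORK_MASK,
current_thread_info() and syscall_return_slowpath() are existing kernel names,
and return_via_sysret() / syscall_exit_sketch() are invented for the sketch.)

    /* Sketch only: the real check runs in asm on the syscall exit path. */
    #define TIF_FULL_RESTORE    30      /* proposed new flag; bit number arbitrary here */
    #define _TIF_FULL_RESTORE   (1UL << TIF_FULL_RESTORE)

    /* One more bit OR-ed into the mask that is already tested on every exit. */
    #define EXIT_WORK_MASK      (_TIF_ALLWORK_MASK | _TIF_FULL_RESTORE)

    static void syscall_exit_sketch(struct pt_regs *regs)
    {
            unsigned long ti_flags = READ_ONCE(current_thread_info()->flags);

            if (ti_flags & EXIT_WORK_MASK) {
                    /*
                     * Slow path: restore all registers and return via IRET.
                     * rt_sigreturn needs this because it rewrites pt_regs in
                     * ways SYSRET cannot reproduce (RCX/R11/RFLAGS).
                     */
                    syscall_return_slowpath(regs);
                    return;
            }

            /* Fast path: pt_regs is still SYSRET-compatible. */
            return_via_sysret(regs);    /* invented name for the SYSRET exit */
    }

The flags test already runs on every fast-path return, so folding one more bit
into the mask adds no instructions there; the open cost question is whether
always doing SAVE_EXTRA_REGS/POP_EXTRA_REGS on the fast path regresses
performance, which is what the benchmarking complaint above is about.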