2015-06-09 19:43:21

by Denys Vlasenko

Subject: [PATCH 1/5] x86/asm/entry/32: Fix fallout from r9 trick removal in SYSCALL code

I put the %ebp restoration code too late. Under strace, it is not reached
and %ebp is not restored upon return to userspace.

This is the fix. Run-tested.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
CC: [email protected]
---
arch/x86/entry/entry_64_compat.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 2093ce6..2c44180 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -344,6 +344,7 @@ cstar_dispatch:
call *ia32_sys_call_table(, %rax, 8)
movq %rax, RAX(%rsp)
1:
+ movl RCX(%rsp), %ebp
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
@@ -351,7 +352,6 @@ cstar_dispatch:

sysretl_from_sys_call:
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- movl RCX(%rsp), %ebp
RESTORE_RSI_RDI_RDX
movl RIP(%rsp), %ecx
movl EFLAGS(%rsp), %r11d
--
1.8.1.4


2015-06-09 18:54:29

by Denys Vlasenko

Subject: [PATCH 2/5] x86/asm/entry/32: Explain reloading of registers after __audit_syscall_entry

Here it is not obvious why we load pt_regs->cx into %esi etc.
Let's improve the comments.

Explain that we combine two things here: first, we reload registers,
since some of them are clobbered by the C function we just called;
second, we convert the 32-bit syscall params to the 64-bit C ABI,
because we are going to jump back to the syscall dispatch code.

Move the reloading of the 6th argument into the macro instead of
repeating it after each of the two macro invocations.

No actual code changes here.
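
For reference, this is the mapping being performed (it can be read
directly off the hunk below; 32-bit syscall ABI register on the left,
64-bit C calling convention register on the right):

	syscall#: eax (via pt_regs->orig_ax) -> eax
	arg1:     ebx                        -> edi
	arg2:     ecx (via pt_regs->cx)      -> esi
	arg3:     edx (via pt_regs->dx)      -> edx
	arg4:     esi (via pt_regs->si)      -> ecx
	arg5:     edi (via pt_regs->di)      -> r8d
	arg6:     ebp                        -> r9d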

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
CC: [email protected]
---
arch/x86/entry/entry_64_compat.S | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 2c44180..0fa108c 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -185,12 +185,18 @@ sysexit_from_sys_call:
movl %ebx, %esi /* 2nd arg: 1st syscall arg */
movl %eax, %edi /* 1st arg: syscall number */
call __audit_syscall_entry
- movl ORIG_RAX(%rsp), %eax /* reload syscall number */
- movl %ebx, %edi /* reload 1st syscall arg */
- movl RCX(%rsp), %esi /* reload 2nd syscall arg */
- movl RDX(%rsp), %edx /* reload 3rd syscall arg */
- movl RSI(%rsp), %ecx /* reload 4th syscall arg */
- movl RDI(%rsp), %r8d /* reload 5th syscall arg */
+ /*
+ * We are going to jump back to syscall dispatch.
+ * Prepare syscall args as required by 64-bit C ABI.
+ * Clobbered registers are loaded from pt_regs on stack.
+ */
+ movl ORIG_RAX(%rsp), %eax /* syscall number */
+ movl %ebx, %edi /* arg1 */
+ movl RCX(%rsp), %esi /* arg2 */
+ movl RDX(%rsp), %edx /* arg3 */
+ movl RSI(%rsp), %ecx /* arg4 */
+ movl RDI(%rsp), %r8d /* arg5 */
+ movl %ebp, %r9d /* arg6 */
.endm

.macro auditsys_exit exit
@@ -221,7 +227,6 @@ sysexit_from_sys_call:

sysenter_auditsys:
auditsys_entry_common
- movl %ebp, %r9d /* reload 6th syscall arg */
jmp sysenter_dispatch

sysexit_audit:
@@ -379,7 +384,6 @@ sysretl_from_sys_call:
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
auditsys_entry_common
- movl %ebp, %r9d /* reload 6th syscall arg */
jmp cstar_dispatch

sysretl_audit:
--
1.8.1.4

2015-06-09 18:54:40

by Denys Vlasenko

Subject: [PATCH 3/5] x86/asm/entry/32: Shorten __audit_syscall_entry args preparation

We use three MOVs to swap edx and ecx. We can use one XCHG instead.

Expand the comments. It's difficult to keep track of which arg# each
register corresponds to, so spell it out.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
CC: [email protected]
---
arch/x86/entry/entry_64_compat.S | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 0fa108c..a0ddcb6 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -178,12 +178,16 @@ sysexit_from_sys_call:

#ifdef CONFIG_AUDITSYSCALL
.macro auditsys_entry_common
- movl %esi, %r8d /* 5th arg: 4th syscall arg */
- movl %ecx, %r9d /* swap with edx */
- movl %edx, %ecx /* 4th arg: 3rd syscall arg */
- movl %r9d, %edx /* 3rd arg: 2nd syscall arg */
- movl %ebx, %esi /* 2nd arg: 1st syscall arg */
- movl %eax, %edi /* 1st arg: syscall number */
+ /*
+ * At this point, registers hold syscall args in 32-bit ABI:
+ * eax is syscall#, args are in ebx,ecx,edx,esi,edi,ebp.
+ * Shuffle them to match what __audit_syscall_entry() wants.
+ */
+ movl %esi, %r8d /* arg5 (r8): 4th syscall arg */
+ xchg %ecx, %edx /* arg4 (rcx): 3rd syscall arg (edx) */
+ /* arg3 (rdx): 2nd syscall arg (ecx) */
+ movl %ebx, %esi /* arg2 (rsi): 1st syscall arg */
+ movl %eax, %edi /* arg1 (rdi): syscall number */
call __audit_syscall_entry
/*
* We are going to jump back to syscall dispatch.
--
1.8.1.4

2015-06-09 18:55:04

by Denys Vlasenko

Subject: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads

This doesn't change much, but it uses shorter 32-bit insns:

-48 8b 74 24 68 mov 0x68(%rsp),%rsi
-48 8b 7c 24 70 mov 0x70(%rsp),%rdi
-48 8b 54 24 60 mov 0x60(%rsp),%rdx
+8b 74 24 68 mov 0x68(%rsp),%esi
+8b 7c 24 70 mov 0x70(%rsp),%edi
+8b 54 24 60 mov 0x60(%rsp),%edx

Since these are the only uses of RESTORE_RSI_RDI[_RDX], drop these macros.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
CC: [email protected]
---
arch/x86/entry/calling.h | 6 ------
arch/x86/entry/entry_64_compat.S | 7 +++++--
2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f4e6308..519207f 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -193,12 +193,6 @@ For 32-bit we have the following conventions - kernel is built with
.macro RESTORE_C_REGS_EXCEPT_RCX_R11
RESTORE_C_REGS_HELPER 1,0,0,1,1
.endm
- .macro RESTORE_RSI_RDI
- RESTORE_C_REGS_HELPER 0,0,0,0,0
- .endm
- .macro RESTORE_RSI_RDI_RDX
- RESTORE_C_REGS_HELPER 0,0,0,0,1
- .endm

.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
subq $-(15*8+\addskip), %rsp
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index a0ddcb6..d83e7e3 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -140,7 +140,8 @@ sysexit_from_sys_call:
*/
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
movl RIP(%rsp), %ecx /* User %eip */
- RESTORE_RSI_RDI
+ movl RSI(%rsp), %esi
+ movl RDI(%rsp), %edi
xorl %edx, %edx /* Do not leak kernel information */
xorq %r8, %r8
xorq %r9, %r9
@@ -361,7 +362,9 @@ cstar_dispatch:

sysretl_from_sys_call:
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- RESTORE_RSI_RDI_RDX
+ movl RSI(%rsp), %esi
+ movl RDI(%rsp), %edi
+ movl RDX(%rsp), %edx
movl RIP(%rsp), %ecx
movl EFLAGS(%rsp), %r11d
xorq %r10, %r10
--
1.8.1.4

2015-06-09 18:54:53

by Denys Vlasenko

Subject: [PATCH 5/5] x86/asm/entry/32: Simplify ptrace register shuffling

Before this patch, we were clearing pt_regs->r8..r11 on the stack.
We might as well store the actual r8..r11 registers there:
they came from userspace, so we leak no information by showing them
to ptrace. This also lets us get rid of one insn ("xor %eax,%eax").
Not a big deal, but still...

Before this patch, after the call to syscall_trace_enter() we were
restoring clobbered registers and jumping to code which converts the
32-bit syscall ABI to the 64-bit C ABI. This is unnecessary work; we
can combine both steps into one (similar to what the audit code
already does).

Also, we no longer need the "<ABI>_do_call" labels.

text data bss dec hex filename
1391 0 0 1391 56f entry_64_compat.o.before
1327 0 0 1327 52f entry_64_compat.o.after

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
CC: [email protected]
---
arch/x86/entry/entry_64_compat.S | 121 ++++++++++++++++++++++-----------------
1 file changed, 70 insertions(+), 51 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index d83e7e3..9d40cae 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -110,7 +110,6 @@ sysenter_flags_fixed:
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz sysenter_tracesys

-sysenter_do_call:
/* 32-bit syscall -> 64-bit C ABI argument conversion */
movl %edi, %r8d /* arg5 */
movl %ebp, %r9d /* arg6 */
@@ -248,24 +247,31 @@ sysenter_tracesys:
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz sysenter_auditsys
#endif
- SAVE_EXTRA_REGS
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
+ /*
+ * Ptrace needs complete pt_regs. Fill members we didn't save earlier.
+ * r8..r15 are not really meaningful, but leaving uninitialized data
+ * there may leak kernel data to ptrace.
+ */
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %r9, R9(%rsp)
+ movq %r8, R8(%rsp)
+ SAVE_EXTRA_REGS /* pt_regs->bp, bx, r12-15 */
movq %rsp, %rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
-
- /* Reload arg registers from stack. (see sysenter_tracesys) */
- movl RCX(%rsp), %ecx
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl %eax, %eax /* zero extension */
-
- RESTORE_EXTRA_REGS
- jmp sysenter_do_call
+ /*
+ * We are going to jump back to syscall dispatch.
+ * Prepare syscall args as required by 64-bit C ABI.
+ * We must read data from pt_regs: ptrace could have changed it.
+ */
+ movl %eax, %eax /* syscall number: zero extend it */
+ movl RBX(%rsp), %edi /* arg1 */
+ movl RCX(%rsp), %esi /* arg2 */
+ movl RDX(%rsp), %edx /* arg3 */
+ movl RSI(%rsp), %ecx /* arg4 */
+ movl RDI(%rsp), %r8d /* arg5 */
+ movl RBP(%rsp), %r9d /* arg6 */
+ jmp sysenter_dispatch
ENDPROC(entry_SYSENTER_compat)

/*
@@ -339,7 +348,6 @@ ENTRY(entry_SYSCALL_compat)
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz cstar_tracesys

-cstar_do_call:
/* 32-bit syscall -> 64-bit C ABI argument conversion */
movl %edi, %r8d /* arg5 */
movl %ebp, %r9d /* arg6 */
@@ -402,24 +410,31 @@ cstar_tracesys:
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz cstar_auditsys
#endif
- SAVE_EXTRA_REGS
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
+ /*
+ * Ptrace needs complete pt_regs. Fill members we didn't save earlier.
+ * r8..r15 are not really meaningful, but leaving uninitialized data
+ * there may leak kernel data to ptrace.
+ */
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %r9, R9(%rsp)
+ movq %r8, R8(%rsp)
+ SAVE_EXTRA_REGS /* pt_regs->bp, bx, r12-15 */
movq %rsp, %rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
-
- /* Reload arg registers from stack. (see sysenter_tracesys) */
- movl RCX(%rsp), %ecx
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl %eax, %eax /* zero extension */
-
- RESTORE_EXTRA_REGS
- jmp cstar_do_call
+ /*
+ * We are going to jump back to syscall dispatch.
+ * Prepare syscall args as required by 64-bit C ABI.
+ * We must read data from pt_regs: ptrace could have changed it.
+ */
+ movl %eax, %eax /* syscall number: zero extend it */
+ movl RBX(%rsp), %edi /* arg1 */
+ movl RCX(%rsp), %esi /* arg2 */
+ movl RDX(%rsp), %edx /* arg3 */
+ movl RSI(%rsp), %ecx /* arg4 */
+ movl RDI(%rsp), %r8d /* arg5 */
+ movl RBP(%rsp), %r9d /* arg6 */
+ jmp cstar_dispatch
END(entry_SYSCALL_compat)

ia32_badarg:
@@ -483,15 +498,15 @@ ENTRY(entry_INT80_compat)

orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz ia32_tracesys
+ jnz int80_tracesys

-ia32_do_call:
/* 32-bit syscall -> 64-bit C ABI argument conversion */
movl %edi, %r8d /* arg5 */
movl %ebp, %r9d /* arg6 */
xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
movl %ebx, %edi /* arg1 */
movl %edx, %edx /* arg3 (zero extension) */
+int80_dispatch:
cmpq $(IA32_NR_syscalls-1), %rax
ja 1f

@@ -500,24 +515,28 @@ ia32_do_call:
1:
jmp int_ret_from_sys_call

-ia32_tracesys:
- SAVE_EXTRA_REGS
- movq %rsp, %rdi /* &pt_regs -> arg1 */
+int80_tracesys:
+ /*
+ * Ptrace needs complete pt_regs. Fill members we didn't save earlier.
+ * r8..r15 are not really meaningful, but leaving uninitialized data
+ * there may leak kernel data to ptrace.
+ */
+ SAVE_EXTRA_REGS /* pt_regs->bp, bx, r12-15 */
+ movq %rsp, %rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
/*
- * Reload arg registers from stack in case ptrace changed them.
- * Don't reload %eax because syscall_trace_enter() returned
- * the %rax value we should see. But do truncate it to 32 bits.
- * If it's -1 to make us punt the syscall, then (u32)-1 is still
- * an appropriately invalid value.
+ * We are going to jump back to syscall dispatch.
+ * Prepare syscall args as required by 64-bit C ABI.
+ * We must read data from pt_regs: ptrace could have changed it.
*/
- movl RCX(%rsp), %ecx
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl %eax, %eax /* zero extension */
- RESTORE_EXTRA_REGS
- jmp ia32_do_call
+ movl %eax, %eax /* syscall number: zero extend it */
+ movl RBX(%rsp), %edi /* arg1 */
+ movl RCX(%rsp), %esi /* arg2 */
+ movl RDX(%rsp), %edx /* arg3 */
+ movl RSI(%rsp), %ecx /* arg4 */
+ movl RDI(%rsp), %r8d /* arg5 */
+ movl RBP(%rsp), %r9d /* arg6 */
+ jmp int80_dispatch
END(entry_INT80_compat)

.macro PTREGSCALL label, func
--
1.8.1.4

2015-06-09 18:59:37

by Andy Lutomirski

Subject: Re: [PATCH 5/5] x86/asm/entry/32: Simplify ptrace register shuffling

On Tue, Jun 9, 2015 at 11:54 AM, Denys Vlasenko <[email protected]> wrote:
> Before this patch, we were clearing pt_regs->r8..r11 on stack.
> We can as well just store actual r8..r11 registers there:
> they came from userspace, we leak no information by showing them to ptrace.
> This allows to get rid of one insn ("xor %eax,%eax").
> Not a big deal, but still...
>
> After call to syscall_trace_enter(), before this patch we were restoring
> clobbered registers and jump to code which converts 32-bit syscall
> ABI to 64-bit C ABI. This is unnecessary work, we can combine both
> steps into one (similar to what audit code does already).

I think I like zeroing it better. There's nothing wrong with zeroing
it, and it makes testing (if we ever start testing this stuff)
easier, I think.

--Andy

2015-06-09 19:01:35

by Andy Lutomirski

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads

On Tue, Jun 9, 2015 at 11:54 AM, Denys Vlasenko <[email protected]> wrote:
> This doesn't change much, but this uses shorter 32-bit insns:
>
> -48 8b 74 24 68 mov 0x68(%rsp),%rsi
> -48 8b 7c 24 70 mov 0x70(%rsp),%rdi
> -48 8b 54 24 60 mov 0x60(%rsp),%rdx
> +8b 74 24 68 mov 0x68(%rsp),%esi
> +8b 7c 24 70 mov 0x70(%rsp),%edi
> +8b 54 24 60 mov 0x60(%rsp),%edx
>
> Since these are the only uses of RESTORE_RSI_RDI[_RDX], drop these macros.
>

It probably doesn't matter for these fast paths, but, for the full
slow path return, we really do need to restore the full pt_regs.
After all, the syscall we're returning from might be sigreturn.

--Andy

2015-06-09 19:04:08

by Denys Vlasenko

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads

On 06/09/2015 09:01 PM, Andy Lutomirski wrote:
> On Tue, Jun 9, 2015 at 11:54 AM, Denys Vlasenko <[email protected]> wrote:
>> This doesn't change much, but this uses shorter 32-bit insns:
>>
>> -48 8b 74 24 68 mov 0x68(%rsp),%rsi
>> -48 8b 7c 24 70 mov 0x70(%rsp),%rdi
>> -48 8b 54 24 60 mov 0x60(%rsp),%rdx
>> +8b 74 24 68 mov 0x68(%rsp),%esi
>> +8b 7c 24 70 mov 0x70(%rsp),%edi
>> +8b 54 24 60 mov 0x60(%rsp),%edx
>>
>> Since these are the only uses of RESTORE_RSI_RDI[_RDX], drop these macros.
>>
>
> It probably doesn't matter for these fast paths, but, for the full
> slow path return, we really do need to restore the full pt_regs.
> After all, the syscall we're returning from might be sigreturn.

This is the compat 32-bit syscall handling code.
IIUC we do not restore the high halves of any registers for 32-bit tasks.

Am I missing something?

2015-06-09 19:11:55

by Andy Lutomirski

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads

On Tue, Jun 9, 2015 at 12:03 PM, Denys Vlasenko <[email protected]> wrote:
> On 06/09/2015 09:01 PM, Andy Lutomirski wrote:
>> On Tue, Jun 9, 2015 at 11:54 AM, Denys Vlasenko <[email protected]> wrote:
>>> This doesn't change much, but this uses shorter 32-bit insns:
>>>
>>> -48 8b 74 24 68 mov 0x68(%rsp),%rsi
>>> -48 8b 7c 24 70 mov 0x70(%rsp),%rdi
>>> -48 8b 54 24 60 mov 0x60(%rsp),%rdx
>>> +8b 74 24 68 mov 0x68(%rsp),%esi
>>> +8b 7c 24 70 mov 0x70(%rsp),%edi
>>> +8b 54 24 60 mov 0x60(%rsp),%edx
>>>
>>> Since these are the only uses of RESTORE_RSI_RDI[_RDX], drop these macros.
>>>
>>
>> It probably doesn't matter for these fast paths, but, for the full
>> slow path return, we really do need to restore the full pt_regs.
>> After all, the syscall we're returning from might be sigreturn.
>
> This is compat 32-bit syscall handling code.
> IIUC we do not restore high half of any registers for 32-bit tasks.
>
> Am I missing something?

Yes -- 64-bit tasks can call 32-bit compat syscalls. In fact, we
should really excise the entire concept of "64-bit tasks" and "32-bit
tasks" from the kernel. The things that have bitness are the current
syscall (TS_COMPAT), CS, the mm, and the signal context. The task
should be agnostic.
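
For instance (an untested sketch, assuming CONFIG_IA32_EMULATION=y;
20 is __NR_getpid in the i386 syscall table), a 64-bit binary can
enter the compat syscall path directly:

	/* A 64-bit process invoking a 32-bit compat syscall via int $0x80.
	 * The 32-bit ABI applies: eax = syscall#, args in ebx, ecx, ... */
	#include <stdio.h>

	int main(void)
	{
		long ret;

		asm volatile ("int $0x80"
			      : "=a" (ret)
			      : "a" (20L)	/* i386 __NR_getpid */
			      : "memory");
		printf("compat getpid() = %ld\n", ret);
		return 0;
	}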

--Andy

2015-06-09 19:14:47

by Denys Vlasenko

Subject: Re: [PATCH 5/5] x86/asm/entry/32: Simplify ptrace register shuffling

On 06/09/2015 08:59 PM, Andy Lutomirski wrote:
> On Tue, Jun 9, 2015 at 11:54 AM, Denys Vlasenko <[email protected]> wrote:
>> Before this patch, we were clearing pt_regs->r8..r11 on stack.
>> We can as well just store actual r8..r11 registers there:
>> they came from userspace, we leak no information by showing them to ptrace.
>> This allows to get rid of one insn ("xor %eax,%eax").
>> Not a big deal, but still...
>>
>> After call to syscall_trace_enter(), before this patch we were restoring
>> clobbered registers and jump to code which converts 32-bit syscall
>> ABI to 64-bit C ABI. This is unnecessary work, we can combine both
>> steps into one (similar to what audit code does already).
>
> I think I like zeroing it better. There's nothing wrong with zeroing
> it,

Yes, there is nothing wrong with zeroing. It just requires a bit
more code.

> and it makes testing (if we ever started testing this stuff)
> easier, I think.

Currently, we don't zero *all* high regs, only r8..r10. (r11 is
nonzero due to the SYSRET ABI; r12..r15 are preserved by virtue of
being callee-saved regs).


2015-06-09 19:18:14

by Denys Vlasenko

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads

On 06/09/2015 09:11 PM, Andy Lutomirski wrote:
> On Tue, Jun 9, 2015 at 12:03 PM, Denys Vlasenko <[email protected]> wrote:
>> On 06/09/2015 09:01 PM, Andy Lutomirski wrote:
>>> On Tue, Jun 9, 2015 at 11:54 AM, Denys Vlasenko <[email protected]> wrote:
>>>> This doesn't change much, but this uses shorter 32-bit insns:
>>>>
>>>> -48 8b 74 24 68 mov 0x68(%rsp),%rsi
>>>> -48 8b 7c 24 70 mov 0x70(%rsp),%rdi
>>>> -48 8b 54 24 60 mov 0x60(%rsp),%rdx
>>>> +8b 74 24 68 mov 0x68(%rsp),%esi
>>>> +8b 7c 24 70 mov 0x70(%rsp),%edi
>>>> +8b 54 24 60 mov 0x60(%rsp),%edx
>>>>
>>>> Since these are the only uses of RESTORE_RSI_RDI[_RDX], drop these macros.
>>>>
>>>
>>> It probably doesn't matter for these fast paths, but, for the full
>>> slow path return, we really do need to restore the full pt_regs.
>>> After all, the syscall we're returning from might be sigreturn.
>>
>> This is compat 32-bit syscall handling code.
>> IIUC we do not restore high half of any registers for 32-bit tasks.
>>
>> Am I missing something?
>
> Yes -- 64-bit tasks can call 32-bit compat syscalls.

Not via SYSCALL and SYSENTER code paths. This patch touches only those
code paths.

2015-06-09 19:27:28

by Andy Lutomirski

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads

On Tue, Jun 9, 2015 at 12:18 PM, Denys Vlasenko <[email protected]> wrote:
> On 06/09/2015 09:11 PM, Andy Lutomirski wrote:
>> On Tue, Jun 9, 2015 at 12:03 PM, Denys Vlasenko <[email protected]> wrote:
>>> On 06/09/2015 09:01 PM, Andy Lutomirski wrote:
>>>> On Tue, Jun 9, 2015 at 11:54 AM, Denys Vlasenko <[email protected]> wrote:
>>>>> This doesn't change much, but this uses shorter 32-bit insns:
>>>>>
>>>>> -48 8b 74 24 68 mov 0x68(%rsp),%rsi
>>>>> -48 8b 7c 24 70 mov 0x70(%rsp),%rdi
>>>>> -48 8b 54 24 60 mov 0x60(%rsp),%rdx
>>>>> +8b 74 24 68 mov 0x68(%rsp),%esi
>>>>> +8b 7c 24 70 mov 0x70(%rsp),%edi
>>>>> +8b 54 24 60 mov 0x60(%rsp),%edx
>>>>>
>>>>> Since these are the only uses of RESTORE_RSI_RDI[_RDX], drop these macros.
>>>>>
>>>>
>>>> It probably doesn't matter for these fast paths, but, for the full
>>>> slow path return, we really do need to restore the full pt_regs.
>>>> After all, the syscall we're returning from might be sigreturn.
>>>
>>> This is compat 32-bit syscall handling code.
>>> IIUC we do not restore high half of any registers for 32-bit tasks.
>>>
>>> Am I missing something?
>>
>> Yes -- 64-bit tasks can call 32-bit compat syscalls.
>
> Not via SYSCALL and SYSENTER code paths. This patch touches only those
> code paths.

I suppose that's true enough even if it's not quite true. A 64-bit
task could far jump/call/return to compat mode and then do SYSCALL or
SYSENTER, but it will likely crash and burn because there's no 32-bit
vdso.

--Andy

2015-06-10 06:21:25

by Ingo Molnar

Subject: Re: [PATCH 3/5] x86/asm/entry/32: Shorten __audit_syscall_entry args preparation


* Denys Vlasenko <[email protected]> wrote:

> We use three MOVs to swap edx and ecx. We can use one XCHG instead.
>
> Expand the comments. It's difficult to keep track which arg# every register
> corresponds to, so spell it out.

> + /*
> + * At this point, registers hold syscall args in 32-bit ABI:
> + * eax is syscall#, args are in ebx,ecx,edx,esi,edi,ebp.
> + * Shuffle them to match what __audit_syscall_entry() wants.
> + */
> + movl %esi, %r8d /* arg5 (r8): 4th syscall arg */
> + xchg %ecx, %edx /* arg4 (rcx): 3rd syscall arg (edx) */
> + /* arg3 (rdx): 2nd syscall arg (ecx) */
> + movl %ebx, %esi /* arg2 (rsi): 1st syscall arg */
> + movl %eax, %edi /* arg1 (rdi): syscall number */
> call __audit_syscall_entry

So while we are at it I improved this a bit more, to:

/*
* At this point, registers hold syscall args in 32-bit syscall ABI:
* eax is syscall#, args are in ebx,ecx,edx,esi,edi,ebp.
*
* We want to pass them to __audit_syscall_entry(), which is a 64-bit
* C function with 5 parameters, so shuffle them to match what
* __audit_syscall_entry() expects: rdi,rsi,rdx,rcx,r8.
*/
movl %esi, %r8d /* arg5 (r8 ) <= 4th syscall arg (esi) */
xchg %ecx, %edx /* arg4 (rcx) <= 3rd syscall arg (edx) */
/* arg3 (rdx) <= 2nd syscall arg (ecx) */
movl %ebx, %esi /* arg2 (rsi) <= 1st syscall arg (ebx) */
movl %eax, %edi /* arg1 (rdi) <= syscall number (eax) */
call __audit_syscall_entry

Btw., syscall auditing is not auditing syscall arguments #5 and #6?

Thanks,

Ingo

Subject: [tip:x86/asm] x86/asm/entry/32: Fix fallout from the R9 trick removal in the SYSCALL code

Commit-ID: aee4b013a71666f11ffeac11ab45bb7c6e0e394d
Gitweb: http://git.kernel.org/tip/aee4b013a71666f11ffeac11ab45bb7c6e0e394d
Author: Denys Vlasenko <[email protected]>
AuthorDate: Tue, 9 Jun 2015 20:54:07 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 10 Jun 2015 08:42:12 +0200

x86/asm/entry/32: Fix fallout from the R9 trick removal in the SYSCALL code

I put the %ebp restoration code too late. Under strace, it is not
reached and %ebp is not restored upon return to userspace.

This is the fix. Run-tested.

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Will Drewry <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/entry/entry_64_compat.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 2093ce6..2c44180 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -344,6 +344,7 @@ cstar_dispatch:
call *ia32_sys_call_table(, %rax, 8)
movq %rax, RAX(%rsp)
1:
+ movl RCX(%rsp), %ebp
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
@@ -351,7 +352,6 @@ cstar_dispatch:

sysretl_from_sys_call:
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- movl RCX(%rsp), %ebp
RESTORE_RSI_RDI_RDX
movl RIP(%rsp), %ecx
movl EFLAGS(%rsp), %r11d

Subject: [tip:x86/asm] x86/asm/entry/32: Explain reloading of registers after __audit_syscall_entry()

Commit-ID: 1536bb46fac7672ef04aaaa6a3b07848314263bc
Gitweb: http://git.kernel.org/tip/1536bb46fac7672ef04aaaa6a3b07848314263bc
Author: Denys Vlasenko <[email protected]>
AuthorDate: Tue, 9 Jun 2015 20:54:08 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 10 Jun 2015 08:42:13 +0200

x86/asm/entry/32: Explain reloading of registers after __audit_syscall_entry()

Here it is not obvious why we load pt_regs->cx into %esi etc.
Let's improve the comments.

Explain that we combine two things here: first, we reload
registers, since some of them are clobbered by the C function we
just called; second, we convert the 32-bit syscall params to the
64-bit C ABI, because we are going to jump back to the syscall
dispatch code.

Move the reloading of the 6th argument into the macro instead of
repeating it after each of the two macro invocations.

No actual code changes here.

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Will Drewry <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/entry/entry_64_compat.S | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 2c44180..0fa108c 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -185,12 +185,18 @@ sysexit_from_sys_call:
movl %ebx, %esi /* 2nd arg: 1st syscall arg */
movl %eax, %edi /* 1st arg: syscall number */
call __audit_syscall_entry
- movl ORIG_RAX(%rsp), %eax /* reload syscall number */
- movl %ebx, %edi /* reload 1st syscall arg */
- movl RCX(%rsp), %esi /* reload 2nd syscall arg */
- movl RDX(%rsp), %edx /* reload 3rd syscall arg */
- movl RSI(%rsp), %ecx /* reload 4th syscall arg */
- movl RDI(%rsp), %r8d /* reload 5th syscall arg */
+ /*
+ * We are going to jump back to syscall dispatch.
+ * Prepare syscall args as required by 64-bit C ABI.
+ * Clobbered registers are loaded from pt_regs on stack.
+ */
+ movl ORIG_RAX(%rsp), %eax /* syscall number */
+ movl %ebx, %edi /* arg1 */
+ movl RCX(%rsp), %esi /* arg2 */
+ movl RDX(%rsp), %edx /* arg3 */
+ movl RSI(%rsp), %ecx /* arg4 */
+ movl RDI(%rsp), %r8d /* arg5 */
+ movl %ebp, %r9d /* arg6 */
.endm

.macro auditsys_exit exit
@@ -221,7 +227,6 @@ sysexit_from_sys_call:

sysenter_auditsys:
auditsys_entry_common
- movl %ebp, %r9d /* reload 6th syscall arg */
jmp sysenter_dispatch

sysexit_audit:
@@ -379,7 +384,6 @@ sysretl_from_sys_call:
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
auditsys_entry_common
- movl %ebp, %r9d /* reload 6th syscall arg */
jmp cstar_dispatch

sysretl_audit:

Subject: [tip:x86/asm] x86/asm/entry/32: Shorten __audit_syscall_entry() args preparation

Commit-ID: a92fde25231a89d7d10895482556260c1b63767d
Gitweb: http://git.kernel.org/tip/a92fde25231a89d7d10895482556260c1b63767d
Author: Denys Vlasenko <[email protected]>
AuthorDate: Tue, 9 Jun 2015 20:54:09 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 10 Jun 2015 08:42:13 +0200

x86/asm/entry/32: Shorten __audit_syscall_entry() args preparation

We use three MOVs to swap edx and ecx. We can use one XCHG
instead.

Expand the comments. It's difficult to keep track of which arg#
each register corresponds to, so spell it out.

Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Brian Gerst <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Will Drewry <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
[ Expanded the comments some more. ]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/entry/entry_64_compat.S | 27 ++++++++++++++++++---------
1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 0fa108c..bb187a6 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -178,17 +178,26 @@ sysexit_from_sys_call:

#ifdef CONFIG_AUDITSYSCALL
.macro auditsys_entry_common
- movl %esi, %r8d /* 5th arg: 4th syscall arg */
- movl %ecx, %r9d /* swap with edx */
- movl %edx, %ecx /* 4th arg: 3rd syscall arg */
- movl %r9d, %edx /* 3rd arg: 2nd syscall arg */
- movl %ebx, %esi /* 2nd arg: 1st syscall arg */
- movl %eax, %edi /* 1st arg: syscall number */
+ /*
+ * At this point, registers hold syscall args in the 32-bit syscall ABI:
+ * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
+ *
+ * We want to pass them to __audit_syscall_entry(), which is a 64-bit
+ * C function with 5 parameters, so shuffle them to match what
+ * the function expects: RDI,RSI,RDX,RCX,R8.
+ */
+ movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */
+ xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */
+ /* arg3 (RDX) <= 2nd syscall arg (ECX) */
+ movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */
+ movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */
call __audit_syscall_entry
+
/*
- * We are going to jump back to syscall dispatch.
- * Prepare syscall args as required by 64-bit C ABI.
- * Clobbered registers are loaded from pt_regs on stack.
+ * We are going to jump back to the syscall dispatch code.
+ * Prepare syscall args as required by the 64-bit C ABI.
+ * Registers clobbered by __audit_syscall_entry() are
+ * loaded from pt_regs on stack:
*/
movl ORIG_RAX(%rsp), %eax /* syscall number */
movl %ebx, %edi /* arg1 */

2015-06-12 23:28:30

by Andy Lutomirski

Subject: Re: [PATCH 3/5] x86/asm/entry/32: Shorten __audit_syscall_entry args preparation

On Jun 9, 2015 11:21 PM, "Ingo Molnar" <[email protected]> wrote:
>
>
> * Denys Vlasenko <[email protected]> wrote:
>
> > We use three MOVs to swap edx and ecx. We can use one XCHG instead.
> >
> > Expand the comments. It's difficult to keep track which arg# every register
> > corresponds to, so spell it out.
>
> > + /*
> > + * At this point, registers hold syscall args in 32-bit ABI:
> > + * eax is syscall#, args are in ebx,ecx,edx,esi,edi,ebp.
> > + * Shuffle them to match what __audit_syscall_entry() wants.
> > + */
> > + movl %esi, %r8d /* arg5 (r8): 4th syscall arg */
> > + xchg %ecx, %edx /* arg4 (rcx): 3rd syscall arg (edx) */
> > + /* arg3 (rdx): 2nd syscall arg (ecx) */
> > + movl %ebx, %esi /* arg2 (rsi): 1st syscall arg */
> > + movl %eax, %edi /* arg1 (rdi): syscall number */
> > call __audit_syscall_entry
>
> So while we are at it I improved this a bit more, to:
>
> /*
> * At this point, registers hold syscall args in 32-bit syscall ABI:
> * eax is syscall#, args are in ebx,ecx,edx,esi,edi,ebp.
> *
> * We want to pass them to __audit_syscall_entry(), which is a 64-bit
> * C function with 5 parameters, so shuffle them to match what
> * __audit_syscall_entry() expects: rdi,rsi,rdx,rcx,r8.
> */
> movl %esi, %r8d /* arg5 (r8 ) <= 4th syscall arg (esi) */
> xchg %ecx, %edx /* arg4 (rcx) <= 3rd syscall arg (edx) */
> /* arg3 (rdx) <= 2nd syscall arg (ecx) */
> movl %ebx, %esi /* arg2 (rsi) <= 1st syscall arg (ebx) */
> movl %eax, %edi /* arg1 (rdi) <= syscall number (eax) */
> call __audit_syscall_entry
>
> Btw., syscall auditing is not auditing syscall arguments #5 and #6?

Indeed. That's the least of its problems. Don't ever read that code
or you might accidentally git rm it.

--Andy

2015-06-14 08:41:14

by Ingo Molnar

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads


* Denys Vlasenko <[email protected]> wrote:

> +8b 74 24 68 mov 0x68(%rsp),%esi
> +8b 7c 24 70 mov 0x70(%rsp),%edi
> +8b 54 24 60 mov 0x60(%rsp),%edx

Btw., could you (in another patch) order the restoration properly, by pt_regs
memory order, where possible?

So this:

> + movl RSI(%rsp), %esi
> + movl RDI(%rsp), %edi
> + movl RDX(%rsp), %edx
> movl RIP(%rsp), %ecx
> movl EFLAGS(%rsp), %r11d

would become:

movl RDX(%rsp), %edx
movl RSI(%rsp), %esi
movl RDI(%rsp), %edi
movl RIP(%rsp), %ecx
movl EFLAGS(%rsp), %r11d

... or so.

In fact I'd suggest we structure the MOVL-based restoration precisely after
the field order in the structure, and comment on cases where we depart from
what's in pt_regs - to make it all easier to verify.
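
For reference, a sketch of the x86-64 pt_regs layout in question
(abbreviated from arch/x86/include/asm/ptrace.h; note that dx/si/di at
offsets 0x60/0x68/0x70 match the disassembly quoted above):

	struct pt_regs {
		unsigned long r15, r14, r13, r12;
		unsigned long bp, bx;
		unsigned long r11, r10, r9, r8;
		unsigned long ax, cx, dx, si, di;
		unsigned long orig_ax;
		unsigned long ip, cs, flags, sp, ss;
	};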

Thanks,

Ingo

2015-06-14 15:21:34

by Denys Vlasenko

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads

On 06/14/2015 10:40 AM, Ingo Molnar wrote:
>
> * Denys Vlasenko <[email protected]> wrote:
>
>> +8b 74 24 68 mov 0x68(%rsp),%esi
>> +8b 7c 24 70 mov 0x70(%rsp),%edi
>> +8b 54 24 60 mov 0x60(%rsp),%edx
>
> Btw., could you (in another patch) order the restoration properly, by pt_regs
> memory order, where possible?

Will do.

> So this:
>
>> + movl RSI(%rsp), %esi
>> + movl RDI(%rsp), %edi
>> + movl RDX(%rsp), %edx
>> movl RIP(%rsp), %ecx
>> movl EFLAGS(%rsp), %r11d
>
> would become:
>
> movl RDX(%rsp), %edx
> movl RSI(%rsp), %esi
> movl RDI(%rsp), %edi
> movl RIP(%rsp), %ecx
> movl EFLAGS(%rsp), %r11d
>
> ... or so.

Actually, %ecx and %r11 need to be loaded first. They are not so much
"restored" as "prepared for the SYSRET insn". Every cycle lost in loading
these delays SYSRET. The other loads can be reordered by the CPU past SYSRET.

> In fact I'd suggest we structure movl based restoration precisely after the field
> order in the structure, and comment on cases where we depart from what's in
> pt_regs - to make it all easier to verify.

Makes sense.

I'm sending a two-patch series which (1) open-codes RESTORE_RSI_RDI
and (2) sprinkles comments in this area.

2015-06-15 20:20:20

by Ingo Molnar

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads


* Denys Vlasenko <[email protected]> wrote:

> On 06/14/2015 10:40 AM, Ingo Molnar wrote:
> >
> > * Denys Vlasenko <[email protected]> wrote:
> >
> >> +8b 74 24 68 mov 0x68(%rsp),%esi
> >> +8b 7c 24 70 mov 0x70(%rsp),%edi
> >> +8b 54 24 60 mov 0x60(%rsp),%edx
> >
> > Btw., could you (in another patch) order the restoration properly, by pt_regs
> > memory order, where possible?
>
> Will do.
>
> > So this:
> >
> >> + movl RSI(%rsp), %esi
> >> + movl RDI(%rsp), %edi
> >> + movl RDX(%rsp), %edx
> >> movl RIP(%rsp), %ecx
> >> movl EFLAGS(%rsp), %r11d
> >
> > would become:
> >
> > movl RDX(%rsp), %edx
> > movl RSI(%rsp), %esi
> > movl RDI(%rsp), %edi
> > movl RIP(%rsp), %ecx
> > movl EFLAGS(%rsp), %r11d
> >
> > ... or so.
>
> Actually, ecx and r11 need to be loaded first. They are not so much "restored"
> as "prepared for SYSRET insn". Every cycle lost in loading these delays SYSRET.
> [...]

So in the typical case they will still be cached, and so their max latency should
be around 3 cycles.

In fact because they are memory loads, they don't really have dependencies, so
they should be available to SYSRET almost immediately, i.e. within a cycle - and
there's no reason to believe these loads wouldn't pipeline properly and
parallelize with the many other things SYSRET has to do to organize a return to
user-space, before it can actually use the target RIP and RFLAGS.

So I strongly doubt that the placement of the RCX and R11 load before the SYSRET
matters to performance.

In any case this should be testable by looking at syscall performance and
reordering the instructions.

Thanks,

Ingo

2015-06-16 00:24:48

by Denys Vlasenko

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads

On 06/15/2015 10:20 PM, Ingo Molnar wrote:
>> Actually, ecx and r11 need to be loaded first. They are not so much "restored"
>> as "prepared for SYSRET insn". Every cycle lost in loading these delays SYSRET.
>> [...]
>
> So in the typical case they will still be cached, and so their max latency should
> be around 3 cycles.

If the syscall flushes the caches (say, a large read), or sleeps
and the CPU schedules away, then pt_regs->ip,flags are evicted
and need to be reloaded.

> In fact because they are memory loads, they don't really have dependencies,
> they should be available to SYSRET almost immediately,

They depend on the memory data.

> i.e. within a cycle - and
> there's no reason to believe why these loads wouldn't pipeline properly and
> parallelize with the many other things SYSRET has to do to organize a return to
> user-space, before it can actually use the target RIP and RFLAGS.

This does not sound right.

If it takes, say, 20 cycles to pull data from e.g. L3 cache to ECX,
then SYSRET can't possibly complete sooner than in 20 cycles.

2015-06-18 09:31:43

by Ingo Molnar

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads


* Denys Vlasenko <[email protected]> wrote:

> On 06/15/2015 10:20 PM, Ingo Molnar wrote:
> >> Actually, ecx and r11 need to be loaded first. They are not so much "restored"
> >> as "prepared for SYSRET insn". Every cycle lost in loading these delays SYSRET.
> >> [...]
> >
> > So in the typical case they will still be cached, and so their max latency should
> > be around 3 cycles.
>
> If syscall flushes caches (say, a large read), or sleeps
> and CPU schedules away, then pt_regs->ip,flags are evicted
> and need to be reloaded.
>
> > In fact because they are memory loads, they don't really have dependencies,
> > they should be available to SYSRET almost immediately,
>
> They depend on the memory data.
>
> > i.e. within a cycle - and
> > there's no reason to believe why these loads wouldn't pipeline properly and
> > parallelize with the many other things SYSRET has to do to organize a return to
> > user-space, before it can actually use the target RIP and RFLAGS.
>
> This does not sound right.
>
> If it takes, say, 20 cycles to pull data from e.g. L3 cache to ECX,
> then SYSRET can't possibly complete sooner than in 20 cycles.

Yeah, that's true, but my point is: SYSRET has to do a lot of other things
(permission checks, loading the user mode state - most of which are unrelated to
R11/RCX), which take dozens of cycles, and which are probably overlapped with any
cache misses on arguments such as R11/RCX.

It's not impossible that reordering helps, for example if SYSRET has some internal
dependencies that make its parallelism worse than ideal - but I'd complicate this
code only if it gives a measurable improvement for cache-cold syscall performance.

Thanks,

Ingo

2015-06-18 09:33:28

by Ingo Molnar

Subject: Re: [PATCH 5/5] x86/asm/entry/32: Simplify ptrace register shuffling


* Denys Vlasenko <[email protected]> wrote:

> Before this patch, we were clearing pt_regs->r8..r11 on stack.
> We can as well just store actual r8..r11 registers there:
> they came from userspace, we leak no information by showing them to ptrace.
> This allows to get rid of one insn ("xor %eax,%eax").
> Not a big deal, but still...
>
> After call to syscall_trace_enter(), before this patch we were restoring
> clobbered registers and jump to code which converts 32-bit syscall
> ABI to 64-bit C ABI. This is unnecessary work, we can combine both
> steps into one (similar to what audit code does already).

So this really needs a description about what kind of testing was done, as
technically this changes the ABI. Heavy ptrace users should be tried: strace,
UML, etc.

I don't expect any problems, but still it needs to be tested.

Thanks,

Ingo

2015-06-18 11:00:28

by Denys Vlasenko

Subject: Re: [PATCH 4/5] x86/asm/entry/32: Replace RESTORE_RSI_RDI[_RDX] with open-coded 32-bit reads

On 06/18/2015 11:31 AM, Ingo Molnar wrote:
>> If it takes, say, 20 cycles to pull data from e.g. L3 cache to ECX,
>> then SYSRET can't possibly complete sooner than in 20 cycles.
>
> Yeah, that's true, but my point is: SYSRET has to do a lot of other things
> (permission checks, loading the user mode state - most of which are unrelated to
> R11/RCX), which take dozens of cycles,

SYSRET was designed to avoid doing that. It does not check permissions
- it slam-dunks CPL3 and resets CS and SS to preset values.
It does not touch the stack register or restore any other GP register.

Having said that, I'd try to get cold hard facts, i.e. experimentally
measure SYSRET latency.


> and which are probably overlapped with any
> cache misses on arguments such as R11/RCX.
>
> It's not impossible that reordering helps, for example if SYSRET has some internal
> dependencies that makes it parallelism worse than ideal - but I'd complicate this
> code only if it gives a measurable improvement for cache-cold syscall performance.

I attempted to test it with a patch which moves the RCX and R11 loads all the way down:

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index f2064bd..0ea09a3 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -139,9 +139,6 @@ sysexit_from_sys_call:
* with 'sysenter' and it uses the SYSENTER calling convention.
*/
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- /* Prepare registers for SYSRET insn */
- movl RIP(%rsp), %ecx /* User %eip */
- movl EFLAGS(%rsp), %r11d /* User eflags */
/* Restore registers per SYSEXIT ABI requirements: */
/* arg1 (ebx): preserved by virtue of being a callee-saved register */
/* arg2 (ecx): used by SYSEXIT to restore esp (and by SYSRET to restore eip) */
@@ -155,6 +152,9 @@ sysexit_from_sys_call:
xorl %r8d, %r8d
xorl %r9d, %r9d
xorl %r10d, %r10d
+ /* Prepare registers for SYSRET insn */
+ movl RIP(%rsp), %ecx /* User %eip */
+ movl EFLAGS(%rsp), %r11d /* User eflags */
TRACE_IRQS_ON

/*
@@ -374,9 +374,6 @@ cstar_dispatch:

sysretl_from_sys_call:
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- /* Prepare registers for SYSRET insn */
- movl RIP(%rsp), %ecx /* User %eip */
- movl EFLAGS(%rsp), %r11d /* User eflags */
/* Restore registers per SYSRET ABI requirements: */
/* arg1 (ebx): preserved by virtue of being a callee-saved register */
/* arg2 (ebp): preserved (already restored, see above) */
@@ -388,6 +385,9 @@ sysretl_from_sys_call:
xorl %r8d, %r8d
xorl %r9d, %r9d
xorl %r10d, %r10d
+ /* Prepare registers for SYSRET insn */
+ movl RIP(%rsp), %ecx /* User %eip */
+ movl EFLAGS(%rsp), %r11d /* User eflags */
TRACE_IRQS_ON
movl RSP(%rsp), %esp
/*

This does not change instruction sizes, and therefore does not change
code cacheline alignment anywhere in the bzImage.


Testing getpid() in a loop (IOW: a cache-hot test) did show that with
this patch it is slower, but by a statistically insignificant amount:

before the patch, it's 61.92 ns per syscall.
after the patch, it's 61.99 ns per syscall.

That's less than one cycle, more like 0.15 cycles.
However, it is reproducible.

I did not figure out how to do a cache-cold test.
I tried a 65-kbyte-ish read from "/dev/zero". That takes ~3885 ns,
and its variability of +-10 ns drowns out any possible difference.
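
For reference, a minimal sketch of the two measurements above (not the
exact harness used; build with -m32 so that the compat entry paths are
exercised):

	/* Cache-hot test: average cost of N getpid() syscalls.
	 * Cache-cold(ish) test: 64 KB reads from /dev/zero, which evict
	 * a good part of the cache on each iteration. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	static char buf[64 * 1024];

	static double bench_ns(int do_read, int fd, int n)
	{
		struct timespec a, b;
		int i;

		clock_gettime(CLOCK_MONOTONIC, &a);
		for (i = 0; i < n; i++) {
			if (do_read)
				read(fd, buf, sizeof(buf));
			else
				syscall(SYS_getpid); /* avoids glibc's cached getpid() */
		}
		clock_gettime(CLOCK_MONOTONIC, &b);
		return ((b.tv_sec - a.tv_sec) * 1e9 +
			(b.tv_nsec - a.tv_nsec)) / n;
	}

	int main(void)
	{
		int fd = open("/dev/zero", O_RDONLY);

		printf("cache-hot: %.2f ns/syscall\n",
		       bench_ns(0, -1, 10 * 1000 * 1000));
		printf("cache-cold(ish): %.1f ns/read\n",
		       bench_ns(1, fd, 100 * 1000));
		close(fd);
		return 0;
	}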