2005-11-22 16:54:39

by Benjamin LaHaise

[permalink] [raw]
Subject: rfc/rft: use r10 as current on x86-64

Hello Andi et al,

The patch below converts x86-64 to use r10 as the current pointer instead
of gs:pcurrent. This results in a ~34KB savings in the code segment of
the kernel. I've tested this by running a few regular applications,
plus a few 32-bit binaries. If this patch is interesting, it probably
makes sense to merge the thread info structure into the task_struct so
that the assembly bits for syscall entry can be cleaned up. Comments?

-ben
--
"Time is what keeps everything from happening all at once." -- John Wheeler
Don't Email: <[email protected]>.


diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index a9cd42e..e547830 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -31,6 +31,7 @@ cflags-$(CONFIG_MK8) += $(call cc-option
cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
CFLAGS += $(cflags-y)

+CFLAGS += -ffixed-r10
CFLAGS += -mno-red-zone
CFLAGS += -mcmodel=kernel
CFLAGS += -pipe
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index e0eb0c7..cdb5918 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -99,6 +99,7 @@ sysenter_do_call:
cmpl $(IA32_NR_syscalls),%eax
jae ia32_badsys
IA32_ARG_FIXUP 1
+ movq %gs:pda_pcurrent,%r10
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
GET_THREAD_INFO(%r10)
@@ -127,6 +128,7 @@ sysenter_tracesys:
CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp) /* really needed? */
movq %rsp,%rdi /* &pt_regs -> arg1 */
+ movq %gs:pda_pcurrent,%r10
call syscall_trace_enter
LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
@@ -198,6 +200,7 @@ cstar_do_call:
cmpl $IA32_NR_syscalls,%eax
jae ia32_badsys
IA32_ARG_FIXUP 1
+ movq %gs:pda_pcurrent,%r10
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
GET_THREAD_INFO(%r10)
@@ -220,6 +223,7 @@ cstar_tracesys:
CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp) /* really needed? */
movq %rsp,%rdi /* &pt_regs -> arg1 */
+ movq %gs:pda_pcurrent,%r10
call syscall_trace_enter
LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
@@ -282,6 +286,7 @@ ia32_do_syscall:
cmpl $(IA32_NR_syscalls),%eax
jae ia32_badsys
IA32_ARG_FIXUP
+ movq %gs:pda_pcurrent,%r10
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
movq %rax,RAX-ARGOFFSET(%rsp)
@@ -291,6 +296,7 @@ ia32_tracesys:
SAVE_REST
movq $-ENOSYS,RAX(%rsp) /* really needed? */
movq %rsp,%rdi /* &pt_regs -> arg1 */
+ movq %gs:pda_pcurrent,%r10
call syscall_trace_enter
LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
@@ -336,6 +342,7 @@ ENTRY(ia32_ptregs_common)
CFI_ADJUST_CFA_OFFSET -8
CFI_REGISTER rip, r11
SAVE_REST
+ movq %gs:pda_pcurrent,%r10
call *%rax
RESTORE_REST
jmp ia32_sysret /* misbalances the return cache */
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 9ff4204..53a829c 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -201,6 +201,7 @@ ENTRY(system_call)
cmpq $__NR_syscall_max,%rax
ja badsys
movq %r10,%rcx
+ movq %gs:pda_pcurrent,%r10
call *sys_call_table(,%rax,8) # XXX: rip relative
movq %rax,RAX-ARGOFFSET(%rsp)
/*
@@ -235,6 +236,7 @@ sysret_careful:
sti
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
+ movq %gs:pda_pcurrent,%r10
call schedule
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
@@ -266,12 +268,14 @@ tracesys:
movq $-ENOSYS,RAX(%rsp)
FIXUP_TOP_OF_STACK %rdi
movq %rsp,%rdi
+ movq %gs:pda_pcurrent,%r10
call syscall_trace_enter
LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
cmpq $__NR_syscall_max,%rax
ja 1f
movq %r10,%rcx /* fixup for C */
+ movq %gs:pda_pcurrent,%r10
call *sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
1: SAVE_REST
@@ -324,6 +328,7 @@ int_careful:
sti
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
+ movq %gs:pda_pcurrent,%r10
call schedule
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
@@ -409,6 +414,7 @@ ENTRY(stub_execve)
movq %r11, %r15
CFI_REGISTER rip, r15
FIXUP_TOP_OF_STACK %r11
+ movq %gs:pda_pcurrent,%r10
call sys_execve
GET_THREAD_INFO(%rcx)
bt $TIF_IA32,threadinfo_flags(%rcx)
@@ -441,6 +447,7 @@ ENTRY(stub_rt_sigreturn)
SAVE_REST
movq %rsp,%rdi
FIXUP_TOP_OF_STACK %r11
+ movq %gs:pda_pcurrent,%r10
call sys_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
RESTORE_REST
@@ -498,6 +505,7 @@ ENTRY(stub_rt_sigreturn)
cmoveq %rax,%rsp /*todo This needs CFI annotation! */
pushq %rdi # save old stack
CFI_ADJUST_CFA_OFFSET 8
+ movq %gs:pda_pcurrent,%r10
call \func
.endm

@@ -559,6 +567,7 @@ retint_careful:
sti
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
+ movq %gs:pda_pcurrent,%r10
call schedule
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
@@ -574,6 +583,7 @@ retint_signal:
movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
+ movq %gs:pda_pcurrent,%r10
call do_notify_resume
RESTORE_REST
cli
@@ -592,6 +602,7 @@ retint_kernel:
jnc retint_restore_args
bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
jnc retint_restore_args
+ movq %gs:pda_pcurrent,%r10
call preempt_schedule_irq
jmp exit_intr
#endif
@@ -682,6 +693,7 @@ ENTRY(spurious_interrupt)
testl %edx,%edx
js 1f
swapgs
+ movq %gs:pda_pcurrent,%r10
xorl %ebx,%ebx
1: movq %rsp,%rdi
movq ORIG_RAX(%rsp),%rsi
@@ -734,6 +746,7 @@ ENTRY(error_entry)
je error_kernelspace
error_swapgs:
swapgs
+ movq %gs:pda_pcurrent,%r10
error_sti:
movq %rdi,RDI(%rsp)
movq %rsp,%rdi
@@ -876,6 +889,7 @@ ENTRY(execve)
CFI_STARTPROC
FAKE_STACK_FRAME $0
SAVE_ALL
+ movq %gs:pda_pcurrent,%r10
call sys_execve
movq %rax, RAX(%rsp)
RESTORE_REST
@@ -953,6 +967,7 @@ paranoid_userspace:
jmp paranoid_userspace
paranoid_schedule:
sti
+ movq %gs:pda_pcurrent,%r10
call schedule
cli
jmp paranoid_userspace
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 5afd63e..340bce2 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -435,8 +435,10 @@ int copy_thread(int nr, unsigned long cl

childregs->rax = 0;
childregs->rsp = rsp;
- if (rsp == ~0UL)
+ if (rsp == ~0UL) {
+ childregs->r10 = (long)p;
childregs->rsp = (unsigned long)childregs;
+ }

p->thread.rsp = (unsigned long) childregs;
p->thread.rsp0 = (unsigned long) (childregs+1);
@@ -568,6 +570,7 @@ __switch_to(struct task_struct *prev_p,
prev->userrsp = read_pda(oldrsp);
write_pda(oldrsp, next->userrsp);
write_pda(pcurrent, next_p);
+ current = next_p;
write_pda(kernelstack,
(unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET);

diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 06dc354..3af8688 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -132,16 +132,16 @@ void pda_init(int cpu)

if (cpu == 0) {
/* others are initialized in smpboot.c */
- pda->pcurrent = &init_task;
+ current = pda->pcurrent = &init_task;
pda->irqstackptr = boot_cpu_stack;
} else {
+ current = pda->pcurrent;
pda->irqstackptr = (char *)
__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
if (!pda->irqstackptr)
panic("cannot allocate irqstack for cpu %d", cpu);
}

-
pda->irqstackptr += IRQSTACKSIZE-64;
}

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index bf337f4..a6008ae 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -277,6 +277,7 @@ void show_registers(struct pt_regs *regs
const int cpu = safe_smp_processor_id();
struct task_struct *cur = cpu_pda[cpu].pcurrent;

+ current = cur;
rsp = regs->rsp;

printk("CPU %d ", cpu);
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index dfa358b..f24497d 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -95,6 +95,7 @@ copy_user_generic:
.previous
.Lcug:
pushq %rbx
+ pushq %r12
xorl %eax,%eax /*zero for the exception handler */

#ifdef FIX_ALIGNMENT
@@ -117,20 +118,20 @@ copy_user_generic:
.Ls1: movq (%rsi),%r11
.Ls2: movq 1*8(%rsi),%r8
.Ls3: movq 2*8(%rsi),%r9
-.Ls4: movq 3*8(%rsi),%r10
+.Ls4: movq 3*8(%rsi),%r12
.Ld1: movq %r11,(%rdi)
.Ld2: movq %r8,1*8(%rdi)
.Ld3: movq %r9,2*8(%rdi)
-.Ld4: movq %r10,3*8(%rdi)
+.Ld4: movq %r12,3*8(%rdi)

.Ls5: movq 4*8(%rsi),%r11
.Ls6: movq 5*8(%rsi),%r8
.Ls7: movq 6*8(%rsi),%r9
-.Ls8: movq 7*8(%rsi),%r10
+.Ls8: movq 7*8(%rsi),%r12
.Ld5: movq %r11,4*8(%rdi)
.Ld6: movq %r8,5*8(%rdi)
.Ld7: movq %r9,6*8(%rdi)
-.Ld8: movq %r10,7*8(%rdi)
+.Ld8: movq %r12,7*8(%rdi)

decq %rdx

@@ -169,6 +170,7 @@ copy_user_generic:
jnz .Lloop_1

.Lende:
+ popq %r12
popq %rbx
ret

diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
index 72fd55e..8e0ee5f 100644
--- a/arch/x86_64/lib/csum-copy.S
+++ b/arch/x86_64/lib/csum-copy.S
@@ -84,7 +84,7 @@ csum_partial_copy_generic:
/* main loop. clear in 64 byte blocks */
/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
/* r11: temp3, rdx: temp4, r12 loopcnt */
- /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
+ /* r15: temp5, rbp: temp6, r14 temp7, r13 temp8 */
.p2align 4
.Lloop:
source
@@ -97,7 +97,7 @@ csum_partial_copy_generic:
movq 24(%rdi),%rdx

source
- movq 32(%rdi),%r10
+ movq 32(%rdi),%r15
source
movq 40(%rdi),%rbp
source
@@ -112,7 +112,7 @@ csum_partial_copy_generic:
adcq %r8,%rax
adcq %r11,%rax
adcq %rdx,%rax
- adcq %r10,%rax
+ adcq %r15,%rax
adcq %rbp,%rax
adcq %r14,%rax
adcq %r13,%rax
@@ -129,7 +129,7 @@ csum_partial_copy_generic:
movq %rdx,24(%rsi)

dest
- movq %r10,32(%rsi)
+ movq %r15,32(%rsi)
dest
movq %rbp,40(%rsi)
dest
@@ -149,7 +149,7 @@ csum_partial_copy_generic:
/* do last upto 56 bytes */
.Lhandle_tail:
/* ecx: count */
- movl %ecx,%r10d
+ movl %ecx,%r15d
andl $63,%ecx
shrl $3,%ecx
jz .Lfold
@@ -176,7 +176,7 @@ csum_partial_copy_generic:

/* do last upto 6 bytes */
.Lhandle_7:
- movl %r10d,%ecx
+ movl %r15d,%ecx
andl $7,%ecx
shrl $1,%ecx
jz .Lhandle_1
@@ -198,7 +198,7 @@ csum_partial_copy_generic:

/* handle last odd byte */
.Lhandle_1:
- testl $1,%r10d
+ testl $1,%r15d
jz .Lende
xorl %ebx,%ebx
source
diff --git a/include/asm-x86_64/current.h b/include/asm-x86_64/current.h
index bc8adec..6675f2d 100644
--- a/include/asm-x86_64/current.h
+++ b/include/asm-x86_64/current.h
@@ -6,13 +6,7 @@ struct task_struct;

#include <asm/pda.h>

-static inline struct task_struct *get_current(void)
-{
- struct task_struct *t = read_pda(pcurrent);
- return t;
-}
-
-#define current get_current()
+register struct task_struct *current __asm__("%r10");

#else


2005-11-22 17:10:48

by Andi Kleen

[permalink] [raw]
Subject: Re: rfc/rft: use r10 as current on x86-64

On Tue, Nov 22, 2005 at 11:52:04AM -0500, Benjamin LaHaise wrote:
> Hello Andi et al,
>
> The patch below converts x86-64 to use r10 as the current pointer instead
> of gs:pcurrent. This results in a ~34KB savings in the code segment of
> the kernel. I've tested this with running a few regular applications,
> plus a few 32 bit binaries. If this patch is interesting, it probably
> makes sense to merge the thread info structure into the task_struct so
> that the assembly bits for syscall entry can be cleaned up. Comments?

I think you could get most of the benefit by just dropping
the volatile and "memory" from read_pda(). With that, gcc would
usually CSE current into a register and it would work essentially
the same way with only slightly more .text overhead, but r10 would still
be available.

Unfortunately, when that's done the kernel doesn't boot.
It's probably something silly, but I never had time to track it down.
Might want to look into that?

Looking at your patch, it might be enough to make sure that all users
of current that come after the changes you made in __switch_to use
some other way to access it (unfortunately, I know of no way to
make gcc flush all CSEd items without adding barriers
to the original get_current function).

-Andi

2005-11-22 17:29:07

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: rfc/rft: use r10 as current on x86-64

On Tue, Nov 22, 2005 at 06:10:42PM +0100, Andi Kleen wrote:
> I think you could get most of the benefit by just dropping
> the volatile and "memory" from read_pda(). With that gcc would
> usually CSE current into a register and it would would work essentially
> the same way with only minor more .text overhead, but r10 would be still
> available.
>
> Unfortunately when that's done then the kernel doesn't boot.
> It's probably something silly, but i never had time to track it down.
> Might want to look into that?

Even without fixing it, the reduction in kernel code size is still about 20K
smaller than what using a register achieves. The benefit of using a register is
that accessing a field in current can simply offset the register, compared
to the pda usage that requires loading current into a register before the
offset is performed. Using 'size' on the resulting kernels shows:

text data bss dec hex filename
4132289 819632 317256 5269177 5066b9 vmlinux.orig
4119951 819632 317256 5256839 503687 vmlinux.non-volatile
4097300 819560 317256 5234116 4fddc4 vmlinux.r10

I think that using a register makes more sense given the benefits.

-ben
--
"Time is what keeps everything from happening all at once." -- John Wheeler
Don't Email: <[email protected]>.

2005-11-22 17:44:17

by Brian Gerst

[permalink] [raw]
Subject: Re: rfc/rft: use r10 as current on x86-64

Andi Kleen wrote:
> On Tue, Nov 22, 2005 at 11:52:04AM -0500, Benjamin LaHaise wrote:
>> Hello Andi et al,
>>
>> The patch below converts x86-64 to use r10 as the current pointer instead
>> of gs:pcurrent. This results in a ~34KB savings in the code segment of
>> the kernel. I've tested this with running a few regular applications,
>> plus a few 32 bit binaries. If this patch is interesting, it probably
>> makes sense to merge the thread info structure into the task_struct so
>> that the assembly bits for syscall entry can be cleaned up. Comments?
>
> I think you could get most of the benefit by just dropping
> the volatile and "memory" from read_pda(). With that gcc would
> usually CSE current into a register and it would would work essentially
> the same way with only minor more .text overhead, but r10 would be still
> available.

It seems that GCC is reluctant to use the extended registers anyways
because of the rex prefix, so I don't think dedicating r10 to current
will cause that many problems.

--
Brian Gerst

2005-11-22 17:55:57

by Andreas Steinmetz

[permalink] [raw]
Subject: Re: rfc/rft: use r10 as current on x86-64

Brian Gerst wrote:
> Andi Kleen wrote:
>
>> On Tue, Nov 22, 2005 at 11:52:04AM -0500, Benjamin LaHaise wrote:
>>
>>> Hello Andi et al,
>>>
>>> The patch below converts x86-64 to use r10 as the current pointer
>>> instead of gs:pcurrent. This results in a ~34KB savings in the code
>>> segment of the kernel. I've tested this with running a few regular
>>> applications, plus a few 32 bit binaries. If this patch is
>>> interesting, it probably makes sense to merge the thread info
>>> structure into the task_struct so that the assembly bits for syscall
>>> entry can be cleaned up. Comments?
>>
>>
>> I think you could get most of the benefit by just dropping
>> the volatile and "memory" from read_pda(). With that gcc would
>> usually CSE current into a register and it would would work essentially
>> the same way with only minor more .text overhead, but r10 would be still
>> available.
>
>
> It seems that GCC is reluctant to use the extended registers anyways
> because of the rex prefix, so I don't think dedicating r10 to current
> will cause that many problems.

Be aware of assembler that uses r10, e.g.
arch/x86_64/crypto/aes-x86_64-asm.S
--
Andreas Steinmetz SPAMmers use [email protected]

2005-11-23 22:50:15

by Pavel Machek

[permalink] [raw]
Subject: Re: rfc/rft: use r10 as current on x86-64

Hi!

> The patch below converts x86-64 to use r10 as the current pointer instead
> of gs:pcurrent. This results in a ~34KB savings in the code segment of
> the kernel. I've tested this with running a few regular applications,
> plus a few 32 bit binaries. If this patch is interesting, it probably
> makes sense to merge the thread info structure into the task_struct so
> that the assembly bits for syscall entry can be cleaned up. Comments?

34KB smaller is nice, but isn't it also 30% slower? Plus, some inline
assembly *will* have %r10 hardcoded, no? I'd be worried about the crypto
code, for example.
Pavel
--
Thanks, Sharp!

2005-11-23 22:57:18

by Benjamin LaHaise

[permalink] [raw]
Subject: Re: rfc/rft: use r10 as current on x86-64

On Wed, Nov 23, 2005 at 11:48:03PM +0100, Pavel Machek wrote:
> 34KB smaller is nice, but is not it also 30% slower? Plus some inline
> assembly *will* have %r10 hardcoded, no? I'd be afraid around crypto
> code, for example.

It's not slower in any of the tests I've run. The crypto code needs a
tweak (the next version I send out will have that fix), and I'm still
working on getting thread_info to be relative to current, which should
save a bit more code. The assembly I've looked at tends to be better,
as gcc can access various fields by directly offsetting current instead
of the inline asm load then store that is otherwise needed.

-ben
--
"Time is what keeps everything from happening all at once." -- John Wheeler
Don't Email: <[email protected]>.