From: Ma Ling <[email protected]>
Modern CPUs use fast-string instructions to accelerate copy performance
by internally combining data into 128-bit chunks, so we update the
comments and code style accordingly.
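
(Illustration only, not part of this patch: a minimal C sketch of the
rep-movsq copy that copy_page_rep performs; the helper name and the C
form are made up, the kernel keeps this in assembly and selects it via
the alternatives mechanism on CPUs with X86_FEATURE_REP_GOOD.)

static inline void copy_page_rep_sketch(void *to, const void *from)
{
	unsigned long quads = 4096 / 8;

	/*
	 * rep movsq moves %rcx quadwords from (%rsi) to (%rdi); fast-string
	 * hardware combines them into wider (e.g. 128-bit) internal moves.
	 */
	asm volatile("rep movsq"
		     : "+D" (to), "+S" (from), "+c" (quads)
		     : : "memory");
}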
Signed-off-by: Ma Ling <[email protected]>
---
arch/x86/lib/copy_page_64.S | 119 +++++++++++++++++++++----------------------
1 files changed, 59 insertions(+), 60 deletions(-)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6b34d04..3da5527 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -5,91 +5,90 @@
#include <asm/alternative-asm.h>
ALIGN
-copy_page_c:
+copy_page_rep:
CFI_STARTPROC
- movl $4096/8,%ecx
- rep movsq
+ movl $4096/8, %ecx
+ rep movsq
ret
CFI_ENDPROC
-ENDPROC(copy_page_c)
+ENDPROC(copy_page_rep)
-/* Don't use streaming store because it's better when the target
- ends up in cache. */
-
-/* Could vary the prefetch distance based on SMP/UP */
+/*
+ Don't use streaming copy unless cpu indicate X86_FEATURE_REP_GOOD
+ Could vary the prefetch distance based on SMP/UP
+*/
ENTRY(copy_page)
CFI_STARTPROC
- subq $2*8,%rsp
+ subq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET 2*8
- movq %rbx,(%rsp)
+ movq %rbx, (%rsp)
CFI_REL_OFFSET rbx, 0
- movq %r12,1*8(%rsp)
+ movq %r12, 1*8(%rsp)
CFI_REL_OFFSET r12, 1*8
- movl $(4096/64)-5,%ecx
+ movl $(4096/64)-5, %ecx
.p2align 4
.Loop64:
- dec %rcx
+ dec %rcx
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
+ movq 0x8*0(%rsi), %rax
+ movq 0x8*1(%rsi), %rbx
+ movq 0x8*2(%rsi), %rdx
+ movq 0x8*3(%rsi), %r8
+ movq 0x8*4(%rsi), %r9
+ movq 0x8*5(%rsi), %r10
+ movq 0x8*6(%rsi), %r11
+ movq 0x8*7(%rsi), %r12
prefetcht0 5*64(%rsi)
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
+ movq %rax, 0x8*0(%rdi)
+ movq %rbx, 0x8*1(%rdi)
+ movq %rdx, 0x8*2(%rdi)
+ movq %r8, 0x8*3(%rdi)
+ movq %r9, 0x8*4(%rdi)
+ movq %r10, 0x8*5(%rdi)
+ movq %r11, 0x8*6(%rdi)
+ movq %r12, 0x8*7(%rdi)
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
+ leaq 64 (%rsi), %rsi
+ leaq 64 (%rdi), %rdi
- jnz .Loop64
+ jnz .Loop64
- movl $5,%ecx
+ movl $5, %ecx
.p2align 4
.Loop2:
- decl %ecx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
-
- leaq 64(%rdi),%rdi
- leaq 64(%rsi),%rsi
-
+ decl %ecx
+
+ movq 0x8*0(%rsi), %rax
+ movq 0x8*1(%rsi), %rbx
+ movq 0x8*2(%rsi), %rdx
+ movq 0x8*3(%rsi), %r8
+ movq 0x8*4(%rsi), %r9
+ movq 0x8*5(%rsi), %r10
+ movq 0x8*6(%rsi), %r11
+ movq 0x8*7(%rsi), %r12
+
+ movq %rax, 0x8*0(%rdi)
+ movq %rbx, 0x8*1(%rdi)
+ movq %rdx, 0x8*2(%rdi)
+ movq %r8, 0x8*3(%rdi)
+ movq %r9, 0x8*4(%rdi)
+ movq %r10, 0x8*5(%rdi)
+ movq %r11, 0x8*6(%rdi)
+ movq %r12, 0x8*7(%rdi)
+
+ leaq 64(%rdi), %rdi
+ leaq 64(%rsi), %rsi
jnz .Loop2
- movq (%rsp),%rbx
+ movq (%rsp), %rbx
CFI_RESTORE rbx
- movq 1*8(%rsp),%r12
+ movq 1*8(%rsp), %r12
CFI_RESTORE r12
- addq $2*8,%rsp
+ addq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET -2*8
ret
.Lcopy_page_end:
@@ -103,7 +102,7 @@ ENDPROC(copy_page)
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
- .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
+ .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
2:
.previous
.section .altinstructions,"a"
--
1.6.5.2
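
For reference, a rough C equivalent of the unrolled path above
(illustration only; the names are invented, and the real code keeps all
eight quadwords in registers and hand-tunes the prefetch distance):

#include <stddef.h>
#include <string.h>

#define PAGE_SIZE	4096
#define CHUNK		64	/* one 64-byte step = 8 quadword moves */

static void copy_page_sketch(char *to, const char *from)
{
	size_t i;

	/* .Loop64: copy 64 bytes per iteration, prefetching ~5 lines ahead */
	for (i = 0; i < PAGE_SIZE - 5 * CHUNK; i += CHUNK) {
		__builtin_prefetch(from + i + 5 * CHUNK);
		memcpy(to + i, from + i, CHUNK);
	}

	/* .Loop2: last 5 chunks, no prefetch beyond the source page */
	for (; i < PAGE_SIZE; i += CHUNK) {
		memcpy(to + i, from + i, CHUNK);
	}
}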
On Thu, Oct 11, 2012 at 08:28:44PM +0800, [email protected] wrote:
> From: Ma Ling <[email protected]>
>
> Modern CPUs use fast-string instructions to accelerate copy performance
> by internally combining data into 128-bit chunks, so we update the
> comments and code style accordingly.
>
> Signed-off-by: Ma Ling <[email protected]>
>
> ---
> arch/x86/lib/copy_page_64.S | 119 +++++++++++++++++++++----------------------
> 1 files changed, 59 insertions(+), 60 deletions(-)
>
> diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
> index 6b34d04..3da5527 100644
> --- a/arch/x86/lib/copy_page_64.S
> +++ b/arch/x86/lib/copy_page_64.S
> @@ -5,91 +5,90 @@
> #include <asm/alternative-asm.h>
>
> ALIGN
> -copy_page_c:
> +copy_page_rep:
> CFI_STARTPROC
> - movl $4096/8,%ecx
> - rep movsq
> + movl $4096/8, %ecx
> + rep movsq
> ret
> CFI_ENDPROC
> -ENDPROC(copy_page_c)
> +ENDPROC(copy_page_rep)
>
> -/* Don't use streaming store because it's better when the target
> - ends up in cache. */
> -
> -/* Could vary the prefetch distance based on SMP/UP */
> +/*
> + Don't use streaming copy unless cpu indicate X86_FEATURE_REP_GOOD
> + Could vary the prefetch distance based on SMP/UP
> +*/
Kernel comment style formatting is:
/*
* <something not trivial to see worth adding comment for>
* <optional second line of that statement>
*/
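i.e., for the comment added in this hunk, that would be:
/*
 * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
 * Could vary the prefetch distance based on SMP/UP.
 */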
>
> ENTRY(copy_page)
> CFI_STARTPROC
> - subq $2*8,%rsp
> + subq $2*8, %rsp
> CFI_ADJUST_CFA_OFFSET 2*8
> - movq %rbx,(%rsp)
> + movq %rbx, (%rsp)
> CFI_REL_OFFSET rbx, 0
> - movq %r12,1*8(%rsp)
> + movq %r12, 1*8(%rsp)
> CFI_REL_OFFSET r12, 1*8
>
> - movl $(4096/64)-5,%ecx
> + movl $(4096/64)-5, %ecx
> .p2align 4
> .Loop64:
> - dec %rcx
> + dec %rcx
Applying: Modify comments and clean up code.
/home/boris/kernel/linux-2.6/.git/rebase-apply/patch:51: space before tab in indent.
dec %rcx
warning: 1 line adds whitespace errors.
>
> - movq (%rsi), %rax
> - movq 8 (%rsi), %rbx
> - movq 16 (%rsi), %rdx
> - movq 24 (%rsi), %r8
> - movq 32 (%rsi), %r9
> - movq 40 (%rsi), %r10
> - movq 48 (%rsi), %r11
> - movq 56 (%rsi), %r12
> + movq 0x8*0(%rsi), %rax
> + movq 0x8*1(%rsi), %rbx
> + movq 0x8*2(%rsi), %rdx
> + movq 0x8*3(%rsi), %r8
> + movq 0x8*4(%rsi), %r9
> + movq 0x8*5(%rsi), %r10
> + movq 0x8*6(%rsi), %r11
> + movq 0x8*7(%rsi), %r12
[ … ]
--
Regards/Gruss,
Boris.