LinuxLists.cc - [PATCH RFC 1/2 ] [x86] Optimize copy-page by reducing impact from HW prefetch

2011-06-28 15:23:27

Subject: [PATCH RFC 1/2 ] [x86] Optimize copy-page by reducing impact from HW prefetch

From: Ma Ling <[email protected]>

Caches exploit a program's temporal and spatial locality to bridge
the processor-memory performance gap, and hardware prefetch is very important
for improving performance by reducing cache misses. Modern CPU micro-architectures
mainly support two kinds of prefetch mechanism in the L1 data cache:

a. Data cache unit (DCU) prefetcher. Spatial locality suggests providing
adjacent data while the current data is being handled. A larger cache line size
is one option, but it would cause more cached data to be evicted and increase
load latency, so the hardware simply prefetches the next line when the current
line is accessed. This mode only prefetches data at ascending addresses.

b. Instruction pointer (IP)-based strided prefetcher. Based on the address of
the load/write instruction, this mechanism predicts which data to prefetch using
an adaptive stride, covering both ascending and descending addresses.

DCU mode works well when the program spends longer operating on the current data
than it takes to prefetch the next line. The copy-page function breaks that
assumption, so DCU mode is hardly helpful there, especially when we add software
prefetch and the data is already in cache: the extra bus traffic seriously impacts performance.

In this patch we introduce a backward copy to avoid the impact of the HW
prefetcher (the DCU prefetcher), and simplify the original code.
Performance on Atom improves by about 11% and 8% in the hot-cache and cold-cache cases, respectively.
(We used our own micro-benchmark, and will do further testing according to your requirements.)

Thanks
Ling

---
In this version we re-use prefetcht0 for the Atom CPU, although prefetchnta is better on SNB (Sandy Bridge).

arch/x86/lib/copy_page_64.S | 140 +++++++++++++++++++-----------------------
1 files changed, 63 insertions(+), 77 deletions(-)

diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 45f7db7..35e08fe 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,4 +1,5 @@
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+/* Updated 2011 by Ma Ling to introduce backward copy */

#include <linux/linkage.h>
#include <asm/dwarf2.h>
@@ -13,89 +14,74 @@ copy_page_rep:
CFI_ENDPROC
ENDPROC(copy_page_rep)

-/*
- Don't use streaming copy unless cpu indicate X86_FEATURE_REP_GOOD
- Could vary the prefetch distance based on SMP/UP
-*/
-
+/*
+ * Don't use streaming copy unless cpu indicate X86_FEATURE_REP_GOOD.
+ * By backward copy we manage to reduce impact from HW prefetch
+ * when data is in L1 cache, and get benefit when data is not in L1 cache.
+ */
ENTRY(copy_page)
CFI_STARTPROC
- subq $3*8, %rsp
- CFI_ADJUST_CFA_OFFSET 3*8
- movq %rbx, (%rsp)
- CFI_REL_OFFSET rbx, 0
- movq %r12, 1*8(%rsp)
- CFI_REL_OFFSET r12, 1*8
- movq %r13, 2*8(%rsp)
- CFI_REL_OFFSET r13, 2*8
-
- movl $(4096/64)-5, %ecx
- .p2align 4
+ lea 4096(%rsi), %rsi
+ lea 4096(%rdi), %rdi
+ mov $(4096/64)-5, %cl
+ mov $5, %dl
+ /*
+ * Nop force following instruction to be 16 bytes aligned.
+ */
+ nop
.Loop64:
- dec %rcx
-
- movq 0x8*0(%rsi), %rax
- movq 0x8*1(%rsi), %rbx
- movq 0x8*2(%rsi), %rdx
- movq 0x8*3(%rsi), %r8
- movq 0x8*4(%rsi), %r9
- movq 0x8*5(%rsi), %r10
- movq 0x8*6(%rsi), %r11
- movq 0x8*7(%rsi), %r12
-
- prefetcht0 5*64(%rsi)
-
- movq %rax, 0x8*0(%rdi)
- movq %rbx, 0x8*1(%rdi)
- movq %rdx, 0x8*2(%rdi)
- movq %r8, 0x8*3(%rdi)
- movq %r9, 0x8*4(%rdi)
- movq %r10, 0x8*5(%rdi)
- movq %r11, 0x8*6(%rdi)
- movq %r12, 0x8*7(%rdi)
-
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
-
- jnz .Loop64
-
- movl $5, %ecx
- .p2align 4
+ prefetcht0 -5*64(%rsi)
+ decb %cl
+
+ movq -0x8*1(%rsi), %rax
+ movq -0x8*2(%rsi), %r8
+ movq -0x8*3(%rsi), %r9
+ movq -0x8*4(%rsi), %r10
+ movq %rax, -0x8*1(%rdi)
+ movq %r8, -0x8*2(%rdi)
+ movq %r9, -0x8*3(%rdi)
+ movq %r10, -0x8*4(%rdi)
+
+ movq -0x8*5(%rsi), %rax
+ movq -0x8*6(%rsi), %r8
+ movq -0x8*7(%rsi), %r9
+ movq -0x8*8(%rsi), %r10
+ leaq -64(%rsi), %rsi
+ movq %rax, -0x8*5(%rdi)
+ movq %r8, -0x8*6(%rdi)
+ movq %r9, -0x8*7(%rdi)
+ movq %r10, -0x8*8(%rdi)
+ leaq -64(%rdi), %rdi
+
+ jnz .Loop64
+
.Loop2:
- decl %ecx
-
- movq 0x8*0(%rsi), %rax
- movq 0x8*1(%rsi), %rbx
- movq 0x8*2(%rsi), %rdx
- movq 0x8*3(%rsi), %r8
- movq 0x8*4(%rsi), %r9
- movq 0x8*5(%rsi), %r10
- movq 0x8*6(%rsi), %r11
- movq 0x8*7(%rsi), %r12
-
- movq %rax, 0x8*0(%rdi)
- movq %rbx, 0x8*1(%rdi)
- movq %rdx, 0x8*2(%rdi)
- movq %r8, 0x8*3(%rdi)
- movq %r9, 0x8*4(%rdi)
- movq %r10, 0x8*5(%rdi)
- movq %r11, 0x8*6(%rdi)
- movq %r12, 0x8*7(%rdi)
+ decb %dl
+
+ movq -0x8*1(%rsi), %rax
+ movq -0x8*2(%rsi), %r8
+ movq -0x8*3(%rsi), %r9
+ movq -0x8*4(%rsi), %r10
+ movq %rax, -0x8*1(%rdi)
+ movq %r8, -0x8*2(%rdi)
+ movq %r9, -0x8*3(%rdi)
+ movq %r10, -0x8*4(%rdi)
+
+ movq -0x8*5(%rsi), %rax
+ movq -0x8*6(%rsi), %r8
+ movq -0x8*7(%rsi), %r9
+ movq -0x8*8(%rsi), %r10
+ leaq -64(%rsi), %rsi
+ movq %rax, -0x8*5(%rdi)
+ movq %r8, -0x8*6(%rdi)
+ movq %r9, -0x8*7(%rdi)
+ movq %r10, -0x8*8(%rdi)
+ leaq -64(%rdi), %rdi
+
+ jnz .Loop2

- leaq 64(%rdi), %rdi
- leaq 64(%rsi), %rsi
-
- jnz .Loop2
-
- movq (%rsp), %rbx
- CFI_RESTORE rbx
- movq 1*8(%rsp), %r12
- CFI_RESTORE r12
- movq 2*8(%rsp), %r13
- CFI_RESTORE r13
- addq $3*8, %rsp
- CFI_ADJUST_CFA_OFFSET -3*8
ret
+
.Lcopy_page_end:
CFI_ENDPROC
ENDPROC(copy_page)
--
1.6.5.2

2011-06-29 01:25:31

by tip-bot for Ma Ling

[permalink] [raw]

Subject: RE: [PATCH RFC 1/2 ] [x86] Optimize copy-page by reducing impact from HW prefetch

The subject of this patch should be [PATCH RFC 2/2 ] [x86] Optimize copy-page by reducing impact from HW prefetch.
I will change it in the next version, together with your comments.

Thanks
Ling

> -----Original Message-----
> From: Ma, Ling
> Sent: Wednesday, June 29, 2011 6:37 AM
> To: [email protected]
> Cc: [email protected]; [email protected]; [email protected]; Ma,
> Ling
> Subject: [PATCH RFC 1/2 ] [x86] Optimize copy-page by reducing impact
> from HW prefetch
>
> From: Ma Ling <[email protected]>
>
> Program's temporal & spatial locality introduce cache unit to overcome
> the processor-memory performance gap, hardware prefetch is very
> important
> to improve performance by reducing cache miss. Modern CPU micro-
> architecture
> mainly support two kinds of prefetch mechanism in L1 data cache:
>
> a. Data cache unit (DCU) prefetcher. Data spatial locality ask us to
> provide
> adjacent data while handling current data. larger cache line size
> is one choice, but it would cause more cached data to be evicted and
> latency
> to load, so we simply prefetch next line when accessing current data.
> This mode only prefetch data of ascending address.
>
> b. Instruction pointer (IP)- based strided prefetcher. Based on
> Load/write
> instruction address the mechanism predicts which data to prefetch with
> adaptive stride,
> including ascending and descending address
>
> DCU mode is good when time program data operation spend is longer than
> that of
> prefetch next line, however copy-page function breaks the assumption,
> DCU mode is hardly helpful, specially we append software prefetch and
> data is
> in cache, so bus traffic is more busy that impact performance seriously.
>
> In this patch we introduce backward copy to successfully avoid HW
> prefetch
> impact(DCU prefetcher), and simplify original code.
> The performance on atom is improved about 11%, 8% on hot/cold-cache
> case respectively.
> (We use our micro-benchmark, and will do further test according to your
> requirement)
>
> Thanks
> Ling
>
> ---
> In this version we re-use prefetcht0 for atom cpu, although prefetchnta
> is better on snb.
>
> arch/x86/lib/copy_page_64.S | 140 +++++++++++++++++++----------------
> -------
> 1 files changed, 63 insertions(+), 77 deletions(-)
>
> diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
> index 45f7db7..35e08fe 100644
> --- a/arch/x86/lib/copy_page_64.S
> +++ b/arch/x86/lib/copy_page_64.S
> @@ -1,4 +1,5 @@
> /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
> +/* Updated 2011 by Ma Ling to introduce backward copy */
>
> #include <linux/linkage.h>
> #include <asm/dwarf2.h>
> @@ -13,89 +14,74 @@ copy_page_rep:
> CFI_ENDPROC
> ENDPROC(copy_page_rep)
>
> -/*
> - Don't use streaming copy unless cpu indicate X86_FEATURE_REP_GOOD
> - Could vary the prefetch distance based on SMP/UP
> -*/
> -
> +/*
> + * Don't use streaming copy unless cpu indicate X86_FEATURE_REP_GOOD.
> + * By backward copy we manage to reduce impact from HW prefetch
> + * when data is in L1 cache, and get benefit when data is not in L1
> cache.
> + */
> ENTRY(copy_page)
> CFI_STARTPROC
> - subq $3*8, %rsp
> - CFI_ADJUST_CFA_OFFSET 3*8
> - movq %rbx, (%rsp)
> - CFI_REL_OFFSET rbx, 0
> - movq %r12, 1*8(%rsp)
> - CFI_REL_OFFSET r12, 1*8
> - movq %r13, 2*8(%rsp)
> - CFI_REL_OFFSET r13, 2*8
> -
> - movl $(4096/64)-5, %ecx
> - .p2align 4
> + lea 4096(%rsi), %rsi
> + lea 4096(%rdi), %rdi
> + mov $(4096/64)-5, %cl
> + mov $5, %dl
> + /*
> + * Nop force following instruction to be 16 bytes aligned.
> + */
> + nop
> .Loop64:
> - dec %rcx
> -
> - movq 0x8*0(%rsi), %rax
> - movq 0x8*1(%rsi), %rbx
> - movq 0x8*2(%rsi), %rdx
> - movq 0x8*3(%rsi), %r8
> - movq 0x8*4(%rsi), %r9
> - movq 0x8*5(%rsi), %r10
> - movq 0x8*6(%rsi), %r11
> - movq 0x8*7(%rsi), %r12
> -
> - prefetcht0 5*64(%rsi)
> -
> - movq %rax, 0x8*0(%rdi)
> - movq %rbx, 0x8*1(%rdi)
> - movq %rdx, 0x8*2(%rdi)
> - movq %r8, 0x8*3(%rdi)
> - movq %r9, 0x8*4(%rdi)
> - movq %r10, 0x8*5(%rdi)
> - movq %r11, 0x8*6(%rdi)
> - movq %r12, 0x8*7(%rdi)
> -
> - leaq 64 (%rsi), %rsi
> - leaq 64 (%rdi), %rdi
> -
> - jnz .Loop64
> -
> - movl $5, %ecx
> - .p2align 4
> + prefetcht0 -5*64(%rsi)
> + decb %cl
> +
> + movq -0x8*1(%rsi), %rax
> + movq -0x8*2(%rsi), %r8
> + movq -0x8*3(%rsi), %r9
> + movq -0x8*4(%rsi), %r10
> + movq %rax, -0x8*1(%rdi)
> + movq %r8, -0x8*2(%rdi)
> + movq %r9, -0x8*3(%rdi)
> + movq %r10, -0x8*4(%rdi)
> +
> + movq -0x8*5(%rsi), %rax
> + movq -0x8*6(%rsi), %r8
> + movq -0x8*7(%rsi), %r9
> + movq -0x8*8(%rsi), %r10
> + leaq -64(%rsi), %rsi
> + movq %rax, -0x8*5(%rdi)
> + movq %r8, -0x8*6(%rdi)
> + movq %r9, -0x8*7(%rdi)
> + movq %r10, -0x8*8(%rdi)
> + leaq -64(%rdi), %rdi
> +
> + jnz .Loop64
> +
> .Loop2:
> - decl %ecx
> -
> - movq 0x8*0(%rsi), %rax
> - movq 0x8*1(%rsi), %rbx
> - movq 0x8*2(%rsi), %rdx
> - movq 0x8*3(%rsi), %r8
> - movq 0x8*4(%rsi), %r9
> - movq 0x8*5(%rsi), %r10
> - movq 0x8*6(%rsi), %r11
> - movq 0x8*7(%rsi), %r12
> -
> - movq %rax, 0x8*0(%rdi)
> - movq %rbx, 0x8*1(%rdi)
> - movq %rdx, 0x8*2(%rdi)
> - movq %r8, 0x8*3(%rdi)
> - movq %r9, 0x8*4(%rdi)
> - movq %r10, 0x8*5(%rdi)
> - movq %r11, 0x8*6(%rdi)
> - movq %r12, 0x8*7(%rdi)
> + decb %dl
> +
> + movq -0x8*1(%rsi), %rax
> + movq -0x8*2(%rsi), %r8
> + movq -0x8*3(%rsi), %r9
> + movq -0x8*4(%rsi), %r10
> + movq %rax, -0x8*1(%rdi)
> + movq %r8, -0x8*2(%rdi)
> + movq %r9, -0x8*3(%rdi)
> + movq %r10, -0x8*4(%rdi)
> +
> + movq -0x8*5(%rsi), %rax
> + movq -0x8*6(%rsi), %r8
> + movq -0x8*7(%rsi), %r9
> + movq -0x8*8(%rsi), %r10
> + leaq -64(%rsi), %rsi
> + movq %rax, -0x8*5(%rdi)
> + movq %r8, -0x8*6(%rdi)
> + movq %r9, -0x8*7(%rdi)
> + movq %r10, -0x8*8(%rdi)
> + leaq -64(%rdi), %rdi
> +
> + jnz .Loop2
>
> - leaq 64(%rdi), %rdi
> - leaq 64(%rsi), %rsi
> -
> - jnz .Loop2
> -
> - movq (%rsp), %rbx
> - CFI_RESTORE rbx
> - movq 1*8(%rsp), %r12
> - CFI_RESTORE r12
> - movq 2*8(%rsp), %r13
> - CFI_RESTORE r13
> - addq $3*8, %rsp
> - CFI_ADJUST_CFA_OFFSET -3*8
> ret
> +
> .Lcopy_page_end:
> CFI_ENDPROC
> ENDPROC(copy_page)
> --
> 1.6.5.2