From: Ma Ling <[email protected]>
Load and write operations account for about 35% and 10% of instructions
respectively in most industry benchmarks. A fetched, 16-byte-aligned block
of code contains about 4 instructions, implying 1.4 (0.35 * 4) loads and
0.4 (0.10 * 4) writes per fetch. Modern CPUs can issue 2 loads and 1 write
per cycle, so write throughput is the bottleneck for memcpy and copy_page,
and some simpler CPUs support only one memory operation per cycle. It is
therefore enough to issue one read and one write instruction per cycle,
and we can avoid saving registers.
This patch also re-arranges the instruction sequence to improve performance.
On Atom, performance improves by about 11% and 9% in the hot-cache and
cold-cache cases respectively.
Signed-off-by: Ma Ling <[email protected]>
---
arch/x86/lib/copy_page_64.S | 103 +++++++++++++++++-------------------------
1 files changed, 42 insertions(+), 61 deletions(-)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 3da5527..13c97f4 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -20,76 +20,57 @@ ENDPROC(copy_page_rep)
ENTRY(copy_page)
CFI_STARTPROC
- subq $2*8, %rsp
- CFI_ADJUST_CFA_OFFSET 2*8
- movq %rbx, (%rsp)
- CFI_REL_OFFSET rbx, 0
- movq %r12, 1*8(%rsp)
- CFI_REL_OFFSET r12, 1*8
+ mov $(4096/64)-5, %ecx
- movl $(4096/64)-5, %ecx
- .p2align 4
.Loop64:
- dec %rcx
-
- movq 0x8*0(%rsi), %rax
- movq 0x8*1(%rsi), %rbx
- movq 0x8*2(%rsi), %rdx
- movq 0x8*3(%rsi), %r8
- movq 0x8*4(%rsi), %r9
- movq 0x8*5(%rsi), %r10
- movq 0x8*6(%rsi), %r11
- movq 0x8*7(%rsi), %r12
-
prefetcht0 5*64(%rsi)
-
- movq %rax, 0x8*0(%rdi)
- movq %rbx, 0x8*1(%rdi)
- movq %rdx, 0x8*2(%rdi)
- movq %r8, 0x8*3(%rdi)
- movq %r9, 0x8*4(%rdi)
- movq %r10, 0x8*5(%rdi)
- movq %r11, 0x8*6(%rdi)
- movq %r12, 0x8*7(%rdi)
-
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
-
+ decb %cl
+
+ movq 0x8*0(%rsi), %r10
+ movq 0x8*1(%rsi), %rax
+ movq 0x8*2(%rsi), %r8
+ movq 0x8*3(%rsi), %r9
+ movq %r10, 0x8*0(%rdi)
+ movq %rax, 0x8*1(%rdi)
+ movq %r8, 0x8*2(%rdi)
+ movq %r9, 0x8*3(%rdi)
+
+ movq 0x8*4(%rsi), %r10
+ movq 0x8*5(%rsi), %rax
+ movq 0x8*6(%rsi), %r8
+ movq 0x8*7(%rsi), %r9
+ leaq 64(%rsi), %rsi
+ movq %r10, 0x8*4(%rdi)
+ movq %rax, 0x8*5(%rdi)
+ movq %r8, 0x8*6(%rdi)
+ movq %r9, 0x8*7(%rdi)
+ leaq 64(%rdi), %rdi
jnz .Loop64
- movl $5, %ecx
- .p2align 4
+ mov $5, %dl
.Loop2:
- decl %ecx
-
- movq 0x8*0(%rsi), %rax
- movq 0x8*1(%rsi), %rbx
- movq 0x8*2(%rsi), %rdx
- movq 0x8*3(%rsi), %r8
- movq 0x8*4(%rsi), %r9
- movq 0x8*5(%rsi), %r10
- movq 0x8*6(%rsi), %r11
- movq 0x8*7(%rsi), %r12
-
- movq %rax, 0x8*0(%rdi)
- movq %rbx, 0x8*1(%rdi)
- movq %rdx, 0x8*2(%rdi)
- movq %r8, 0x8*3(%rdi)
- movq %r9, 0x8*4(%rdi)
- movq %r10, 0x8*5(%rdi)
- movq %r11, 0x8*6(%rdi)
- movq %r12, 0x8*7(%rdi)
-
- leaq 64(%rdi), %rdi
+ decb %dl
+ movq 0x8*0(%rsi), %r10
+ movq 0x8*1(%rsi), %rax
+ movq 0x8*2(%rsi), %r8
+ movq 0x8*3(%rsi), %r9
+ movq %r10, 0x8*0(%rdi)
+ movq %rax, 0x8*1(%rdi)
+ movq %r8, 0x8*2(%rdi)
+ movq %r9, 0x8*3(%rdi)
+
+ movq 0x8*4(%rsi), %r10
+ movq 0x8*5(%rsi), %rax
+ movq 0x8*6(%rsi), %r8
+ movq 0x8*7(%rsi), %r9
leaq 64(%rsi), %rsi
+ movq %r10, 0x8*4(%rdi)
+ movq %rax, 0x8*5(%rdi)
+ movq %r8, 0x8*6(%rdi)
+ movq %r9, 0x8*7(%rdi)
+ leaq 64(%rdi), %rdi
jnz .Loop2
- movq (%rsp), %rbx
- CFI_RESTORE rbx
- movq 1*8(%rsp), %r12
- CFI_RESTORE r12
- addq $2*8, %rsp
- CFI_ADJUST_CFA_OFFSET -2*8
ret
.Lcopy_page_end:
CFI_ENDPROC
--
1.6.5.2
[email protected] writes:
> From: Ma Ling <[email protected]>
>
> Load and write operations account for about 35% and 10% of instructions
> respectively in most industry benchmarks. A fetched, 16-byte-aligned block
> of code contains about 4 instructions, implying 1.4 (0.35 * 4) loads and
> 0.4 (0.10 * 4) writes per fetch. Modern CPUs can issue 2 loads and 1 write
> per cycle, so write throughput is the bottleneck for memcpy and copy_page,
> and some simpler CPUs support only one memory operation per cycle. It is
> therefore enough to issue one read and one write instruction per cycle,
> and we can avoid saving registers.
I don't think "saving registers" is a useful goal here.
>
> This patch also re-arranges the instruction sequence to improve
> performance. On Atom, performance improves by about 11% and 9% in the
> hot-cache and cold-cache cases respectively.
That's great, but the question is what happens to the older CPUs that
also use this sequence. It may be safer to add a new variant for Atom,
unless you can benchmark those too.
-Andi
--
[email protected] -- Speaking for myself only
On Thu, Oct 11, 2012 at 08:29:08PM +0800, [email protected] wrote:
> From: Ma Ling <[email protected]>
>
> Load and write operations account for about 35% and 10% of instructions
> respectively in most industry benchmarks. A fetched, 16-byte-aligned block
> of code contains about 4 instructions, implying 1.4 (0.35 * 4) loads and
> 0.4 (0.10 * 4) writes per fetch. Modern CPUs can issue 2 loads and 1 write
> per cycle, so write throughput is the bottleneck for memcpy and copy_page,
> and some simpler CPUs support only one memory operation per cycle. It is
> therefore enough to issue one read and one write instruction per cycle,
> and we can avoid saving registers.
So is that also true for AMD CPUs?
>
> I don't think "saving registers" is a useful goal here.
Ling: issuing one read and one write op per cycle is enough for copy_page or memcpy performance,
so we can avoid the register save and restore operations.
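For illustration, the shape of the new inner loop in C: each 32-byte chunk needs only four live temporaries, so the asm version fits in the caller-clobbered registers (rax, r8, r9, r10) and no longer has to save and restore rbx/r12. This is only a conceptual sketch of the pattern, not the kernel code itself:

/* Conceptual C sketch of the re-arranged loop: four 8-byte temporaries
 * per 32-byte chunk, with the stores issued right after the loads that
 * feed them. */
static void copy_page_interleaved(unsigned long *dst, const unsigned long *src)
{
	unsigned long a, b, c, d;
	int i;

	for (i = 0; i < 4096 / 8; i += 8) {
		a = src[i + 0]; b = src[i + 1];
		c = src[i + 2]; d = src[i + 3];
		dst[i + 0] = a; dst[i + 1] = b;
		dst[i + 2] = c; dst[i + 3] = d;

		a = src[i + 4]; b = src[i + 5];
		c = src[i + 6]; d = src[i + 7];
		dst[i + 4] = a; dst[i + 5] = b;
		dst[i + 6] = c; dst[i + 7] = d;
	}
}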
> That's great, but the question is what happens to the older CPUs that
> also use this sequence. It may be safer to add a new variant for Atom,
> unless you can benchmark those too.
Ling:
I tested the new and original versions on Core2; the patch improved performance by about 9%.
Although Core2 has an out-of-order pipeline, which weakens the instruction-sequence requirement,
because of ROB size limitations the new patch issues the write operations earlier,
gets more opportunity to execute the write and load ops in parallel, and gives a better result.
Attached core2-cpu-info (I have no older machine)
Thanks
Ling
> So is that also true for AMD CPUs?
Although Bulldozer puts 32-byte instruction fetches into decoupled 16-byte entry buffers,
it still decodes 4 instructions per cycle, so 4 instructions will be fed to the execution units and
2 loads and 1 write can be issued per cycle.
Thanks
Ling
On Fri, Oct 12, 2012 at 03:37:50AM +0000, Ma, Ling wrote:
> > So is that also true for AMD CPUs?
> Although Bulldozer puts 32-byte instruction fetches into decoupled 16-byte entry buffers,
> it still decodes 4 instructions per cycle, so 4 instructions will be fed to the execution units and
> 2 loads and 1 write can be issued per cycle.
I'd be very interested in which benchmarks you're seeing that perf
improvement on Atom with; and who knows, maybe I could find time to run them
on Bulldozer and see how your patch behaves there :-).
Thanks.
--
Regards/Gruss,
Boris.
> I'd be very interested in which benchmarks you're seeing that perf
> improvement on Atom with; and who knows, maybe I could find time to run
> them on Bulldozer and see how your patch behaves there :-).
I use another benchmark from gcc; it contains a lot of code, so I extracted one simple benchmark, which you may use to test (cc -o copy_page copy_page.c).
My initial result shows the new copy_page version is still better on a Bulldozer machine, but because the machine is a first release, please verify the result.
And CC to Ian.
Thanks
Ling
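The copy_page.c used above is not reproduced in this thread; a minimal sketch of what such a TPT-style harness might look like (rdtsc timing of a 4096-byte copy, with a plain memcpy standing in for the asm variants, and an output format modeled on the result lines below) would be roughly:

/* bench.c - minimal sketch of a TPT-style harness; build with
 * "cc -O2 -o bench bench.c".  memcpy stands in here for the asm
 * copy_page variants that the real benchmark measures. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define LEN 4096

static unsigned char src[LEN] __attribute__((aligned(4096)));
static unsigned char dst[LEN] __attribute__((aligned(4096)));

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

static void copy_page_c(void *to, const void *from)
{
	memcpy(to, from, LEN);
}

static uint64_t time_copy(void (*copy)(void *, const void *))
{
	uint64_t best = ~0ULL;
	int i;

	/* best-of-N to filter out interrupts and other noise */
	for (i = 0; i < 64; i++) {
		uint64_t start = rdtsc(), cycles;

		copy(dst, src);
		cycles = rdtsc() - start;
		if (cycles < best)
			best = cycles;
	}
	return best;
}

int main(void)
{
	memset(src, 0x5a, LEN);
	printf("TPT: Len %d, alignment 0/ 0: %llu\n", LEN,
	       (unsigned long long)time_copy(copy_page_c));
	return 0;
}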
> I tested the new and original versions on Core2; the patch improved performance by about 9%.
That's not useful because Core2 doesn't use this variant; it uses the
rep string variant. The primary user is P4.
> Although Core2 has an out-of-order pipeline, which weakens the instruction-sequence requirement,
> because of ROB size limitations the new patch issues the write operations earlier,
> gets more opportunity to execute the write and load ops in parallel, and gives a better result.
> Attached core2-cpu-info (I have no older machine)
If you can't test the CPUs that run this code, I think it's safer to add
a new variant for Atom rather than change the existing, well-tested code.
Otherwise you risk performance regressions on these older CPUs.
-Andi
--
[email protected] -- Speaking for myself only.
> If you can't test the CPUs that run this code, I think it's safer to add
> a new variant for Atom rather than change the existing, well-tested code.
> Otherwise you risk performance regressions on these older CPUs.
I found one older machine and tested the code on it; the results are almost the same, as shown below (cpu info attached).
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 2252 2218
TPT: Len 4096, alignment 0/ 0: 2244 2193
TPT: Len 4096, alignment 0/ 0: 2261 2227
TPT: Len 4096, alignment 0/ 0: 2235 2244
TPT: Len 4096, alignment 0/ 0: 2261 2184
Thanks
Ling
On Fri, Oct 12, 2012 at 02:54:54PM +0000, Ma, Ling wrote:
> > If you can't test the CPUs that run this code, I think it's safer to add
> > a new variant for Atom rather than change the existing, well-tested code.
> > Otherwise you risk performance regressions on these older CPUs.
>
> I found one older machine and tested the code on it; the results are almost the same, as shown below (cpu info attached).
Was that a P4 (family 15)?
Those were the main users. There were a few others, but they are obscure
(early steppings of K8)
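For anyone who wants to double-check what family a test box reports, a small userspace sketch using GCC's cpuid.h (family 15 covers both P4 and K8):

/* cpufamily.c - print the CPU display family (base + extended). */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, family;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	family = (eax >> 8) & 0xf;
	if (family == 0xf)		/* extended family field applies */
		family += (eax >> 20) & 0xff;

	printf("cpu family\t: %u\n", family);
	return 0;
}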
-Andi
--
[email protected] -- Speaking for myself only.
On Fri, Oct 12, 2012 at 09:07:43AM +0000, Ma, Ling wrote:
> > I'd be very interested in which benchmarks you're seeing that perf
> > improvement on Atom with; and who knows, maybe I could find time to run
> > them on Bulldozer and see how your patch behaves there :-).
> I use another benchmark from gcc; it contains a lot of code, so I
> extracted one simple benchmark, which you may use to test (cc -o
> copy_page copy_page.c). My initial result shows the new copy_page
> version is still better on a Bulldozer machine, but because the machine
> is a first release, please verify the result. And CC to Ian.
Right, so the benchmark shows around a 20% speedup on Bulldozer, but this is a
microbenchmark and, before pursuing this further, we need to verify whether
this brings any palpable speedup with a real benchmark, I don't know,
kernbench, netbench, whatever. Even something as boring as a kernel build.
And probably check for perf regressions on the rest of the uarches.
Thanks.
--
Regards/Gruss,
Boris.
Here are some Phenom results for that benchmark. The average time
increases from 700 to 760 cycles (+8.6%).
vendor_id : AuthenticAMD
cpu family : 16
model : 2
model name : AMD Phenom(tm) 9850 Quad-Core Processor
stepping : 3
microcode : 0x1000083
cpu MHz : 2500.210
cache size : 512 KB
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs hw_pstate npt lbrv svm_lock
bogomips : 5000.42
TLB size : 1024 4K pages
clflush size : 64
cache_alignment : 64
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 678 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 667 760
TPT: Len 4096, alignment 0/ 0: 673 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 667 760
TPT: Len 4096, alignment 0/ 0: 673 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 671 760
TPT: Len 4096, alignment 0/ 0: 673 760
TPT: Len 4096, alignment 0/ 0: 671 760
TPT: Len 4096, alignment 0/ 0: 709 760
TPT: Len 4096, alignment 0/ 0: 708 760
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 667 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 671 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 678 760
TPT: Len 4096, alignment 0/ 0: 709 758
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 709 759
TPT: Len 4096, alignment 0/ 0: 710 760
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 680 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 667 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 709 760
TPT: Len 4096, alignment 0/ 0: 709 759
TPT: Len 4096, alignment 0/ 0: 710 760
copy_page_org copy_page_new
TPT: Len 4096, alignment 0/ 0: 678 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
TPT: Len 4096, alignment 0/ 0: 710 760
On Fri, Oct 12, 2012 at 05:02:57PM -0400, George Spelvin wrote:
> Here are some Phenom results for that benchmark. The average time
> increases from 700 to 760 cycles (+8.6%).
I was afraid something like that would show up.
Btw, in looking at this more and IINM, we use the REP MOVSQ version on
AMD anyway because of X86_FEATURE_REP_GOOD being set on some K8 and
everything from F10h on.
So, actually this µbenchmark should be comparing the REP MOVSQ case too
and the changes to the unrolled copy_page shouldn't concern AMD boxes
actually...
Hmm.
--
Regards/Gruss,
Boris.
On Fri, Oct 12, 2012 at 08:04:11PM +0200, Borislav Petkov wrote:
> Right, so the benchmark shows around a 20% speedup on Bulldozer, but this
> is a microbenchmark and, before pursuing this further, we need to verify
> whether this brings any palpable speedup with a real benchmark, I
> don't know, kernbench, netbench, whatever. Even something as boring as a
> kernel build. And probably check for perf regressions on the rest of
> the uarches.
Ok, so to summarize, on AMD we're using REP MOVSQ which is even
faster than the unrolled version. I've added the REP MOVSQ version
to the µbenchmark. It nicely validates that we're correctly setting
X86_FEATURE_REP_GOOD on everything >= F10h and some K8s.
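A REP MOVSQ page copy for such a user-space µbenchmark can be as simple as the sketch below (the in-kernel copy_page_rep boils down to "movl $4096/8, %ecx; rep movsq"); the name and exact form here are only an assumption, not the code actually used in the benchmark:

/* Sketch of a user-space REP MOVSQ page copy for the µbenchmark;
 * mirrors what the kernel's copy_page_rep does. */
static void copy_page_rep_movsq(void *to, const void *from)
{
	unsigned long cnt = 4096 / 8;

	asm volatile("rep movsq"
		     : "+D" (to), "+S" (from), "+c" (cnt)
		     : : "memory");
}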
So, to answer Konrad's question: those patches don't concern AMD
machines.
Thanks.
--
Regards/Gruss,
Boris.
Thanks Boris!
So the patch is helpful and has no impact on other/older machines;
I will re-send a new version according to the comments.
Any further comments are appreciated!
Regards
Ling
Just for everyone's information, here are results from the updated benchmark
on the same Phenom. The REP MOVSQ code is indeed much faster.
vendor_id : AuthenticAMD
cpu family : 16
model : 2
model name : AMD Phenom(tm) 9850 Quad-Core Processor
stepping : 3
microcode : 0x1000083
cpu MHz : 2500.210
cache size : 512 KB
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl nonstop_tsc extd_apicid pni monitor cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs hw_pstate npt lbrv svm_lock
bogomips : 5000.42
TLB size : 1024 4K pages
clflush size : 64
cache_alignment : 64
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 708 758 611
TPT: Len 4096, alignment 0/ 0: 672 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 611
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 694 759 611
TPT: Len 4096, alignment 0/ 0: 672 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 611
TPT: Len 4096, alignment 0/ 0: 708 759 611
TPT: Len 4096, alignment 0/ 0: 708 757 611
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 697 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 611
TPT: Len 4096, alignment 0/ 0: 708 757 611
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 703 758 612
TPT: Len 4096, alignment 0/ 0: 709 758 611
TPT: Len 4096, alignment 0/ 0: 709 757 611
TPT: Len 4096, alignment 0/ 0: 709 759 613
TPT: Len 4096, alignment 0/ 0: 709 759 611
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 669 758 613
TPT: Len 4096, alignment 0/ 0: 671 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 611
TPT: Len 4096, alignment 0/ 0: 708 758 613
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 679 758 612
TPT: Len 4096, alignment 0/ 0: 671 758 612
TPT: Len 4096, alignment 0/ 0: 684 759 612
TPT: Len 4096, alignment 0/ 0: 709 759 613
TPT: Len 4096, alignment 0/ 0: 709 759 611
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 682 758 612
TPT: Len 4096, alignment 0/ 0: 673 758 613
TPT: Len 4096, alignment 0/ 0: 704 759 613
TPT: Len 4096, alignment 0/ 0: 709 758 613
TPT: Len 4096, alignment 0/ 0: 709 758 611
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 669 759 611
TPT: Len 4096, alignment 0/ 0: 671 759 611
TPT: Len 4096, alignment 0/ 0: 709 759 613
TPT: Len 4096, alignment 0/ 0: 709 759 613
TPT: Len 4096, alignment 0/ 0: 708 759 613
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 668 759 612
TPT: Len 4096, alignment 0/ 0: 709 759 612
TPT: Len 4096, alignment 0/ 0: 709 759 612
TPT: Len 4096, alignment 0/ 0: 709 759 612
TPT: Len 4096, alignment 0/ 0: 709 759 612
copy_page_org copy_page_new REP MOVSQ
TPT: Len 4096, alignment 0/ 0: 694 758 611
TPT: Len 4096, alignment 0/ 0: 671 759 611
TPT: Len 4096, alignment 0/ 0: 708 759 611
TPT: Len 4096, alignment 0/ 0: 708 759 611
TPT: Len 4096, alignment 0/ 0: 708 759 613