2005-05-30 18:17:07

by Benjamin LaHaise

Subject: [RFC] x86-64: Use SSE for copy_page and clear_page

Hello Andi,

Below is a patch that uses 128 bit SSE instructions for copy_page and
clear_page. This is an improvement on P4 systems as can be seen by
running the test program at http://www.kvack.org/~bcrl/xmm64.c to get
results like:

SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $ buffer = 0x2aaaaaad6000
clear_page() tests
clear_page function 'warm up run' took 25444 cycles per page
clear_page function 'kernel clear' took 6595 cycles per page
clear_page function '2.4 non MMX' took 7827 cycles per page
clear_page function '2.4 MMX fallback' took 7741 cycles per page
clear_page function '2.4 MMX version' took 6454 cycles per page
clear_page function 'faster_clear_page' took 4344 cycles per page
clear_page function 'even_faster_clear' took 4151 cycles per page
clear_page function 'xmm_clear ' took 3204 cycles per page
clear_page function 'xmma_clear ' took 6080 cycles per page
clear_page function 'xmm2_clear ' took 3370 cycles per page
clear_page function 'xmma2_clear ' took 6115 cycles per page
clear_page function 'kernel clear' took 6583 cycles per page

copy_page() tests
copy_page function 'warm up run' took 9770 cycles per page
copy_page function '2.4 non MMX' took 9758 cycles per page
copy_page function '2.4 MMX fallback' took 9572 cycles per page
copy_page function '2.4 MMX version' took 9405 cycles per page
copy_page function 'faster_copy' took 7407 cycles per page
copy_page function 'even_faster' took 7158 cycles per page
copy_page function 'xmm_copy_page_no' took 6110 cycles per page
copy_page function 'xmm_copy_page' took 5914 cycles per page
copy_page function 'xmma_copy_page' took 5913 cycles per page
copy_page function 'v26_copy_page' took 9168 cycles per page

The SSE clear page function is almost twice as fast as the kernel's
current clear_page, while the copy_page implementation is roughly a
third faster. This is likely due to the fact that SSE instructions
can keep the 256 bit wide L2 cache bus at a higher utilisation than
64 bit movs are able to. Comments?

-ben

Signed-off-by: Benjamin LaHaise <[email protected]>
:r public_html/patches/v2.6.12-rc4-xmm-2.diff
diff -purN v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c xmm-rc4/arch/x86_64/lib/c_clear_page.c
--- v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c 1969-12-31 19:00:00.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/c_clear_page.c 2005-05-26 11:16:09.000000000 -0400
@@ -0,0 +1,45 @@
+#include <linux/config.h>
+#include <linux/preempt.h>
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <asm/string.h>
+
+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
+
+void c_clear_page_xmm(void *page)
+{
+ /* Note! gcc doesn't seem to align stack variables properly, so we
+ * need to make use of unaligned loads and stores.
+ */
+ xmm_store_t xmm_save[1];
+ unsigned long cr0;
+ int i;
+
+ preempt_disable();
+ __asm__ __volatile__ (
+ " mov %%cr0,%0\n"
+ " clts\n"
+ " movdqu %%xmm0,(%1)\n"
+ " pxor %%xmm0, %%xmm0\n"
+ : "=&r" (cr0): "r" (xmm_save) : "memory"
+ );
+
+ for(i=0;i<PAGE_SIZE/64;i++)
+ {
+ __asm__ __volatile__ (
+ " movntdq %%xmm0, (%0)\n"
+ " movntdq %%xmm0, 16(%0)\n"
+ " movntdq %%xmm0, 32(%0)\n"
+ " movntdq %%xmm0, 48(%0)\n"
+ : : "r" (page) : "memory");
+ page+=64;
+ }
+
+ __asm__ __volatile__ (
+ " sfence \n "
+ " movdqu (%0),%%xmm0\n"
+ " mov %1,%%cr0\n"
+ :: "r" (xmm_save), "r" (cr0)
+ );
+ preempt_enable();
+}
diff -purN v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c xmm-rc4/arch/x86_64/lib/c_copy_page.c
--- v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c 1969-12-31 19:00:00.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/c_copy_page.c 2005-05-30 14:07:28.000000000 -0400
@@ -0,0 +1,52 @@
+#include <linux/config.h>
+#include <linux/preempt.h>
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <asm/string.h>
+
+typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
+
+void c_copy_page_xmm(void *to, void *from)
+{
+ /* Note! gcc doesn't seem to align stack variables properly, so we
+ * need to make use of unaligned loads and stores.
+ */
+ xmm_store_t xmm_save[2];
+ unsigned long cr0;
+ int i;
+
+ preempt_disable();
+ __asm__ __volatile__ (
+ " prefetchnta (%1)\n"
+ " prefetchnta 64(%1)\n"
+ " prefetchnta 128(%1)\n"
+ " prefetchnta 192(%1)\n"
+ " prefetchnta 256(%1)\n"
+ " mov %%cr0,%0\n"
+ " clts\n"
+ " movdqu %%xmm0, (%1)\n"
+ " movdqu %%xmm1,16(%1)\n"
+ : "=&r" (cr0): "r" (xmm_save) : "memory"
+ );
+
+ for(i=0;i<PAGE_SIZE/32;i++) {
+ __asm__ __volatile__ (
+ " prefetchnta 320(%0)\n"
+ " movdqa (%0),%%xmm0\n"
+ " movdqa 16(%0),%%xmm1\n"
+ " movntdq %%xmm0, (%1)\n"
+ " movntdq %%xmm1, 16(%1)\n"
+ : : "r" (from), "r" (to) : "memory");
+ to += 32;
+ from += 32;
+ }
+
+ __asm__ __volatile__ (
+ " sfence \n "
+ " movdqu (%0),%%xmm0\n"
+ " movdqu 16(%0),%%xmm1\n"
+ " mov %1,%%cr0\n"
+ :: "r" (xmm_save), "r" (cr0)
+ );
+ preempt_enable();
+}
diff -purN v2.6.12-rc4/arch/x86_64/lib/clear_page.S xmm-rc4/arch/x86_64/lib/clear_page.S
--- v2.6.12-rc4/arch/x86_64/lib/clear_page.S 2004-12-24 16:34:33.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/clear_page.S 2005-05-26 11:27:26.000000000 -0400
@@ -1,3 +1,5 @@
+#include <asm/cpufeature.h>
+
/*
* Zero a page.
* rdi page
@@ -24,12 +26,25 @@ clear_page:
nop
ret
clear_page_end:
-
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad clear_page
+ .quad clear_page_xmm
+ .byte X86_FEATURE_XMM2
+ .byte clear_page_end-clear_page
+ .byte clear_page_xmm_end-clear_page_xmm
+ .previous
+
+ .globl c_clear_page_xmm
+ .p2align 4
+clear_page_xmm:
+ jmp c_clear_page_xmm+(clear_page_xmm-clear_page)
+clear_page_xmm_end:
+
/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */

-#include <asm/cpufeature.h>
-
.section .altinstructions,"a"
.align 8
.quad clear_page
diff -purN v2.6.12-rc4/arch/x86_64/lib/copy_page.S xmm-rc4/arch/x86_64/lib/copy_page.S
--- v2.6.12-rc4/arch/x86_64/lib/copy_page.S 2004-12-24 16:34:32.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/copy_page.S 2005-05-26 11:29:55.000000000 -0400
@@ -76,18 +76,34 @@ copy_page:
movq 2*8(%rsp),%r13
addq $3*8,%rsp
ret
+copy_page_end = .

+#include <asm/cpufeature.h>
+
+ .section .altinstructions,"a"
+ .align 8
+ .quad copy_page
+ .quad copy_page_xmm
+ .byte X86_FEATURE_XMM2
+ .byte copy_page_end-copy_page
+ .byte copy_page_xmm_end-copy_page_xmm
+ .previous
+
+ .globl c_copy_page_xmm
+ .p2align 4
+copy_page_xmm:
+ jmp c_copy_page_xmm+(copy_page_xmm-copy_page)
+copy_page_xmm_end = .
+
/* C stepping K8 run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */

-#include <asm/cpufeature.h>
-
.section .altinstructions,"a"
.align 8
.quad copy_page
.quad copy_page_c
.byte X86_FEATURE_K8_C
- .byte copy_page_c_end-copy_page_c
+ .byte copy_page_end-copy_page
.byte copy_page_c_end-copy_page_c
.previous

diff -purN v2.6.12-rc4/arch/x86_64/lib/Makefile xmm-rc4/arch/x86_64/lib/Makefile
--- v2.6.12-rc4/arch/x86_64/lib/Makefile 2004-12-24 16:34:01.000000000 -0500
+++ xmm-rc4/arch/x86_64/lib/Makefile 2005-05-26 11:26:50.000000000 -0400
@@ -10,5 +10,7 @@ lib-y := csum-partial.o csum-copy.o csum
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
lib-y += memcpy.o memmove.o memset.o copy_user.o
+lib-y += c_clear_page.o
+lib-y += c_copy_page.o

lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o


2005-05-30 18:46:18

by Jeff Garzik

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

Benjamin LaHaise wrote:
> Hello Andi,
>
> Below is a patch that uses 128 bit SSE instructions for copy_page and
> clear_page. This is an improvement on P4 systems as can be seen by
> running the test program at http://www.kvack.org/~bcrl/xmm64.c to get
> results like:
>
> SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $ buffer = 0x2aaaaaad6000
> clear_page() tests
> clear_page function 'warm up run' took 25444 cycles per page
> clear_page function 'kernel clear' took 6595 cycles per page
> clear_page function '2.4 non MMX' took 7827 cycles per page
> clear_page function '2.4 MMX fallback' took 7741 cycles per page
> clear_page function '2.4 MMX version' took 6454 cycles per page
> clear_page function 'faster_clear_page' took 4344 cycles per page
> clear_page function 'even_faster_clear' took 4151 cycles per page
> clear_page function 'xmm_clear ' took 3204 cycles per page
> clear_page function 'xmma_clear ' took 6080 cycles per page
> clear_page function 'xmm2_clear ' took 3370 cycles per page
> clear_page function 'xmma2_clear ' took 6115 cycles per page
> clear_page function 'kernel clear' took 6583 cycles per page
>
> copy_page() tests
> copy_page function 'warm up run' took 9770 cycles per page
> copy_page function '2.4 non MMX' took 9758 cycles per page
> copy_page function '2.4 MMX fallback' took 9572 cycles per page
> copy_page function '2.4 MMX version' took 9405 cycles per page
> copy_page function 'faster_copy' took 7407 cycles per page
> copy_page function 'even_faster' took 7158 cycles per page
> copy_page function 'xmm_copy_page_no' took 6110 cycles per page
> copy_page function 'xmm_copy_page' took 5914 cycles per page
> copy_page function 'xmma_copy_page' took 5913 cycles per page
> copy_page function 'v26_copy_page' took 9168 cycles per page
>
> The SSE clear page function is almost twice as fast as the kernel's
> current clear_page, while the copy_page implementation is roughly a
> third faster. This is likely due to the fact that SSE instructions
> can keep the 256 bit wide L2 cache bus at a higher utilisation than
> 64 bit movs are able to. Comments?

Sounds pretty darn cool to me. I can give it a test on athlon64 and
em64t here.

I have some coding style whining to do, though...


> :r public_html/patches/v2.6.12-rc4-xmm-2.diff
> diff -purN v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c xmm-rc4/arch/x86_64/lib/c_clear_page.c
> --- v2.6.12-rc4/arch/x86_64/lib/c_clear_page.c 1969-12-31 19:00:00.000000000 -0500
> +++ xmm-rc4/arch/x86_64/lib/c_clear_page.c 2005-05-26 11:16:09.000000000 -0400
> @@ -0,0 +1,45 @@
> +#include <linux/config.h>
> +#include <linux/preempt.h>
> +#include <asm/page.h>
> +#include <linux/kernel.h>
> +#include <asm/string.h>

preferred ordering:

linux/config
linux/kernel
linux/preempt
asm/*
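
For this file that would be:

#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/preempt.h>
#include <asm/page.h>
#include <asm/string.h>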


> +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;

space between "a,b"


> +void c_clear_page_xmm(void *page)
> +{
> + /* Note! gcc doesn't seem to align stack variables properly, so we
> + * need to make use of unaligned loads and stores.
> + */
> + xmm_store_t xmm_save[1];
> + unsigned long cr0;
> + int i;
> +
> + preempt_disable();
> + __asm__ __volatile__ (
> + " mov %%cr0,%0\n"
> + " clts\n"
> + " movdqu %%xmm0,(%1)\n"
> + " pxor %%xmm0, %%xmm0\n"
> + : "=&r" (cr0): "r" (xmm_save) : "memory"
> + );
> +
> + for(i=0;i<PAGE_SIZE/64;i++)

exercise that spacebar :)


> + {
> + __asm__ __volatile__ (
> + " movntdq %%xmm0, (%0)\n"
> + " movntdq %%xmm0, 16(%0)\n"
> + " movntdq %%xmm0, 32(%0)\n"
> + " movntdq %%xmm0, 48(%0)\n"
> + : : "r" (page) : "memory");
> + page+=64;
> + }
> +
> + __asm__ __volatile__ (
> + " sfence \n "
> + " movdqu (%0),%%xmm0\n"
> + " mov %1,%%cr0\n"
> + :: "r" (xmm_save), "r" (cr0)
> + );
> + preempt_enable();
> +}
> diff -purN v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c xmm-rc4/arch/x86_64/lib/c_copy_page.c
> --- v2.6.12-rc4/arch/x86_64/lib/c_copy_page.c 1969-12-31 19:00:00.000000000 -0500
> +++ xmm-rc4/arch/x86_64/lib/c_copy_page.c 2005-05-30 14:07:28.000000000 -0400
> @@ -0,0 +1,52 @@
> +#include <linux/config.h>
> +#include <linux/preempt.h>
> +#include <asm/page.h>
> +#include <linux/kernel.h>
> +#include <asm/string.h>
> +
> +typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;

ditto

> +void c_copy_page_xmm(void *to, void *from)
> +{
> + /* Note! gcc doesn't seem to align stack variables properly, so we
> + * need to make use of unaligned loads and stores.
> + */
> + xmm_store_t xmm_save[2];
> + unsigned long cr0;
> + int i;
> +
> + preempt_disable();
> + __asm__ __volatile__ (
> + " prefetchnta (%1)\n"
> + " prefetchnta 64(%1)\n"
> + " prefetchnta 128(%1)\n"
> + " prefetchnta 192(%1)\n"
> + " prefetchnta 256(%1)\n"
> + " mov %%cr0,%0\n"
> + " clts\n"
> + " movdqu %%xmm0, (%1)\n"
> + " movdqu %%xmm1,16(%1)\n"
> + : "=&r" (cr0): "r" (xmm_save) : "memory"
> + );
> +
> + for(i=0;i<PAGE_SIZE/32;i++) {

ditto

2005-05-30 19:06:33

by dean gaudet

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

On Mon, 30 May 2005, Benjamin LaHaise wrote:

> Below is a patch that uses 128 bit SSE instructions for copy_page and
> clear_page. This is an improvement on P4 systems as can be seen by
> running the test program at http://www.kvack.org/~bcrl/xmm64.c to get
> results like:

it looks like the patch uses SSE2 instructions (pxor, movdqa, movntdq)...
if you use xorps, movaps, movntps then it works on SSE processors as well.
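
for illustration, an untested sketch with the same structure as
c_clear_page_xmm() from the patch, just with movups/xorps/movntps
substituted (the c_clear_page_sse1 name is made up):

#include <linux/preempt.h>
#include <asm/page.h>

typedef struct { unsigned long a, b; } __attribute__((aligned(16))) xmm_store_t;

void c_clear_page_sse1(void *page)
{
	xmm_store_t xmm_save[1];
	unsigned long cr0;
	int i;

	preempt_disable();
	__asm__ __volatile__ (
		" mov %%cr0,%0\n"
		" clts\n"
		" movups %%xmm0,(%1)\n"		/* SSE1 unaligned save */
		" xorps %%xmm0,%%xmm0\n"
		: "=&r" (cr0) : "r" (xmm_save) : "memory");

	for (i = 0; i < PAGE_SIZE/64; i++) {
		__asm__ __volatile__ (
			" movntps %%xmm0,(%0)\n"
			" movntps %%xmm0,16(%0)\n"
			" movntps %%xmm0,32(%0)\n"
			" movntps %%xmm0,48(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}

	__asm__ __volatile__ (
		" sfence\n"
		" movups (%0),%%xmm0\n"
		" mov %1,%%cr0\n"
		: : "r" (xmm_save), "r" (cr0));
	preempt_enable();
}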

-dean

2005-05-30 19:13:10

by dean gaudet

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

On Mon, 30 May 2005, dean gaudet wrote:

> On Mon, 30 May 2005, Benjamin LaHaise wrote:
>
> > Below is a patch that uses 128 bit SSE instructions for copy_page and
> > clear_page. This is an improvement on P4 systems as can be seen by
> > running the test program at http://www.kvack.org/~bcrl/xmm64.c to get
> > results like:
>
> it looks like the patch uses SSE2 instructions (pxor, movdqa, movntdq)...
> if you use xorps, movaps, movntps then it works on SSE processors as well.

oh and btw... on x86-64 you might want to look at using movnti with 64-bit
registers... the memory datapath on these processors is actually 64-bits
wide, and the 128-bit stores are broken into two 64-bit pieces internally
anyhow. the advantage of using movnti over movntdq/movntps is that you
don't have to save/restore the xmm register set.
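
a rough sketch of that (untested; the clear_page_nti name and the 64-byte
unrolling are arbitrary, not from any existing code):

#include <asm/page.h>

/* Clear a page with 64-bit non-temporal stores.  Only general purpose
 * registers are used, so there is no CR0/TS handling and no xmm
 * save/restore. */
void clear_page_nti(void *page)
{
	int i;

	for (i = 0; i < PAGE_SIZE/64; i++) {
		__asm__ __volatile__ (
			" movnti %1,(%0)\n"
			" movnti %1,8(%0)\n"
			" movnti %1,16(%0)\n"
			" movnti %1,24(%0)\n"
			" movnti %1,32(%0)\n"
			" movnti %1,40(%0)\n"
			" movnti %1,48(%0)\n"
			" movnti %1,56(%0)\n"
			: : "r" (page), "r" (0UL) : "memory");
		page += 64;
	}
	__asm__ __volatile__ ("sfence" : : : "memory");
}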

-dean

2005-05-30 19:32:57

by Andi Kleen

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

On Mon, May 30, 2005 at 12:11:23PM -0700, dean gaudet wrote:
> On Mon, 30 May 2005, dean gaudet wrote:
>
> > On Mon, 30 May 2005, Benjamin LaHaise wrote:
> >
> > > Below is a patch that uses 128 bit SSE instructions for copy_page and
> > > clear_page. This is an improvement on P4 systems as can be seen by
> > > running the test program at http://www.kvack.org/~bcrl/xmm64.c to get
> > > results like:
> >
> > it looks like the patch uses SSE2 instructions (pxor, movdqa, movntdq)...
> > if you use xorps, movaps, movntps then it works on SSE processors as well.
>
> oh and btw... on x86-64 you might want to look at using movnti with 64-bit
> registers... the memory datapath on these processors is actually 64-bits
> wide, and the 128-bit stores are broken into two 64-bit pieces internally
> anyhow. the advantage of using movnti over movntdq/movntps is that you
> don't have to save/restore the xmm register set.

Any use of write combining for copy_page/clear_page is a bad idea.
The problem is that write combining always forces the destination
out of cache. While it gives you better microbenchmarks, your real workloads
suffer because they eat a lot more cache misses when
accessing the fresh pages.

Don't go down that path please.

At least on Opteron I did quite a few tests, and the existing setup
with just rep ; movsq for C stepping or later, or the unrolled loop
for earlier CPUs, worked best overall. On P4 I haven't done any benchmarks;
however, it might be a good idea to check whether rep ; movsq would be
a win there too (if yes, it could be enabled there).
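
(For reference, a rep ; movsq page copy is basically just the following
sketch -- not the in-tree copy_page_c, which is written in assembly.)

#include <asm/page.h>

void copy_page_rep(void *to, void *from)
{
	unsigned long d0, d1, d2;

	/* rcx = quadword count, rdi = destination, rsi = source */
	__asm__ __volatile__ (
		"rep ; movsq"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (PAGE_SIZE/8), "1" (to), "2" (from)
		: "memory");
}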

-Andi

2005-05-30 19:41:47

by Andi Kleen

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

> The SSE clear page function is almost twice as fast as the kernel's
> current clear_page, while the copy_page implementation is roughly a
> third faster. This is likely due to the fact that SSE instructions
> can keep the 256 bit wide L2 cache bus at a higher utilisation than
> 64 bit movs are able to. Comments?

Any use of write combining is wrong here because it forces
the destination out of cache, which causes performance issues later on.
Believe me, we went through this years ago.

If you can code up a better function for P4 that does not use
write combining, I would be happy to add it. I never tuned the functions
for P4.

One simple experiment would be to just test if P4 likes the
simple rep ; movsq / rep ; stosq loops and enable them.
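
(The rep ; stosq counterpart would be along these lines -- again only a
sketch, not the in-tree clear_page_c.)

#include <asm/page.h>

void clear_page_rep(void *page)
{
	unsigned long d0, d1;

	/* rax = 0, rcx = quadword count, rdi = destination */
	__asm__ __volatile__ (
		"rep ; stosq"
		: "=&c" (d0), "=&D" (d1)
		: "a" (0UL), "0" (PAGE_SIZE/8), "1" (page)
		: "memory");
}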

-Andi

2005-05-30 20:06:03

by Michael Thonke

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

Andi Kleen wrote:

>>The SSE clear page function is almost twice as fast as the kernel's
>>current clear_page, while the copy_page implementation is roughly a
>>third faster. This is likely due to the fact that SSE instructions
>>can keep the 256 bit wide L2 cache bus at a higher utilisation than
>>64 bit movs are able to. Comments?
>>
>>
>
>Any use of write combining is wrong here because it forces
>the destination out of cache, which causes performance issues later on.
>Believe me, we went through this years ago.
>
>If you can code up a better function for P4 that does not use
>write combining, I would be happy to add it. I never tuned the functions
>for P4.
>
>One simple experiment would be to just test if P4 likes the
>simple rep ; movsq / rep ; stosq loops and enable them.
>
>
No, it doesn't like this sample here at all; I get a segmentation fault on
that run.
RUN 1:

SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
buffer = 0x2aaaaade7000
clear_page() tests
clear_page function 'warm up run' took 13516 cycles per page
clear_page function 'kernel clear' took 6539 cycles per page
clear_page function '2.4 non MMX' took 6354 cycles per page
clear_page function '2.4 MMX fallback' took 6205 cycles per page
clear_page function '2.4 MMX version' took 6830 cycles per page
clear_page function 'faster_clear_page' took 6240 cycles per page
clear_page function 'even_faster_clear' took 5746 cycles per page
clear_page function 'xmm_clear ' took 4580 cycles per page
Segmentation fault

xmm64.o[9485] general protection rip:400814 rsp:7fffffc74118 error:0
xmm64.o[9486] general protection rip:400814 rsp:7fffff8b1498 error:0
xmm64.o[9487] general protection rip:400814 rsp:7fffffc31848 error:0

RUN 2:
Telling gcc to use processor-specific flags:
gcc -pipe -march=nocona -O2 -o xmm64.o xmm64.c

SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
buffer = 0x2aaaaade7000
clear_page() tests
clear_page function 'warm up run' took 13419 cycles per page
clear_page function 'kernel clear' took 6403 cycles per page
clear_page function '2.4 non MMX' took 6290 cycles per page
clear_page function '2.4 MMX fallback' took 6156 cycles per page
clear_page function '2.4 MMX version' took 6605 cycles per page
clear_page function 'faster_clear_page' took 5607 cycles per page
clear_page function 'even_faster_clear' took 5173 cycles per page
clear_page function 'xmm_clear ' took 4307 cycles per page
clear_page function 'xmma_clear ' took 6230 cycles per page
clear_page function 'xmm2_clear ' took 4908 cycles per page
clear_page function 'xmma2_clear ' took 6256 cycles per page
clear_page function 'kernel clear' took 6506 cycles per page

copy_page() tests
copy_page function 'warm up run' took 10352 cycles per page
copy_page function '2.4 non MMX' took 9440 cycles per page
copy_page function '2.4 MMX fallback' took 9300 cycles per page
copy_page function '2.4 MMX version' took 10238 cycles per page
copy_page function 'faster_copy' took 9497 cycles per page
copy_page function 'even_faster' took 9229 cycles per page
copy_page function 'xmm_copy_page_no' took 7810 cycles per page
copy_page function 'xmm_copy_page' took 7397 cycles per page
copy_page function 'xmma_copy_page' took 9430 cycles per page
copy_page function 'v26_copy_page' took 9234 cycles per page

CPU flags on Intel Pentium 4 640 x86_64 Gentoo GNU/Linux

flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr
pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm
syscall nx lm constant_tsc pni monitor ds_cpl est cid cx16 xtpr

Greets
Michael

2005-05-30 20:12:25

by Benjamin LaHaise

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

On Mon, May 30, 2005 at 10:05:28PM +0200, Michael Thonke wrote:
> No, it doesn't like this sample here at all; I get a segmentation fault on
> that run.

Grab a new copy -- one of the routines had an unaligned store instead of
aligned for the register save.

-ben

2005-05-30 20:42:45

by Michael Thonke

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

Benjamin LaHaise wrote:

>On Mon, May 30, 2005 at 10:05:28PM +0200, Michael Thonke wrote:
>
>
>>No, it doesn't like this sample here at all; I get a segmentation fault on
>>that run.
>>
>>
>
>Grab a new copy -- one of the routines had an unaligned store instead of
>aligned for the register save.
>
> -ben
>
>
>
Hi Benjamin,

Here are the results with the new copy.

RUN 1: cc -o xmm64.o xmm64.c

ioGL64NX_EMT64 ~ # ./xmm64.o
SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
buffer = 0x2aaaaade7000
clear_page() tests
clear_page function 'warm up run' took 13632 cycles per page
clear_page function 'kernel clear' took 6599 cycles per page
clear_page function '2.4 non MMX' took 6482 cycles per page
clear_page function '2.4 MMX fallback' took 6367 cycles per page
clear_page function '2.4 MMX version' took 6644 cycles per page
clear_page function 'faster_clear_page' took 6088 cycles per page
clear_page function 'even_faster_clear' took 5692 cycles per page
clear_page function 'xmm_clear' took 4270 cycles per page
clear_page function 'xmma_clear' took 6351 cycles per page
clear_page function 'xmm2_clear' took 4710 cycles per page
clear_page function 'xmma2_clear' took 6198 cycles per page
clear_page function 'xmm3_clear' took 6583 cycles per page
clear_page function 'nt clear ' took 4746 cycles per page
clear_page function 'kernel clear' took 6158 cycles per page

copy_page() tests
copy_page function 'warm up run' took 9210 cycles per page
copy_page function '2.4 non MMX' took 6740 cycles per page
copy_page function '2.4 MMX fallback' took 6697 cycles per page
copy_page function '2.4 MMX version' took 9178 cycles per page
copy_page function 'faster_copy' took 11360 cycles per page
copy_page function 'even_faster' took 10133 cycles per page
copy_page function 'xmm_copy_page_no' took 8885 cycles per page
copy_page function 'xmm_copy_page' took 8725 cycles per page
copy_page function 'xmma_copy_page' took 9964 cycles per page
copy_page function 'xmm3_copy_page' took 7176 cycles per page
copy_page function 'v26_copy_page' took 6879 cycles per page
copy_page function 'nt_copy_page' took 10858 cycles per page


RUN 2: gcc -o xmm64.o xmm64.c

ioGL64NX_EMT64 ~ # ./xmm64.o
SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
buffer = 0x2aaaaade7000
clear_page() tests
clear_page function 'warm up run' took 13981 cycles per page
clear_page function 'kernel clear' took 6708 cycles per page
clear_page function '2.4 non MMX' took 6505 cycles per page
clear_page function '2.4 MMX fallback' took 6235 cycles per page
clear_page function '2.4 MMX version' took 7251 cycles per page
clear_page function 'faster_clear_page' took 6390 cycles per page
clear_page function 'even_faster_clear' took 5932 cycles per page
clear_page function 'xmm_clear' took 4876 cycles per page
clear_page function 'xmma_clear' took 6379 cycles per page
clear_page function 'xmm2_clear' took 5264 cycles per page
clear_page function 'xmma2_clear' took 6373 cycles per page
clear_page function 'xmm3_clear' took 6651 cycles per page
clear_page function 'nt clear ' took 5186 cycles per page
clear_page function 'kernel clear' took 6326 cycles per page

copy_page() tests
copy_page function 'warm up run' took 9537 cycles per page
copy_page function '2.4 non MMX' took 6776 cycles per page
copy_page function '2.4 MMX fallback' took 7407 cycles per page
copy_page function '2.4 MMX version' took 8812 cycles per page
copy_page function 'faster_copy' took 10992 cycles per page
copy_page function 'even_faster' took 10232 cycles per page
copy_page function 'xmm_copy_page_no' took 8918 cycles per page
copy_page function 'xmm_copy_page' took 9579 cycles per page
copy_page function 'xmma_copy_page' took 9854 cycles per page
copy_page function 'xmm3_copy_page' took 7602 cycles per page
copy_page function 'v26_copy_page' took 6811 cycles per page
copy_page function 'nt_copy_page' took 10958 cycles per page

RUN 3: gcc -pipe -march=nocona -O2 -o xmm64.o xmm64.c

SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
buffer = 0x2aaaaade7000
clear_page() tests
clear_page function 'warm up run' took 13626 cycles per page
clear_page function 'kernel clear' took 6780 cycles per page
clear_page function '2.4 non MMX' took 6755 cycles per page
clear_page function '2.4 MMX fallback' took 6283 cycles per page
clear_page function '2.4 MMX version' took 6764 cycles per page
clear_page function 'faster_clear_page' took 5764 cycles per page
clear_page function 'even_faster_clear' took 5240 cycles per page
clear_page function 'xmm_clear' took 4532 cycles per page
clear_page function 'xmma_clear' took 6352 cycles per page
clear_page function 'xmm2_clear' took 4983 cycles per page
clear_page function 'xmma2_clear' took 6211 cycles per page
clear_page function 'xmm3_clear' took 6748 cycles per page
clear_page function 'nt clear ' took 5166 cycles per page
clear_page function 'kernel clear' took 6201 cycles per page

copy_page() tests
copy_page function 'warm up run' took 9651 cycles per page
copy_page function '2.4 non MMX' took 6724 cycles per page
copy_page function '2.4 MMX fallback' took 6905 cycles per page
copy_page function '2.4 MMX version' took 9722 cycles per page
copy_page function 'faster_copy' took 9738 cycles per page
copy_page function 'even_faster' took 9609 cycles per page
copy_page function 'xmm_copy_page_no' took 8846 cycles per page
copy_page function 'xmm_copy_page' took 8591 cycles per page
copy_page function 'xmma_copy_page' took 8250 cycles per page
copy_page function 'xmm3_copy_page' took 7879 cycles per page
copy_page function 'v26_copy_page' took 7512 cycles per page
copy_page function 'nt_copy_page' took 10424 cycles per page

RUN 4: gcc -pipe -march=nocona -O2 -fPIC -o xmm64.o xmm64.c

SSE test program $Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $
buffer = 0x2aaaaade7000
clear_page() tests
clear_page function 'warm up run' took 13713 cycles per page
clear_page function 'kernel clear' took 6655 cycles per page
clear_page function '2.4 non MMX' took 6448 cycles per page
clear_page function '2.4 MMX fallback' took 6270 cycles per page
clear_page function '2.4 MMX version' took 7001 cycles per page
clear_page function 'faster_clear_page' took 5671 cycles per page
clear_page function 'even_faster_clear' took 5366 cycles per page
clear_page function 'xmm_clear' took 4737 cycles per page
clear_page function 'xmma_clear' took 6464 cycles per page
clear_page function 'xmm2_clear' took 5214 cycles per page
clear_page function 'xmma2_clear' took 6371 cycles per page
clear_page function 'xmm3_clear' took 6660 cycles per page
clear_page function 'nt clear ' took 5066 cycles per page
clear_page function 'kernel clear' took 6314 cycles per page

copy_page() tests
copy_page function 'warm up run' took 9464 cycles per page
copy_page function '2.4 non MMX' took 7179 cycles per page
copy_page function '2.4 MMX fallback' took 6928 cycles per page
copy_page function '2.4 MMX version' took 9091 cycles per page
copy_page function 'faster_copy' took 9996 cycles per page
copy_page function 'even_faster' took 9824 cycles per page
copy_page function 'xmm_copy_page_no' took 8724 cycles per page
copy_page function 'xmm_copy_page' took 8920 cycles per page
copy_page function 'xmma_copy_page' took 8859 cycles per page
copy_page function 'xmm3_copy_page' took 7794 cycles per page
copy_page function 'v26_copy_page' took 7808 cycles per page
copy_page function 'nt_copy_page' took 9264 cycles per page

Do you need more results or tests, Benjamin?

Greets and best regards
Michael

2005-05-31 07:11:39

by Andi Kleen

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

> >One simple experiment would be to just test if P4 likes the
> >simple rep ; movsq / rep ; stosq loops and enable them.
> >
> >
> No, it doesn't like this sample here at all; I get a segmentation fault on
> that run.

Sorry, what did you test exactly?

-Andi

2005-05-31 08:37:59

by Denis Vlasenko

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

On Monday 30 May 2005 22:32, Andi Kleen wrote:
> On Mon, May 30, 2005 at 12:11:23PM -0700, dean gaudet wrote:
> > On Mon, 30 May 2005, dean gaudet wrote:
> >
> > > On Mon, 30 May 2005, Benjamin LaHaise wrote:
> > >
> > > > Below is a patch that uses 128 bit SSE instructions for copy_page and
> > > > clear_page. This is an improvement on P4 systems as can be seen by
> > > > running the test program at http://www.kvack.org/~bcrl/xmm64.c to get
> > > > results like:
> > >
> > > it looks like the patch uses SSE2 instructions (pxor, movdqa, movntdq)...
> > > if you use xorps, movaps, movntps then it works on SSE processors as well.
> >
> > oh and btw... on x86-64 you might want to look at using movnti with 64-bit
> > registers... the memory datapath on these processors is actually 64-bits
> > wide, and the 128-bit stores are broken into two 64-bit pieces internally
> > anyhow. the advantage of using movnti over movntdq/movntps is that you
> > don't have to save/restore the xmm register set.

And what if (more like 'when', actually) the next AMD CPU has a 2x128-bit bus
instead of 2x64-bit? Revert back to XMM?

> Any use of write combining for copy_page/clear_page is a bad idea.
> The problem is that write combining always forces the destination
> out of cache. While it gives you better microbenchmarks, your real workloads
> suffer because they eat a lot more cache misses when
> accessing the fresh pages.
>
> Don't go down that path please.

I doubt it unless real-world data backs your claim up.

I did microbenchmarking. You said it looks good in microbenchmarks but
hurts real-world workloads.

Sometime after that I made a patch which allows for switching
clear/copy routines on the fly, and played a bit with real-world tests.

See http://www.thisishull.net/showthread.php?t=36562

In short, I ran forking test programs which exercise the clearing and copying
routines in the kernel. I wasn't able to find a usage pattern where page copying
using SSE non-temporal stores is a loss. Page clear was demonstrably worse,
no argument about that.

If you know such usage pattern, I'd like to test it.
--
vda

2005-05-31 09:16:15

by Denis Vlasenko

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

On Tuesday 31 May 2005 11:37, Denis Vlasenko wrote:
> On Monday 30 May 2005 22:32, Andi Kleen wrote:
> > Any use of write combining for copy_page/clear_page is a bad idea.
> > The problem is that write combining always forces the destination
> > out of cache. While it gives you better microbenchmarks, your real workloads
> > suffer because they eat a lot more cache misses when
> > accessing the fresh pages.
> >
> > Don't go down that path please.
>
> I doubt it unless real-world data will back your claim up.
>
> I did microbenchmarking. You said it looks good in microbench but
> hurts real-world.
>
> Sometime after that I made a patch which allows for switching
> clear/copy routines on the fly, and played a bit with real-world tests.
>
> See http://www.thisishull.net/showthread.php?t=36562
>
> In short, I ran forking test programs which exercise the clearing and copying
> routines in the kernel. I wasn't able to find a usage pattern where page copying
> using SSE non-temporal stores is a loss. Page clear was demonstrably worse,
> no argument about that.

Let me explain what I tested, and how.

[snip explanation why nt store is a loss on small buffer, see
http://www.thisishull.net/showthread.php?t=36562 if you want to know]

Core of copy test program:

#define N (256/4)
#define SIZE 4096
...
for (k = 0; k < 5000; k++) {
	int pid;
	pid = fork();
	if (pid == 0) {
		/* child */
		for (i = 0; i < N; i++) mem[i*SIZE+1] = 'b'; /* force copy */
		strchr(mem, 'c') == mem+N*SIZE-1 || printf("BUG\n"); /* read all */
		exit(0);
	} else if (pid == -1) {
		perror("fork");
	} else {
		/* parent */
		waitpid(pid, NULL, 0);
	}
}
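
(A self-contained approximation of the whole program, for anyone who wants
to try this; the buffer setup was snipped above, so the malloc/memset part
below is only an approximation of it, not the original code.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

#define N (256/4)
#define SIZE 4096

int main(void)
{
	char *mem = malloc(N*SIZE);	/* 256k buffer */
	int i, k;

	memset(mem, 'a', N*SIZE);	/* non-zero filler so strchr scans it all */
	mem[N*SIZE-1] = 'c';		/* marker the child looks for */

	for (k = 0; k < 5000; k++) {
		int pid = fork();

		if (pid == 0) {
			/* child: dirty one byte per page to force COW copies */
			for (i = 0; i < N; i++)
				mem[i*SIZE+1] = 'b';
			/* read everything back */
			if (strchr(mem, 'c') != mem + N*SIZE - 1)
				printf("BUG\n");
			exit(0);
		} else if (pid == -1) {
			perror("fork");
		} else {
			waitpid(pid, NULL, 0);
		}
	}
	return 0;
}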

Each copy test does one fork per loop iteration.
With each fork, the kernel zeroes out 3 pages and copies 8 pages.
This amounts to 12k+32k bytes.

256k copying, 5x5000 loops:
slow: 0m8.036 0m8.063 0m8.192 0m8.233 0m8.252 75600/1800468
mmx_APn: 0m7.461 0m7.496 0m7.543 0m7.687 0m7.725 75586/1800446
mmx_APN: 0m6.351 0m6.366 0m6.378 0m6.382 0m6.525 75586/1800436
mmx_APn/APN: 0m6.412 0m6.448 0m6.501 0m6.663 0m6.669 75584/1800439
(columns: routine, time for 5 runs, pages cleared/copied as reported by the patched kernel)
A/a - align/do not align loop
P/p - prefetch/do not prefetch
N/n - nt stores/normal stores
"mmx_APn/APN" means "normal stores for clear and nt stores for copy"
(because nt clear is already known to be bad)

nt stores win, as expected, on working sets larger than cache size.

Smaller working set, 44k touched by fork and 20k by copying.
This is still larger than 64k L1 size:

20k copying, 5x20000 loops:
slow: 0m6.610 0m6.665 0m6.694 0m6.750 0m6.774 300315/1300468
mmx_APn: 0m6.208 0m6.218 0m6.263 0m6.335 0m6.452 300352/1300448
mmx_APN: 0m4.887 0m4.984 0m5.021 0m5.052 0m5.057 300295/1300443
mmx_APn/APN: 0m5.115 0m5.160 0m5.167 0m5.172 0m5.183 300292/1300443

Smallest working set possible for this test program.
44k touched by fork and 4k by copying:

4k copying, 5x40000 loops:
slow: 0m8.303 0m8.334 0m8.354 0m8.510 0m8.572 600313/1800473
mmx_APn: 0m8.233 0m8.350 0m8.406 0m8.407 0m8.642 600323/1800467
mmx_APN: 0m6.475 0m6.501 0m6.510 0m6.534 0m6.783 600302/1800436
mmx_APn/APN: 0m6.540 0m6.551 0m6.603 0m6.640 0m6.708 600271/1800442

Unexpectedly, these small ones still run quite noticeably faster
with nt stores!

Why? Simply because the small-workspace test did not need to read
back all 32k of data copied by fork. This is also likely to be
the case for the most frequent use of fork: fork+exec.

Thus, with a "normal" page clear and an "nt" page copy routine,
both the clear and copy benchmarks run faster than with the
stock kernel, for both small and large working sets.

Am I wrong?
--
vda

2005-05-31 09:24:01

by Andi Kleen

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

> Thus, with a "normal" page clear and an "nt" page copy routine,
> both the clear and copy benchmarks run faster than with the
> stock kernel, for both small and large working sets.
>
> Am I wrong?

fork is only a corner case. The main case is a process allocating
memory using brk/mmap and then using it.
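
Something like this, just to make the pattern concrete (illustrative user
code only): anonymous mmap followed immediately by use of the freshly
cleared pages, which is where a cache-cold destination from a non-temporal
clear_page shows up as extra misses.

#include <stddef.h>
#include <sys/mman.h>

long use_fresh_pages(size_t len)
{
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	long sum = 0;
	size_t i;

	if (p == MAP_FAILED)
		return -1;
	for (i = 0; i < len; i++)	/* each read touches a page the kernel just cleared */
		sum += p[i];
	munmap(p, len);
	return sum;
}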

-Andi

2005-05-31 13:57:54

by Benjamin LaHaise

Subject: Re: [RFC] x86-64: Use SSE for copy_page and clear_page

On Tue, May 31, 2005 at 11:23:58AM +0200, Andi Kleen wrote:
> fork is only a corner case. The main case is a process allocating
> memory using brk/mmap and then using it.

At least for kernel compiles, using non-temporal stores is a slight
win (a 2-5s improvement on 4m30s). Granted, there seems to be a
lot of variation in kernel compile times.

A bit more experimentation shows that non-temporal stores plus a
prefetch of the resulting data is still better than the existing
routines and only slightly slower than the pure non-temporal version.
That said, it seems to result in kernel compiles that are on the high
side of the variations I normally see (4m40s, 4m38s) compared to the
~4m30s for an unpatched kernel and ~4m25s-4m30s for the non-temporal
store version.
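
Roughly, that variant looks like the following sketch (illustrative only --
the helper name and the 64-byte prefetch stride here are placeholders, not
the exact code I benchmarked):

#include <asm/page.h>

extern void c_copy_page_xmm(void *to, void *from);	/* NT copy from the patch */

/* Do the non-temporal copy, then prefetch the destination page back into
 * the cache so the next user of the new page is not cache-cold. */
void c_copy_page_xmm_prefetch(void *to, void *from)
{
	int off;

	c_copy_page_xmm(to, from);
	for (off = 0; off < PAGE_SIZE; off += 64)	/* assume 64-byte lines */
		__asm__ __volatile__ ("prefetcht0 (%0)" : : "r" (to + off));
}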

-ben
--
"Time is what keeps everything from happening all at once." -- John Wheeler