2010-09-16 06:31:43

by Miao Xie

Subject: [PATCH] x86_64/lib: improve the performance of memmove

When dest and src overlap and the memory area is large, memmove() on x86_64 is
very inefficient, which leads to bad performance, for example in btrfs's file
deletion path. This patch improves the performance of memmove() on x86_64 by
using __memcpy_bwd() instead of a byte-by-byte copy when copying a large memory
area (len > 64).
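
In plain C, the dispatch this patch adds to memmove() boils down to the
following (condensed from the memmove_64.c hunk below; the arithmetic on
void * uses the GCC extension the kernel already relies on):

void *memmove(void *dest, const void *src, size_t count)
{
        if (dest < src || dest - src >= count)  /* forward copy is safe */
                return memcpy(dest, src, count);

        if (count <= 64) {                      /* small overlapping copy: byte loop */
                char *p = dest + count;
                const char *s = src + count;

                while (count--)
                        *--p = *--s;
                return dest;
        }

        return __memcpy_bwd(dest, src, count);  /* large overlapping copy: backwards copy */
}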

I have tested this patch by doing memory copies 50000 times with various
buffer sizes and source/destination alignments on my x86_64 box:
Len    Src Unalign   Dest Unalign   Without Patch   Patch applied
----   -----------   ------------   -------------   -------------
256    0             0              0s 815158us     0s 249647us
256    0             4              0s 816059us     0s 324210us
256    0             7              0s 815192us     0s 324254us
256    3             0              0s 815179us     0s 325991us
256    3             1              0s 815161us     0s 378462us
256    3             4              0s 815154us     0s 779306us
256    3             7              0s 815151us     0s 782924us
256    7             0              0s 815839us     0s 325524us
256    7             4              0s 815149us     0s 375658us
256    7             7              0s 815160us     0s 374488us
1024   0             0              3s 125891us     0s 437662us
1024   0             1              3s 125940us     0s 777524us
1024   0             4              3s 159788us     0s 778850us
1024   0             7              3s 155177us     0s 733927us
1024   4             0              3s 118323us     0s 830167us
1024   4             4              3s 129124us     0s 962505us
1024   4             7              3s 123456us     2s 600326us
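
A user-space approximation of this kind of timing loop is sketched below; the
50000 iterations match the description above, but the buffer layout, offsets,
and gettimeofday()-based timing are assumptions rather than the actual harness:

#include <stdio.h>
#include <string.h>
#include <sys/time.h>

static char buf[2048];          /* big enough for len 1024 plus the offsets */

int main(void)
{
        struct timeval start, end;
        size_t len = 1024, src_off = 3, dst_off = 7;    /* dest > src and overlapping */
        long i, usec;

        gettimeofday(&start, NULL);
        for (i = 0; i < 50000; i++)
                memmove(buf + dst_off, buf + src_off, len);
        gettimeofday(&end, NULL);

        usec = (end.tv_sec - start.tv_sec) * 1000000L +
               (end.tv_usec - start.tv_usec);
        printf("len=%zu src+%zu dst+%zu: %ldus\n", len, src_off, dst_off, usec);
        return 0;
}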

After applying this patch, the performance of file creation and deletion on
some filesystems becomes better. I have tested it with the following benchmark
tool on my x86_64 box:
http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3

Test steps:
# ./creat_unlink 50000

The result (total time):
Ext4:
                 2.6.36-rc4   2.6.36-rc4 + patch
file creation    0.737007     0.701888             4.8% up
file deletion    0.422226     0.413457             2.1% up

Btrfs:
                 2.6.36-rc4   2.6.36-rc4 + patch
file creation    0.977638     0.935208             4.3% up
file deletion    1.327140     1.221073             8% up

Signed-off-by: Miao Xie <[email protected]>
---
arch/x86/include/asm/string_64.h |    1 +
arch/x86/lib/Makefile            |    2 +-
arch/x86/lib/memcpy_bwd_64.S     |  137 ++++++++++++++++++++++++++++++++++++++
arch/x86/lib/memmove_64.c        |   10 ++-
4 files changed, 145 insertions(+), 5 deletions(-)
create mode 100644 arch/x86/lib/memcpy_bwd_64.S

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c46..4e64a87 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -55,6 +55,7 @@ extern void *__memcpy(void *to, const void *from, size_t len);
void *memset(void *s, int c, size_t n);

#define __HAVE_ARCH_MEMMOVE
+extern void *__memcpy_bwd(void *dest, const void *src, size_t count);
void *memmove(void *dest, const void *src, size_t count);

int memcmp(const void *cs, const void *ct, size_t count);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf07..ab241df 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -19,7 +19,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o getuser.o putuser.o
-lib-y += memcpy_$(BITS).o
+lib-y += memcpy_$(BITS).o memcpy_bwd_$(BITS).o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o

obj-y += msr.o msr-reg.o msr-reg-export.o
diff --git a/arch/x86/lib/memcpy_bwd_64.S b/arch/x86/lib/memcpy_bwd_64.S
new file mode 100644
index 0000000..ca894e3
--- /dev/null
+++ b/arch/x86/lib/memcpy_bwd_64.S
@@ -0,0 +1,137 @@
+/* Copyright 2010 Miao Xie */
+
+#include <linux/linkage.h>
+
+#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
+
+/*
+ * __memcpy_bwd - Copy a memory block from the end to the beginning
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * rax original destination
+ */
+
+ .section .altinstr_replacement, "ax", @progbits
+.Lmemcpy_bwd_c:
+ movq %rdi, %rax
+
+ addq %rdx, %rdi
+ addq %rdx, %rsi
+ leaq -8(%rdi), %rdi
+ leaq -8(%rsi), %rsi
+
+ std
+
+ movq %rdx, %rcx
+ shrq $3, %rcx
+ andq $7, %rdx
+ rep movsq
+
+ leaq 8(%rdi), %rdi
+ leaq 8(%rsi), %rsi
+ decq %rsi
+ decq %rdi
+ movq %rdx, %rcx
+ rep movsb
+
+ cld
+ ret
+.Lmemcpy_bwd_e:
+ .previous
+
+ENTRY(__memcpy_bwd)
+ CFI_STARTPROC
+
+ movq %rdi, %rax
+
+ addq %rdx, %rdi
+ addq %rdx, %rsi
+
+ movq %rdx, %rcx
+ shrq $6, %rcx
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decq %rcx
+
+ leaq -64(%rdi), %rdi
+ leaq -64(%rsi), %rsi
+
+ movq 7*8(%rsi), %r11
+ movq 6*8(%rsi), %r8
+ movq %r11, 7*8(%rdi)
+ movq %r8, 6*8(%rdi)
+
+ movq 5*8(%rsi), %r9
+ movq 4*8(%rsi), %r10
+ movq %r9, 5*8(%rdi)
+ movq %r10, 4*8(%rdi)
+
+ movq 3*8(%rsi), %r11
+ movq 2*8(%rsi), %r8
+ movq %r11, 3*8(%rdi)
+ movq %r8, 2*8(%rdi)
+
+ movq 1*8(%rsi), %r9
+ movq 0*8(%rsi), %r10
+ movq %r9, 1*8(%rdi)
+ movq %r10, 0*8(%rdi)
+
+ jnz .Lloop_64
+
+.Lhandle_tail:
+ movq %rdx, %rcx
+ andq $63, %rcx
+ shrq $3, %rcx
+ jz .Lhandle_7
+
+ .p2align 4
+.Lloop_8:
+ decq %rcx
+
+ leaq -8(%rsi), %rsi
+ leaq -8(%rdi), %rdi
+
+ movq (%rsi), %r8
+ movq %r8, (%rdi)
+
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movq %rdx, %rcx
+ andq $7, %rcx
+ jz .Lend
+
+ .p2align 4
+.Lloop_1:
+ decq %rcx
+
+ decq %rsi
+ decq %rdi
+
+ movb (%rsi), %r8b
+ movb %r8b, (%rdi)
+
+ jnz .Lloop_1
+
+.Lend:
+ ret
+ CFI_ENDPROC
+ENDPROC(__memcpy_bwd)
+
+ .section .altinstructions, "a"
+ .align 8
+ .quad __memcpy_bwd
+ .quad .Lmemcpy_bwd_c
+ .word X86_FEATURE_REP_GOOD
+
+ .byte .Lmemcpy_bwd_e - .Lmemcpy_bwd_c
+ .byte .Lmemcpy_bwd_e - .Lmemcpy_bwd_c
+ .previous
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..bd4cbcc 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,16 @@
#undef memmove
void *memmove(void *dest, const void *src, size_t count)
{
- if (dest < src) {
+ if (dest < src || dest - src >= count)
return memcpy(dest, src, count);
- } else {
+ else if (count <= 64) {
char *p = dest + count;
const char *s = src + count;
while (count--)
*--p = *--s;
- }
- return dest;
+
+ return dest;
+ } else
+ return __memcpy_bwd(dest, src, count);
}
EXPORT_SYMBOL(memmove);
--
1.7.0.1


2010-09-16 06:48:29

by Andi Kleen

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

> When the dest and the src do overlap and the memory area is large, memmove
> of
> x86_64 is very inefficient, and it led to bad performance, such as btrfs's
> file
> deletion performance. This patch improved the performance of memmove on
> x86_64
> by using __memcpy_bwd() instead of byte copy when doing large memory area
> copy
> (len > 64).


I still don't understand why you don't simply use a backwards
string copy (with std) ? That should be much simpler and
hopefully be as optimized for kernel copies on recent CPUs.

-Andi



2010-09-16 07:16:31

by Miao Xie

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
>> When the dest and the src do overlap and the memory area is large, memmove
>> of
>> x86_64 is very inefficient, and it led to bad performance, such as btrfs's
>> file
>> deletion performance. This patch improved the performance of memmove on
>> x86_64
>> by using __memcpy_bwd() instead of byte copy when doing large memory area
>> copy
>> (len> 64).
>
>
> I still don't understand why you don't simply use a backwards
> string copy (with std) ? That should be much simpler and
> hopefully be as optimized for kernel copies on recent CPUs.

But according to the comment of memcpy, some CPUs don't support the "REP" instruction,
so I think we must implement a backwards string copy by some other method for those CPUs.
But that implementation is complex, so I wrote it as a separate function -- __memcpy_bwd().

Thanks!
Miao

2010-09-16 08:40:14

by Andi Kleen

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

On Thu, 16 Sep 2010 15:16:31 +0800
Miao Xie <[email protected]> wrote:

> On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
> >> When the dest and the src do overlap and the memory area is large,
> >> memmove of
> >> x86_64 is very inefficient, and it led to bad performance, such as
> >> btrfs's file
> >> deletion performance. This patch improved the performance of
> >> memmove on x86_64
> >> by using __memcpy_bwd() instead of byte copy when doing large
> >> memory area copy
> >> (len> 64).
> >
> >
> > I still don't understand why you don't simply use a backwards
> > string copy (with std) ? That should be much simpler and
> > hopefully be as optimized for kernel copies on recent CPUs.
>
> But according to the comment of memcpy, some CPUs don't support "REP"
> instruction,

I think you misread the comment. REP prefixes are in all x86 CPUs.
On some very old systems it wasn't optimized very well,
but it probably doesn't make too much sense to optimize for those.
On newer CPUs in fact REP should be usually faster than
an unrolled loop.

I'm not sure how optimized the backwards copy is, but most likely
it is optimized too.

Here's an untested patch that implements backwards copy with string
instructions. Could you run it through your test harness?

-Andi


Implement the 64bit memmove backwards case using string instructions

Signed-off-by: Andi Kleen <[email protected]>

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e..6e8258d 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -141,3 +141,28 @@ ENDPROC(__memcpy)
.byte .Lmemcpy_e - .Lmemcpy_c
.byte .Lmemcpy_e - .Lmemcpy_c
.previous
+
+/*
+ * Copy memory backwards (for memmove)
+ * rdi target
+ * rsi source
+ * rdx count
+ */
+
+ENTRY(memcpy_backwards):
+ CFI_STARTPROC
+ std
+ movq %rdi, %rax
+ movl %edx, %ecx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ shrl $3, %ecx
+ andl $7, %edx
+ rep movsq
+ movl %edx, %ecx
+ rep movsb
+ cld
+ ret
+ CFI_ENDPROC
+ENDPROC(memcpy_backwards)
+
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..6c00304 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -5,16 +5,16 @@
#include <linux/string.h>
#include <linux/module.h>

+extern void asmlinkage memcpy_backwards(void *dst, const void *src,
+ size_t count);
+
#undef memmove
void *memmove(void *dest, const void *src, size_t count)
{
if (dest < src) {
return memcpy(dest, src, count);
} else {
- char *p = dest + count;
- const char *s = src + count;
- while (count--)
- *--p = *--s;
+ return memcpy_backwards(dest, src, count);
}
return dest;
}



--
[email protected] -- Speaking for myself only.

2010-09-16 09:29:26

by Miao Xie

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

On Thu, 16 Sep 2010 10:40:08 +0200, Andi Kleen wrote:
> On Thu, 16 Sep 2010 15:16:31 +0800
> Miao Xie<[email protected]> wrote:
>
>> On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
>>>> When the dest and the src do overlap and the memory area is large,
>>>> memmove of
>>>> x86_64 is very inefficient, and it led to bad performance, such as
>>>> btrfs's file
>>>> deletion performance. This patch improved the performance of
>>>> memmove on x86_64
>>>> by using __memcpy_bwd() instead of byte copy when doing large
>>>> memory area copy
>>>> (len> 64).
>>>
>>>
>>> I still don't understand why you don't simply use a backwards
>>> string copy (with std) ? That should be much simpler and
>>> hopefully be as optimized for kernel copies on recent CPUs.
>>
>> But according to the comment of memcpy, some CPUs don't support "REP"
>> instruction,
>
> I think you misread the comment. REP prefixes are in all x86 CPUs.
> On some very old systems it wasn't optimized very well,
> but it probably doesn't make too much sense to optimize for those.
> On newer CPUs in fact REP should be usually faster than
> an unrolled loop.
>
> I'm not sure how optimized the backwards copy is, but most likely
> it is optimized too.
>
> Here's an untested patch that implements backwards copy with string
> instructions. Could you run it through your test harness?

Ok, I'll do it.

> +
> +/*
> + * Copy memory backwards (for memmove)
> + * rdi target
> + * rsi source
> + * rdx count
> + */
> +
> +ENTRY(memcpy_backwards):

s/://

> + CFI_STARTPROC
> + std
> + movq %rdi, %rax
> + movl %edx, %ecx
> + add %rdx, %rdi
> + add %rdx, %rsi

- add %rdx, %rdi
- add %rdx, %rsi
+ addq %rdx, %rdi
+ addq %rdx, %rsi

Besides that, the address that %rdi/%rsi points to is past the end of the
memory area that is going to be copied, so we must adjust the addresses to be
correct (see the worked example further below):
+ leaq -8(%rdi), %rdi
+ leaq -8(%rsi), %rsi

> + shrl $3, %ecx
> + andl $7, %edx
> + rep movsq

The same as above.
+ leaq 8(%rdi), %rdi
+ leaq 8(%rsi), %rsi
+ decq %rsi
+ decq %rdi
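
To see why both adjustments are needed, here is a worked example of the
pointer math as a standalone sketch (count = 20, i.e. two qwords plus a
4-byte tail, and a made-up source address; only %rsi is tracked, %rdi behaves
the same way):

#include <assert.h>
#include <stddef.h>

int main(void)
{
        size_t S = 0x1000;              /* hypothetical source address */
        size_t count = 20;
        size_t rsi = S + count;         /* after "addq %rdx, %rsi": one past the end */

        rsi -= 8;                       /* "leaq -8(%rsi), %rsi": the first movsq must
                                           read the top qword, bytes 12..19 */
        assert(rsi == S + 12);

        rsi -= 2 * 8;                   /* two rep movsq iterations with DF set */
        assert(rsi == S - 4);           /* now 4 bytes below the remaining tail */

        rsi += 7;                       /* "leaq 8(%rsi), %rsi; decq %rsi" (net +7):
                                           the first movsb must read the last tail
                                           byte, offset 3 */
        assert(rsi == S + 3);
        return 0;
}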

> + movl %edx, %ecx
> + rep movsb
> + cld
> + ret
> + CFI_ENDPROC
> +ENDPROC(memcpy_backwards)
> +
> diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
> index 0a33909..6c00304 100644
> --- a/arch/x86/lib/memmove_64.c
> +++ b/arch/x86/lib/memmove_64.c
> @@ -5,16 +5,16 @@
> #include<linux/string.h>
> #include<linux/module.h>
>
> +extern void asmlinkage memcpy_backwards(void *dst, const void *src,
> + size_t count);

The type of the return value must be "void *".

Thanks
Miao

> +
> #undef memmove
> void *memmove(void *dest, const void *src, size_t count)
> {
> if (dest< src) {
> return memcpy(dest, src, count);
> } else {
> - char *p = dest + count;
> - const char *s = src + count;
> - while (count--)
> - *--p = *--s;
> + return memcpy_backwards(dest, src, count);
> }
> return dest;
> }
>

2010-09-16 10:11:41

by Andi Kleen

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

On Thu, 16 Sep 2010 17:29:32 +0800
Miao Xie <[email protected]> wrote:


Ok was a very broken patch. Sorry should have really done some more
work on it. Anyways hopefully the corrected version is good for
testing.

-Andi


--
[email protected] -- Speaking for myself only.

2010-09-16 10:47:53

by Miao Xie

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

On Thu, 16 Sep 2010 12:11:41 +0200, Andi Kleen wrote:
> On Thu, 16 Sep 2010 17:29:32 +0800
> Miao Xie<[email protected]> wrote:
>
>
> Ok was a very broken patch. Sorry should have really done some more
> work on it. Anyways hopefully the corrected version is good for
> testing.
>
> -Andi
>

title: x86_64/lib: improve the performance of memmove

Implement the 64bit memmove backwards case using string instructions

Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Miao Xie <[email protected]>
---
arch/x86/lib/memcpy_64.S  |   29 +++++++++++++++++++++++++++++
arch/x86/lib/memmove_64.c |    8 ++++----
2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e..9de5e9a 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -141,3 +141,32 @@ ENDPROC(__memcpy)
.byte .Lmemcpy_e - .Lmemcpy_c
.byte .Lmemcpy_e - .Lmemcpy_c
.previous
+
+/*
+ * Copy memory backwards (for memmove)
+ * rdi target
+ * rsi source
+ * rdx count
+ */
+
+ENTRY(memcpy_backwards)
+ CFI_STARTPROC
+ std
+ movq %rdi, %rax
+ movl %edx, %ecx
+ addq %rdx, %rdi
+ addq %rdx, %rsi
+ leaq -8(%rdi), %rdi
+ leaq -8(%rsi), %rsi
+ shrl $3, %ecx
+ andl $7, %edx
+ rep movsq
+ addq $7, %rdi
+ addq $7, %rsi
+ movl %edx, %ecx
+ rep movsb
+ cld
+ ret
+ CFI_ENDPROC
+ENDPROC(memcpy_backwards)
+
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909..6774fd8 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -5,16 +5,16 @@
#include <linux/string.h>
#include <linux/module.h>

+extern void * asmlinkage memcpy_backwards(void *dst, const void *src,
+ size_t count);
+
#undef memmove
void *memmove(void *dest, const void *src, size_t count)
{
if (dest < src) {
return memcpy(dest, src, count);
} else {
- char *p = dest + count;
- const char *s = src + count;
- while (count--)
- *--p = *--s;
+ return memcpy_backwards(dest, src, count);
}
return dest;
}
--
1.7.0.1


2010-09-16 11:47:19

by Miao Xie

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

On Thu, 16 Sep 2010 18:47:59 +0800, Miao Xie wrote:
> On Thu, 16 Sep 2010 12:11:41 +0200, Andi Kleen wrote:
>> On Thu, 16 Sep 2010 17:29:32 +0800
>> Miao Xie<[email protected]> wrote:
>>
>>
>> Ok was a very broken patch. Sorry should have really done some more
>> work on it. Anyways hopefully the corrected version is good for
>> testing.
>>
>> -Andi

The test results are as follows:
Len    Src Unalign   Dest Unalign   Patch applied   Without Patch
----   -----------   ------------   -------------   -------------
8      0             0              0s 421117us     0s 70203us
8      0             3              0s 252622us     0s 42114us
8      0             7              0s 252663us     0s 42111us
8      3             0              0s 252666us     0s 42114us
8      3             3              0s 252667us     0s 42113us
8      3             7              0s 252667us     0s 42112us
32     0             0              0s 252672us     0s 114301us
32     0             3              0s 252676us     0s 114306us
32     0             7              0s 252663us     0s 114300us
32     3             0              0s 252661us     0s 114305us
32     3             3              0s 252663us     0s 114300us
32     3             7              0s 252668us     0s 114304us
64     0             0              0s 252672us     0s 236119us
64     0             3              0s 264671us     0s 236120us
64     0             7              0s 264702us     0s 236127us
64     3             0              0s 270701us     0s 236128us
64     3             3              0s 287236us     0s 236809us
64     3             7              0s 287257us     0s 236123us

According to the above results, the old version is better than the new one
when the memory area is small.

Len    Src Unalign   Dest Unalign   Patch applied   Without Patch
----   -----------   ------------   -------------   -------------
256    0             0              0s 281886us     0s 813660us
256    0             3              0s 332169us     0s 813645us
256    0             7              0s 342961us     0s 813639us
256    3             0              0s 305305us     0s 813634us
256    3             3              0s 386939us     0s 813638us
256    3             7              0s 370511us     0s 814335us
512    0             0              0s 310716us     1s 584677us
512    0             3              0s 456420us     1s 583353us
512    0             7              0s 468236us     1s 583248us
512    3             0              0s 493987us     1s 583659us
512    3             3              0s 588041us     1s 584294us
512    3             7              0s 605489us     1s 583650us
1024   0             0              0s 406971us     3s 123644us
1024   0             3              0s 748419us     3s 126514us
1024   0             7              0s 756153us     3s 127178us
1024   3             0              0s 854681us     3s 130013us
1024   3             3              1s 46828us      3s 140190us
1024   3             7              1s 35886us      3s 135508us

The new version is better when the memory area is large.

Thanks!
Miao

>>
>
> title: x86_64/lib: improve the performance of memmove
>
> Implement the 64bit memmmove backwards case using string instructions
>
> Signed-off-by: Andi Kleen <[email protected]>
> Signed-off-by: Miao Xie <[email protected]>
> ---
> arch/x86/lib/memcpy_64.S | 29 +++++++++++++++++++++++++++++
> arch/x86/lib/memmove_64.c | 8 ++++----
> 2 files changed, 33 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index bcbcd1e..9de5e9a 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -141,3 +141,32 @@ ENDPROC(__memcpy)
> .byte .Lmemcpy_e - .Lmemcpy_c
> .byte .Lmemcpy_e - .Lmemcpy_c
> .previous
> +
> +/*
> + * Copy memory backwards (for memmove)
> + * rdi target
> + * rsi source
> + * rdx count
> + */
> +
> +ENTRY(memcpy_backwards)
> + CFI_STARTPROC
> + std
> + movq %rdi, %rax
> + movl %edx, %ecx
> + addq %rdx, %rdi
> + addq %rdx, %rsi
> + leaq -8(%rdi), %rdi
> + leaq -8(%rsi), %rsi
> + shrl $3, %ecx
> + andl $7, %edx
> + rep movsq
> + addq $7, %rdi
> + addq $7, %rsi
> + movl %edx, %ecx
> + rep movsb
> + cld
> + ret
> + CFI_ENDPROC
> +ENDPROC(memcpy_backwards)
> +
> diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
> index 0a33909..6774fd8 100644
> --- a/arch/x86/lib/memmove_64.c
> +++ b/arch/x86/lib/memmove_64.c
> @@ -5,16 +5,16 @@
> #include <linux/string.h>
> #include <linux/module.h>
>
> +extern void * asmlinkage memcpy_backwards(void *dst, const void *src,
> + size_t count);
> +
> #undef memmove
> void *memmove(void *dest, const void *src, size_t count)
> {
> if (dest < src) {
> return memcpy(dest, src, count);
> } else {
> - char *p = dest + count;
> - const char *s = src + count;
> - while (count--)
> - *--p = *--s;
> + return memcpy_backwards(dest, src, count);
> }
> return dest;
> }


2010-09-16 12:13:09

by George Spelvin

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

> void *memmove(void *dest, const void *src, size_t count)
> {
> if (dest < src) {
> return memcpy(dest, src, count);
> } else {
> - char *p = dest + count;
> - const char *s = src + count;
> - while (count--)
> - *--p = *--s;
> + return memcpy_backwards(dest, src, count);
> }
> return dest;
> }

Er... presumably, the forward-copy case is somewhat better optimized, so it
should be preferred whenever the areas don't overlap; that is, when dest
exceeds src by at least the count. Assuming that size_t can hold a pointer:

        if ((size_t)dest - (size_t)src >= count)
                return memcpy(dest, src, count);
        else
                return memcpy_backwards(dest, src, count);

Or, using GCC's arithmetic on void * extension,
        if ((size_t)(dest - src) >= count)
        ... etc.

If src == dest, it doesn't matter which you use. You could skip the
copy entirely, but presumably that case doesn't arise often enough to
be worth testing for:

        if ((size_t)(dest - src) >= count)
                return memcpy(dest, src, count);
        else if (dest != src)
                return memcpy_backwards(dest, src, count);
        else
                return dest;
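
A quick sanity check of that single unsigned comparison, as a standalone
sketch with made-up addresses (treating the pointers purely as integers):

#include <assert.h>
#include <stddef.h>

int main(void)
{
        size_t count = 100;

        /* dest below src: the difference wraps to a huge value -> forward memcpy */
        assert((size_t)0x1000 - (size_t)0x2000 >= count);
        /* dest above src and past the end of it: no overlap -> forward memcpy */
        assert((size_t)0x2080 - (size_t)0x2000 >= count);
        /* dest inside the source region: overlap -> backwards copy */
        assert(!((size_t)0x2010 - (size_t)0x2000 >= count));
        return 0;
}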

2010-09-17 00:55:18

by Zhao, Yakui

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

On Thu, 2010-09-16 at 15:16 +0800, Miao Xie wrote:
> On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
> >> When the dest and the src do overlap and the memory area is large, memmove
> >> of
> >> x86_64 is very inefficient, and it led to bad performance, such as btrfs's
> >> file
> >> deletion performance. This patch improved the performance of memmove on
> >> x86_64
> >> by using __memcpy_bwd() instead of byte copy when doing large memory area
> >> copy
> >> (len> 64).
> >
> >
> > I still don't understand why you don't simply use a backwards
> > string copy (with std) ? That should be much simpler and
> > hopefully be as optimized for kernel copies on recent CPUs.
>
> But according to the comment of memcpy, some CPUs don't support "REP" instruction,

Where do you find that the "REP" instruction is not supported on some
CPUs? The comment in arch/x86/lib/memcpy_64.S only states that some CPUs
will run faster when using the string copy instructions.

> so I think we must implement a backwards string copy by other method for those CPUs,
> But that implement is complex, so I write it as a function -- __memcpy_bwd().

Will you please look at the tip/x86 tree (mem branch)? The memory copy on
x86_64 is already optimized there.

thanks.
Yakui
>
> Thanks!
> Miao


2010-09-17 03:37:31

by Miao Xie

Subject: Re: [PATCH] x86_64/lib: improve the performance of memmove

On Fri, 17 Sep 2010 08:55:18 +0800, ykzhao wrote:
> On Thu, 2010-09-16 at 15:16 +0800, Miao Xie wrote:
>> On Thu, 16 Sep 2010 08:48:25 +0200 (cest), Andi Kleen wrote:
>>>> When the dest and the src do overlap and the memory area is large, memmove
>>>> of
>>>> x86_64 is very inefficient, and it led to bad performance, such as btrfs's
>>>> file
>>>> deletion performance. This patch improved the performance of memmove on
>>>> x86_64
>>>> by using __memcpy_bwd() instead of byte copy when doing large memory area
>>>> copy
>>>> (len> 64).
>>>
>>>
>>> I still don't understand why you don't simply use a backwards
>>> string copy (with std) ? That should be much simpler and
>>> hopefully be as optimized for kernel copies on recent CPUs.
>>
>> But according to the comment of memcpy, some CPUs don't support "REP" instruction,
>
> Where do you find that the "REP" instruction is not supported on some
> CPUs? The comment in arch/x86/lib/memcpy_64.s only states that some CPUs
> will run faster when using string copy instruction.

Sorry! I misread the comment.

>> so I think we must implement a backwards string copy by other method for those CPUs,
>> But that implement is complex, so I write it as a function -- __memcpy_bwd().
>
> Will you please look at tip/x86/ tree(mem branch)? The memory copy on
> x86_64 is already optimized.

Thanks for the reminder! It is very helpful.
Miao

> thanks.
> Yakui
>>
>> Thanks!
>> Miao