2005-03-29 14:37:37

by Denis Vlasenko

Subject: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel

Try the testcase below the sig.

This causes nearly one thousand calls to memcpy in my kernel
(not an allyesconfig one):

# objdump -d vmlinux | grep -F '<memcpy>' | wc -l
959

# gcc -O2 -c t.c
# objdump -r -d t.o

t.o: file format elf32-i386

Disassembly of section .text:

00000000 <f3>:
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 83 ec 0c sub $0xc,%esp
6: 6a 03 push $0x3
8: ff 75 0c pushl 0xc(%ebp)
b: ff 75 08 pushl 0x8(%ebp)
e: e8 fc ff ff ff call f <f3+0xf>
f: R_386_PC32 memcpy
13: 83 c4 10 add $0x10,%esp
16: c9 leave
17: c3 ret

00000018 <f3b>:
18: 55 push %ebp
19: 89 e5 mov %esp,%ebp
1b: 8b 55 0c mov 0xc(%ebp),%edx
1e: 66 8b 02 mov (%edx),%ax
21: 8b 4d 08 mov 0x8(%ebp),%ecx
24: 66 89 01 mov %ax,(%ecx)
27: 8a 42 02 mov 0x2(%edx),%al
2a: 88 41 02 mov %al,0x2(%ecx)
2d: c9 leave
2e: c3 ret
2f: 90 nop

00000030 <f3k>:
30: 55 push %ebp
31: 89 e5 mov %esp,%ebp
33: 57 push %edi
34: 56 push %esi
35: 8b 7d 08 mov 0x8(%ebp),%edi
38: 8b 75 0c mov 0xc(%ebp),%esi
3b: b9 ee 02 00 00 mov $0x2ee,%ecx
40: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
42: 5e pop %esi
43: 5f pop %edi
44: c9 leave
45: c3 ret


--
vda


typedef unsigned int size_t;

static inline void *__memcpy(void *to, const void *from, size_t n)
{
        int d0, d1, d2;
        __asm__ __volatile__(
                "rep ; movsl\n\t"
                "testb $2,%b4\n\t"
                "je 1f\n\t"
                "movsw\n"
                "1:\ttestb $1,%b4\n\t"
                "je 2f\n\t"
                "movsb\n"
                "2:"
                : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                : "0" (n/4), "q" (n), "1" ((long) to), "2" ((long) from)
                : "memory");
        return to;
}

/*
* This looks horribly ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
static inline void *__constant_memcpy(void *to, const void *from, size_t n)
{
        if (n <= 128)
                return __builtin_memcpy(to, from, n);

#define COMMON(x) \
        __asm__ __volatile__( \
                "rep ; movsl" \
                x \
                : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
                : "0" (n/4), "1" ((long) to), "2" ((long) from) \
                : "memory");
        {
                int d0, d1, d2;
                switch (n % 4) {
                case 0: COMMON(""); return to;
                case 1: COMMON("\n\tmovsb"); return to;
                case 2: COMMON("\n\tmovsw"); return to;
                default: COMMON("\n\tmovsw\n\tmovsb"); return to;
                }
        }

#undef COMMON
}

#define memcpy(t, f, n) \
        (__builtin_constant_p(n) ? \
         __constant_memcpy((t), (f), (n)) : \
         __memcpy((t), (f), (n)))

void f3(char *a, char *b)  { memcpy(a, b, 3); }
void f3b(char *a, char *b) { __builtin_memcpy(a, b, 3); }
void f3k(char *a, char *b) { memcpy(a, b, 3000); }


2005-03-29 15:06:26

by Richard Biener

Subject: Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel

On Tue, 29 Mar 2005 17:37:06 +0300, Denis Vlasenko <[email protected]> wrote:
> Try the testcase below the sig.
>
> This causes nearly one thousand calls to memcpy in my kernel
> (not an allyesconfig one):

> static inline void * __memcpy(void * to, const void * from, size_t n)
> {
> int d0, d1, d2;
> __asm__ __volatile__(
> "rep ; movsl\n\t"
> "testb $2,%b4\n\t"
> "je 1f\n\t"
> "movsw\n"
> "1:\ttestb $1,%b4\n\t"
> "je 2f\n\t"
> "movsb\n"
> "2:"
> : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
> : "memory");
> return (to);
> }

The question is, what reason does -Winline give for this inlining
decision? And then, of course, how is the size estimate computed for
the above? What kind of tree node do we get for the ASM expression?

Richard.

2005-03-29 15:10:22

by Nathan Sidwell

Subject: Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel

Denis Vlasenko wrote:

> Disassembly of section .text:
>

> e: e8 fc ff ff ff call f <f3+0xf>
> f: R_386_PC32 memcpy

> #define memcpy(t, f, n) \
> (__builtin_constant_p(n) ? \
> __constant_memcpy((t),(f),(n)) : \
> __memcpy((t),(f),(n)))

Given this #define, how can 'memcpy' appear in the object file? It
appears that something odd is happening with preprocessing. Check that
the .i files are as you expect; the -dD and -E options will be helpful.

nathan

--
Nathan Sidwell :: http://www.codesourcery.com :: CodeSourcery LLC
[email protected] :: http://www.planetfall.pwp.blueyonder.co.uk

2005-03-29 15:17:10

by Jakub Jelinek

Subject: Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel

On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> typedef unsigned int size_t;
>
> static inline void * __memcpy(void * to, const void * from, size_t n)
> {
> int d0, d1, d2;
> __asm__ __volatile__(
> "rep ; movsl\n\t"
> "testb $2,%b4\n\t"
> "je 1f\n\t"
> "movsw\n"
> "1:\ttestb $1,%b4\n\t"
> "je 2f\n\t"
> "movsb\n"
> "2:"
> : "=&c" (d0), "=&D" (d1), "=&S" (d2)
> :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from)
> : "memory");
> return (to);
> }
>
> /*
> * This looks horribly ugly, but the compiler can optimize it totally,
> * as the count is constant.
> */
> static inline void * __constant_memcpy(void * to, const void * from, size_t n)
> {
> if (n <= 128)
> return __builtin_memcpy(to, from, n);
>
> #define COMMON(x) \
> __asm__ __volatile__( \
> "rep ; movsl" \
> x \
> : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
> : "0" (n/4),"1" ((long) to),"2" ((long) from) \
> : "memory");
> {
> int d0, d1, d2;
> switch (n % 4) {
> case 0: COMMON(""); return to;
> case 1: COMMON("\n\tmovsb"); return to;
> case 2: COMMON("\n\tmovsw"); return to;
> default: COMMON("\n\tmovsw\n\tmovsb"); return to;
> }
> }
>
> #undef COMMON
> }
>
> #define memcpy(t, f, n) \
> (__builtin_constant_p(n) ? \
> __constant_memcpy((t),(f),(n)) : \
> __memcpy((t),(f),(n)))
>
> void f3(char *a, char *b) { memcpy(a,b,3); }

The problem is that in GCC < 4.0 there is no constant propagation
pass before expanding builtin functions, so the __builtin_memcpy
call above sees a variable rather than a constant.

Either use GCC 4.0+, where this works just fine, or move the
n <= 128 case into the macro:
#define memcpy(t, f, n) \
  (__builtin_constant_p(n) ? \
   ((n) <= 128 ? __builtin_memcpy(t,f,n) : __constant_memcpy(t,f,n)) : \
   __memcpy(t,f,n))

Jakub

2005-03-29 15:42:20

by Andrew Pinski

Subject: Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel

>
> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> > /*
> > * This looks horribly ugly, but the compiler can optimize it totally,
> > * as the count is constant.
> > */
> > static inline void * __constant_memcpy(void * to, const void * from, size_t n)
> > {
> > if (n <= 128)
> > return __builtin_memcpy(to, from, n);
> The problem is that in GCC < 4.0 there is no constant propagation
> pass before expanding builtin functions, so the __builtin_memcpy
> call above sees a variable rather than a constant.

Or changing "size_t n" to "const size_t n" will also fix the issue:
before 4.0.0 we do some constant propagation on the trees (well, very
little, and only with inlining and const values) before expanding the
builtin.
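
(A minimal sketch of the suggested change, with hypothetical
declarations; the const qualifier lets pre-4.0 gcc's limited
tree-level constant propagation see the inlined size as a constant:)

typedef unsigned int size_t;

void *__memcpy(void *to, const void *from, size_t n); /* out-of-line fallback */

/* The only change is the const qualifier on n: */
static inline void *__constant_memcpy(void *to, const void *from,
                                      const size_t n)
{
        if (n <= 128)
                return __builtin_memcpy(to, from, n); /* now expands inline */
        return __memcpy(to, from, n);
}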

-- Pinski

2005-03-29 20:30:36

by Denis Vlasenko

Subject: [PATCH] fix i386 memcpy

This patch shortens non-constant memcpy() by two bytes
and fixes spurious out-of-line constant memcpy().

Patch is run-tested (I am running the patched kernel right now).

Benchmark and code generation test program will be mailed as reply.

# size vmlinux.org vmlinux
text data bss dec hex filename
3954591 1553426 236544 5744561 57a7b1 vmlinux.org
3952615 1553426 236544 5742585 579ff9 vmlinux

Example of changes (part of dump_fpu() body):
old.............................................. new......................
8d 83 40 02 00 00 lea 0x240(%ebx),%eax 8d b3 40 02 00 00 lea 0x240(%ebx),%esi
74 31 je c0108b27 <dump_fpu+0x9c> 74 2e je c0108b1d <dump_fpu+0x92>
6a 1c push $0x1c 8b 7d 0c mov 0xc(%ebp),%edi
50 push %eax b9 07 00 00 00 mov $0x7,%ecx
56 push %esi f3 a5 repz movsl %ds:(%esi),%es:(%edi)
e8 49 21 10 00 call c020ac48 <memcpy> 8b 55 0c mov 0xc(%ebp),%edx
83 c4 0c add $0xc,%esp 83 c2 1c add $0x1c,%edx
83 c6 1c add $0x1c,%esi 8d 83 60 02 00 00 lea 0x260(%ebx),%eax
81 c3 60 02 00 00 add $0x260,%ebx b9 07 00 00 00 mov $0x7,%ecx
bf 07 00 00 00 mov $0x7,%edi 89 d7 mov %edx,%edi
6a 0a push $0xa 89 c6 mov %eax,%esi
53 push %ebx a5 movsl %ds:(%esi),%es:(%edi)
56 push %esi a5 movsl %ds:(%esi),%es:(%edi)
e8 2f 21 10 00 call c020ac48 <memcpy> 66 a5 movsw %ds:(%esi),%es:(%edi)
83 c4 0c add $0xc,%esp 83 c2 0a add $0xa,%edx
83 c6 0a add $0xa,%esi 83 c0 10 add $0x10,%eax
83 c3 10 add $0x10,%ebx 49 dec %ecx
4f dec %edi 79 ef jns c0108b0a <dump_fpu+0x7f>
79 eb jns c0108b10 <dump_fpu+0x85> eb 0a jmp c0108b27 <dump_fpu+0x9c>
eb 0c jmp c0108b33 <dump_fpu+0xa8> 8b 7d 0c mov 0xc(%ebp),%edi
6a 6c push $0x6c b9 1b 00 00 00 mov $0x1b,%ecx
50 push %eax f3 a5 repz movsl %ds:(%esi),%es:(%edi)
56 push %esi 8b 45 f0 mov 0xfffffff0(%ebp),%eax
e8 18 21 10 00 call c020ac48 <memcpy> 5a pop %edx
83 c4 0c add $0xc,%esp 5b pop %ebx
8b 45 f0 mov 0xfffffff0(%ebp),%eax
8d 65 f4 lea 0xfffffff4(%ebp),%esp
5b pop %ebx
5e pop %esi

--
vda


Attachments:
(No filename) (2.89 kB)
string.memcpy.diff (2.88 kB)

2005-03-29 20:29:20

by Denis Vlasenko

Subject: Re: [PATCH] fix i386 memcpy

On Tuesday 29 March 2005 23:22, Denis Vlasenko wrote:
> This patch shortens non-constant memcpy() by two bytes
> and fixes spurious out-of-line constant memcpy().
>
> Patch is run-tested (I am running the patched kernel right now).
>
> Benchmark and code generation test program will be mailed as reply.


Attachments:
(No filename) (298.00 B)
bench.c (3.67 kB)
codecheck.c (5.85 kB)

2005-03-30 01:27:11

by Gerold Jury

Subject: Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel


>> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
>> > /*
>> > * This looks horribly ugly, but the compiler can optimize it totally,
>> > * as the count is constant.
>> > */
>> > static inline void * __constant_memcpy(void * to, const void * from,
>> > size_t n) {
>> > if (n <= 128)
>> > return __builtin_memcpy(to, from, n);
>>
>> The problem is that in GCC < 4.0 there is no constant propagation
>> pass before expanding builtin functions, so the __builtin_memcpy
>> call above sees a variable rather than a constant.
>
>Or changing "size_t n" to "const size_t n" will also fix the issue:
>before 4.0.0 we do some constant propagation on the trees (well, very
>little, and only with inlining and const values) before expanding the
>builtin.
>
>-- Pinski
>-
I used the following "const size_t n" change on x86_64
and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3.
(kernel 2.6.12-rc1, running now)

--- include/asm-x86_64/string.h.~1~     2005-03-02 08:38:33.000000000 +0100
+++ include/asm-x86_64/string.h 2005-03-30 03:24:35.000000000 +0200
@@ -28,9 +28,9 @@
    function. */
 
 #define __HAVE_ARCH_MEMCPY 1
-extern void *__memcpy(void *to, const void *from, size_t len);
+extern void *__memcpy(void *to, const void *from, const size_t len);
 #define memcpy(dst,src,len) \
-        ({ size_t __len = (len); \
+        ({ const size_t __len = (len); \
         void *__ret; \
         if (__builtin_constant_p(len) && __len >= 64) \
                 __ret = __memcpy((dst),(src),__len); \

2005-03-30 06:17:01

by Denis Vlasenko

Subject: Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel

On Wednesday 30 March 2005 05:27, Gerold Jury wrote:
>
> >> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> >> > /*
> >> > * This looks horribly ugly, but the compiler can optimize it totally,
> >> > * as the count is constant.
> >> > */
> >> > static inline void * __constant_memcpy(void * to, const void * from,
> >> > size_t n) {
> >> > if (n <= 128)
> >> > return __builtin_memcpy(to, from, n);
> >>
> >> The problem is that in GCC < 4.0 there is no constant propagation
> >> pass before expanding builtin functions, so the __builtin_memcpy
> >> call above sees a variable rather than a constant.
> >
> >Or changing "size_t n" to "const size_t n" will also fix the issue:
> >before 4.0.0 we do some constant propagation on the trees (well, very
> >little, and only with inlining and const values) before expanding the
> >builtin.
> >
> >-- Pinski
> >-
> I used the following "const size_t n" change on x86_64
> and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3.
> (kernel 2.6.12-rc1, running now)

What do you mean, 'reduced'?

(/me is checking....)

Oh shit... It still emits half of the memcpys; to be exact, for
struct copies:

arch/i386/kernel/process.c:

int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
                unsigned long unused,
                struct task_struct * p, struct pt_regs * regs)
{
        struct pt_regs * childregs;
        struct task_struct *tsk;
        int err;

        childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
        *childregs = *regs;
        ^^^^^^^^^^^^^^^^^^^
        childregs->eax = 0;
        childregs->esp = esp;

# make arch/i386/kernel/process.s

copy_thread:
        pushl %ebp
        movl %esp, %ebp
        pushl %edi
        pushl %esi
        pushl %ebx
        subl $20, %esp
        movl 24(%ebp), %eax
        movl 4(%eax), %esi
        pushl $60
        leal 8132(%esi), %ebx
        pushl 28(%ebp)
        pushl %ebx
        call memcpy <=================
        movl $0, 24(%ebx)
        movl 16(%ebp), %eax
        movl %eax, 52(%ebx)
        movl 24(%ebp), %edx
        addl $8192, %esi
        movl %ebx, 516(%edx)
        movl %esi, -32(%ebp)
        movl %esi, 504(%edx)
        movl $ret_from_fork, 512(%edx)

Jakub, is there a way to instruct gcc to inline this copy, or better yet,
to use the user-supplied inline version of memcpy?
--
vda

2005-04-01 21:54:38

by Jan Hubicka

Subject: Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel

> On Wednesday 30 March 2005 05:27, Gerold Jury wrote:
> >
> > >> On Tue, Mar 29, 2005 at 05:37:06PM +0300, Denis Vlasenko wrote:
> > >> > /*
> > >> > * This looks horribly ugly, but the compiler can optimize it totally,
> > >> > * as the count is constant.
> > >> > */
> > >> > static inline void * __constant_memcpy(void * to, const void * from,
> > >> > size_t n) {
> > >> > if (n <= 128)
> > >> > return __builtin_memcpy(to, from, n);
> > >>
> > >> The problem is that in GCC < 4.0 there is no constant propagation
> > >> pass before expanding builtin functions, so the __builtin_memcpy
> > >> call above sees a variable rather than a constant.
> > >
> > >Or changing "size_t n" to "const size_t n" will also fix the issue:
> > >before 4.0.0 we do some constant propagation on the trees (well, very
> > >little, and only with inlining and const values) before expanding the
> > >builtin.
> > >
> > >-- Pinski
> > >-
> > I used the following "const size_t n" change on x86_64
> > and it reduced the memcpy count from 1088 to 609 with my setup and gcc 3.4.3.
> > (kernel 2.6.12-rc1, running now)
>
> What do you mean, 'reduced'?
>
> (/me is checking....)
>
> Oh shit... It still emits half of the memcpys; to be exact, for
> struct copies:
>
> arch/i386/kernel/process.c:
>
> int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
> unsigned long unused,
> struct task_struct * p, struct pt_regs * regs)
> {
> struct pt_regs * childregs;
> struct task_struct *tsk;
> int err;
>
> childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
> *childregs = *regs;
> ^^^^^^^^^^^^^^^^^^^
> childregs->eax = 0;
> childregs->esp = esp;
>
> # make arch/i386/kernel/process.s
>
> copy_thread:
> pushl %ebp
> movl %esp, %ebp
> pushl %edi
> pushl %esi
> pushl %ebx
> subl $20, %esp
> movl 24(%ebp), %eax
> movl 4(%eax), %esi
> pushl $60
> leal 8132(%esi), %ebx
> pushl 28(%ebp)
> pushl %ebx
> call memcpy <=================
> movl $0, 24(%ebx)
> movl 16(%ebp), %eax
> movl %eax, 52(%ebx)
> movl 24(%ebp), %edx
> addl $8192, %esi
> movl %ebx, 516(%edx)
> movl %esi, -32(%ebp)
> movl %esi, 504(%edx)
> movl $ret_from_fork, 512(%edx)
>
> Jakub, is there a way to instruct gcc to inline this copy, or better yet,
> to use the user-supplied inline version of memcpy?

You can't inline a struct copy, as it is not a function call in the
first place. You can experiment with -minline-all-stringops, where GCC
will use its own memcpy implementation for this.

Honza
> --
> vda

2005-04-02 12:19:21

by Denis Vlasenko

Subject: Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel

> > childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
> > *childregs = *regs;
> > ^^^^^^^^^^^^^^^^^^^
> > childregs->eax = 0;
> > childregs->esp = esp;
> >
> > # make arch/i386/kernel/process.s
> >
> > copy_thread:
> > pushl %ebp
> > movl %esp, %ebp
> > pushl %edi
> > pushl %esi
> > pushl %ebx
> > subl $20, %esp
> > movl 24(%ebp), %eax
> > movl 4(%eax), %esi
> > pushl $60
> > leal 8132(%esi), %ebx
> > pushl 28(%ebp)
> > pushl %ebx
> > call memcpy <=================
> > movl $0, 24(%ebx)
> >
> > Jakub, is there a way to instruct gcc to inline this copy, or better yet,
> > to use the user-supplied inline version of memcpy?
>
> You can't inline a struct copy, as it is not a function call in the
> first place. You can experiment with -minline-all-stringops, where GCC
> will use its own memcpy implementation for this.

No luck. Actually, memcpy calls are produced with -Os. Adding
-minline-all-stringops changes nothing.

An -O2 compile does inline the copying, however suboptimally.
Pushing/popping esi/edi on the stack is not needed. Also
"mov $1,%ecx; rep; movsl" is rather silly.

Here is what I tested:

t.c:
#define STRUCT1(n) struct s##n { char c[n]; } v##n, w##n; void f##n(void) { v##n = w##n; }
#define STRUCT(n) STRUCT1(n)

STRUCT(1)
STRUCT(2)
STRUCT(3)
STRUCT(4)
STRUCT(5)
STRUCT(6)
STRUCT(7)
STRUCT(8)
STRUCT(9)
STRUCT(10)
STRUCT(11)
STRUCT(12)
STRUCT(13)
STRUCT(14)
STRUCT(15)
STRUCT(16)
STRUCT(17)
STRUCT(18)
STRUCT(19)
STRUCT(20)

mk.sh:
#!/bin/sh

# yeah yeah. push/pop + 1 repetition 'rep movsl'
# 88: 55 push %ebp
# 89: 89 e5 mov %esp,%ebp
# 8b: 57 push %edi
# 8c: 56 push %esi
# 8d: fc cld
# 8e: bf 00 00 00 00 mov $0x0,%edi
# 8f: R_386_32 v7
# 93: be 00 00 00 00 mov $0x0,%esi
# 94: R_386_32 w7
# 98: b9 01 00 00 00 mov $0x1,%ecx
# 9d: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
# 9f: 66 a5 movsw %ds:(%esi),%es:(%edi)
# a1: a4 movsb %ds:(%esi),%es:(%edi)
# a2: 5e pop %esi
# a3: 5f pop %edi
# a4: c9 leave
# a5: c3 ret
# a6: 89 f6 mov %esi,%esi
if false; then
gcc -O2 \
-falign-functions=1 -falign-labels=1 -falign-loops=1 -falign-jumps=1 \
-c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi

# -Os: emits memcpy
if false; then
gcc -nostdinc -isystem /.share/usr/app/gcc-3.4.1/bin/../lib/gcc/i386-pc-linux-gnu/3.4.1/include \
-Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing \
-fno-common -ffreestanding -Os -falign-functions=1 -falign-labels=1 \
-falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \
-mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \
-Wdeclaration-after-statement -c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi

# -march=486: emits horrible tail:
# 271: f3 a5 repz movsl %ds:(%esi),%es:(%edi)
# 273: 5e pop %esi
# 274: 66 a1 10 00 00 00 mov 0x10,%ax
# 276: R_386_32 w19
# 27a: 5f pop %edi
# 27b: 66 a3 10 00 00 00 mov %ax,0x10
# 27d: R_386_32 v19
# 281: 5d pop %ebp
# 282: a0 12 00 00 00 mov 0x12,%al
# 283: R_386_32 w19
# 287: a2 12 00 00 00 mov %al,0x12
# 288: R_386_32 v19
# 28c: c3 ret
if false; then
gcc \
-fno-common -ffreestanding -O2 -falign-functions=1 -falign-labels=1 \
-falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \
-mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \
-Wdeclaration-after-statement \
-c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi

# -Os -minline-all-stringops: still emits memcpy
if true; then
gcc -nostdinc -isystem /.share/usr/app/gcc-3.4.1/bin/../lib/gcc/i386-pc-linux-gnu/3.4.1/include \
-Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing \
-fno-common -ffreestanding -Os -minline-all-stringops -falign-functions=1 -falign-labels=1 \
-falign-loops=1 -falign-jumps=1 -fno-omit-frame-pointer -pipe -msoft-float \
-mpreferred-stack-boundary=2 -fno-unit-at-a-time -march=i486 \
-Wdeclaration-after-statement -c t.c
echo Done; read junk
objdump -d -r t.o | $PAGER
exit
fi

--
vda

2005-04-02 12:27:13

by Denis Vlasenko

Subject: Re: memcpy(a,b,CONST) is not inlined by gcc 3.4.1 in Linux kernel

On Saturday 02 April 2005 15:18, Denis Vlasenko wrote:
> -O2 compile does inline copying, however, suboptimally.
> Pushing/popping esi/edi on the stack is not needed.
> Also "mov $1,ecx; rep; movsl" is rather silly.

I think I am wrong about push/pop. Sorry.

However, the other observation is still valid. You
may wish to compile this updated t.c and see.
--
vda


Attachments:
(No filename) (360.00 B)
t.c (2.45 kB)

2005-04-05 16:34:28

by Christophe Saout

Subject: [BUG mm] "fixed" i386 memcpy inlining buggy

Hi Denis,

the new i386 memcpy macro is a ticking timebomb.

I've been debugging a new mISDN crash, just to find out that a memcpy
was not inlined correctly.

Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).

This source code:

mISDN_pid_t pid;
[...]
memcpy(&st->mgr->pid, &pid, sizeof(mISDN_pid_t));

was compiled as:

lea 0xffffffa4(%ebp),%esi <---- %esi is loaded
( add $0x10,%ebx )
( mov %ebx,%eax ) something else
( call 1613 <test_stack_protocol+0x83> ) %esi preserved
mov 0xffffffa0(%ebp),%edx
mov 0x74(%edx),%edi <---- %edi is loaded
add $0x20,%edi offset in structure added
! mov $0x14,%esi !!!!!! <---- %esi overwritten!
mov %esi,%ecx <---- %ecx loaded
repz movsl %ds:(%esi),%es:(%edi)

Apparently the compiler decided that the value 0x14 could be reused
afterwards (which it does for an inlined memset of the same size some
instructions below) and clobbers %esi.

Looking at the macro:

        __asm__ __volatile__(
                ""
                : "=&D" (edi), "=&S" (esi)
                : "0" ((long) to), "1" ((long) from)
                : "memory"
        );
        }
        if (n >= 5*4) {
                /* large block: use rep prefix */
                int ecx;
                __asm__ __volatile__(
                        "rep ; movsl"
                        : "=&c" (ecx)

it seems obvious that the compiler assumes it can reuse %esi and %edi
for something else between the two __asm__ sections. These should
probably be merged.
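
(For illustration, a minimal sketch of such a merged version,
mirroring the stock i386 __memcpy from the start of this thread
rather than the actual fix; the name is hypothetical:)

static inline void *__rep_memcpy(void *to, const void *from, size_t n)
{
        int d0, d1, d2;
        /* Everything happens inside one asm statement, so gcc has no
           window in which to reuse %ecx/%edi/%esi. */
        __asm__ __volatile__(
                "rep ; movsl"
                : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                : "0" (n / 4), "1" ((long) to), "2" ((long) from)
                : "memory");
        return to;
}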



2005-04-06 10:15:53

by Denis Vlasenko

Subject: Re: [BUG mm] "fixed" i386 memcpy inlining buggy

On Tuesday 05 April 2005 19:34, Christophe Saout wrote:
> Hi Denis,
>
> the new i386 memcpy macro is a ticking timebomb.
>
> I've been debugging a new mISDN crash, just to find out that a memcpy
> was not inlined correctly.
>
> Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).
>
> This source code:
>
> mISDN_pid_t pid;
> [...]
> memcpy(&st->mgr->pid, &pid, sizeof(mISDN_pid_t));
>
> was compiled as:
>
> lea 0xffffffa4(%ebp),%esi <---- %esi is loaded
> ( add $0x10,%ebx )
> ( mov %ebx,%eax ) something else
> ( call 1613 <test_stack_protocol+0x83> ) %esi preserved
> mov 0xffffffa0(%ebp),%edx
> mov 0x74(%edx),%edi <---- %edi is loaded
> add $0x20,%edi offset in structure added
> ! mov $0x14,%esi !!!!!! <---- %esi overwritten!
> mov %esi,%ecx <---- %ecx loaded
> repz movsl %ds:(%esi),%es:(%edi)
>
> Apparently the compiler decided that the value 0x14 could be reused
> afterwards (which it does for an inlined memset of the same size some
> instructions below) and clobbers %esi.
>
> Looking at the macro:
>
> __asm__ __volatile__(
> ""
> : "=&D" (edi), "=&S" (esi)
> : "0" ((long) to),"1" ((long) from)
> : "memory"
> );
> }
> if (n >= 5*4) {
> /* large block: use rep prefix */
> int ecx;
> __asm__ __volatile__(
> "rep ; movsl"
> : "=&c" (ecx)
>
> it seems obvious that the compiler assumes it can reuse %esi and %edi
> for something else between the two __asm__ sections. These should
> probably be merged.

Oh shit. I was trying to be too clever. I am still running with this
patch, so it must be happening very rarely.

Does this one compile ok?

static inline void *__constant_memcpy(void *to, const void *from, size_t n)
{
        long esi, edi;
#if 1   /* want to do small copies with non-string ops? */
        switch (n) {
        case 0: return to;
        case 1: *(char *)to = *(char *)from; return to;
        case 2: *(short *)to = *(short *)from; return to;
        case 4: *(int *)to = *(int *)from; return to;
#if 1   /* including those doable with two moves? */
        case 3: *(short *)to = *(short *)from;
                *((char *)to + 2) = *((char *)from + 2); return to;
        case 5: *(int *)to = *(int *)from;
                *((char *)to + 4) = *((char *)from + 4); return to;
        case 6: *(int *)to = *(int *)from;
                *((short *)to + 2) = *((short *)from + 2); return to;
        case 8: *(int *)to = *(int *)from;
                *((int *)to + 1) = *((int *)from + 1); return to;
#endif
        }
#else
        if (!n) return to;
#endif
        {
                /* load esi/edi */
                __asm__ __volatile__(
                        ""
                        : "=&D" (edi), "=&S" (esi)
                        : "0" ((long) to), "1" ((long) from)
                        : "memory");
        }
        if (n >= 5*4) {
                /* large block: use rep prefix */
                int ecx;
                __asm__ __volatile__(
                        "rep ; movsl"
                        : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
                        : "0" (n/4), "1" (edi), "2" (esi)
                        : "memory");
        } else {
                /* small block: don't clobber ecx + smaller code */
                if (n >= 4*4) __asm__ __volatile__("movsl" : "=&D"(edi), "=&S"(esi) : "0"(edi), "1"(esi) : "memory");
                if (n >= 3*4) __asm__ __volatile__("movsl" : "=&D"(edi), "=&S"(esi) : "0"(edi), "1"(esi) : "memory");
                if (n >= 2*4) __asm__ __volatile__("movsl" : "=&D"(edi), "=&S"(esi) : "0"(edi), "1"(esi) : "memory");
                if (n >= 1*4) __asm__ __volatile__("movsl" : "=&D"(edi), "=&S"(esi) : "0"(edi), "1"(esi) : "memory");
        }
        switch (n % 4) {
        /* tail */
        case 0: return to;
        case 1: __asm__ __volatile__("movsb" : "=&D"(edi), "=&S"(esi) : "0"(edi), "1"(esi) : "memory"); return to;
        case 2: __asm__ __volatile__("movsw" : "=&D"(edi), "=&S"(esi) : "0"(edi), "1"(esi) : "memory"); return to;
        default: __asm__ __volatile__("movsw\n\tmovsb" : "=&D"(edi), "=&S"(esi) : "0"(edi), "1"(esi) : "memory"); return to;
        }
}
--
vda

2005-04-06 11:06:06

by Dave Korn

Subject: RE: [BUG mm] "fixed" i386 memcpy inlining buggy

----Original Message----
>From: Denis Vlasenko
>Sent: 06 April 2005 11:14

Is this someone's idea of an April Fool's joke? Because if it is, I've
suffered a serious sense-of-humour failure.

> Oh shit. I was trying to be too clever. I am still running with this
> patch, so it must be happening very rarely.

The kernel is way too important for cross-your-fingers-and-hope
engineering techniques to be applied. This patch should never have been
permitted. How on earth could anything like this hope to make it through a
strict review?

> Does this one compile ok?

> {
> /* load esi/edi */
> __asm__ __volatile__(
> ""
> : "=&D" (edi), "=&S" (esi)
> : "0" ((long) to),"1" ((long) from)
> : "memory"
> );
> }
> if (n >= 5*4) {
> /* large block: use rep prefix */
> int ecx;
> __asm__ __volatile__(
> "rep ; movsl"
> : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
> : "0" (n/4), "1" (edi),"2" (esi)
> : "memory"
> );


It doesn't matter if it compiles or not, it's still *utterly* invalid.
You can NOT make assumptions about registers keeping their values
between one asm block and another. Immediately after the closing quote
of the first asm, the compiler can do ANYTHING IT WANTS, and to just
_hope_ that it won't use the registers you want is voodoo programming.
Even if it works when you try it once, there is zero guarantee that
another version of the compiler, or even a tiny source change that
affects the scheduler's behaviour when compiling the function, won't
produce something completely different; this code is appallingly
fragile. It should be completely discarded and rewritten properly.
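
(As a hedged illustration of the safe alternative, with a hypothetical
name: each asm statement's outputs feed the next one's inputs, so the
pointers travel through operands the register allocator must honour,
never through an assumed register state:)

static inline void *copy_two_words(void *to, const void *from)
{
        long esi = (long) from, edi = (long) to;

        __asm__ __volatile__("movsl"
                : "=&D" (edi), "=&S" (esi)
                : "0" (edi), "1" (esi)
                : "memory");
        /* gcc may emit anything here; edi/esi are ordinary C values now */
        __asm__ __volatile__("movsl"
                : "=&D" (edi), "=&S" (esi)
                : "0" (edi), "1" (esi)
                : "memory");
        return to;
}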


cheers,
DaveK
--
Can't think of a witty .sigline today....

2005-04-06 11:13:02

by Dave Korn

Subject: RE: [BUG mm] "fixed" i386 memcpy inlining buggy

----Original Message----
>From: Dave Korn
>Sent: 06 April 2005 12:06


Me and my big mouth.

OK, that one does work.

Sorry for the outburst.

cheers,
DaveK
--
Can't think of a witty .sigline today....

2005-04-06 12:02:34

by Dave Korn

Subject: RE: [BUG mm] "fixed" i386 memcpy inlining buggy

----Original Message----
>From: Dave Korn
>Sent: 06 April 2005 12:13

> ----Original Message----
>> From: Dave Korn
>> Sent: 06 April 2005 12:06
>
>
> Me and my big mouth.
>
> OK, that one does work.
>
> Sorry for the outburst.
>


.... well, actually, maybe it doesn't after all.


What's that uninitialised variable ecx doing there eh?


cheers,
DaveK
--
Can't think of a witty .sigline today....

2005-04-06 12:10:48

by Christophe Saout

Subject: Re: [BUG mm] "fixed" i386 memcpy inlining buggy

On Wednesday, 06.04.2005, at 13:14 +0300, Denis Vlasenko wrote:

> Oh shit. I was trying to be too clever. I am still running with this
> patch, so it must be happening very rarely.

Yes, that's right, it happened with code that's not in the mainline tree
but could have happened anywhere.

> Does this one compile ok?

Yes, the case that failed is now okay. I changed it slightly to assign
esi and edi directly at the top of the function; no asm section is
needed there. The compiler will make sure that they have the correct
values when needed.
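
(A hedged sketch of that change: the empty register-loading asm
section is simply replaced by plain C initializers,

        long esi = (long) from;
        long edi = (long) to;

and the "S"/"D" constraints on the later asm statements pick the
values up from there.)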

In the case above the compiler now uses %ebx to save the loop counter
instead of %esi.

In drivers/cdrom/cdrom.c I'm observing one very strange thing though.

It appears that the compiler decided to put the local variable edi on
the stack for some unexplicable reason (or possibly there is?). Since
the asm sections are memory barriers the compiler then saves the value
of %edi on the stack before entering the next assembler section.

1f1c: a5 movsl %ds:(%esi),%es:(%edi)
1f1d: 89 7d 84 mov %edi,0xffffff84(%ebp)
1f20: a5 movsl %ds:(%esi),%es:(%edi)
1f21: 89 7d 84 mov %edi,0xffffff84(%ebp)
1f24: 66 a5 movsw %ds:(%esi),%es:(%edi)

(this is a constant 10 byte memcpy)

The only thing that would avoid this is to either tell the compiler
never to put esi/edi in memory (which I think is not possible across
different versions of gcc) or to always generate a single asm section
for all the different cases.



2005-04-06 12:09:49

by Dave Korn

Subject: RE: [BUG mm] "fixed" i386 memcpy inlining buggy

----Original Message----
>From: Dave Korn
>Sent: 06 April 2005 12:53

> ----Original Message----
>> From: Dave Korn
>> Sent: 06 April 2005 12:13
>
>> ----Original Message----
>>> From: Dave Korn
>>> Sent: 06 April 2005 12:06
>>
>>
>> Me and my big mouth.
>>
>> OK, that one does work.
>>
>> Sorry for the outburst.
>>
>
>
> .... well, actually, maybe it doesn't after all.
>
>
> What's that uninitialised variable ecx doing there eh?

Oh, I see, it's there as an output so it can be matched as an input by the
"0" constraint.

Ok, guess it does.


cheers,
DaveK
--
Can't think of a witty .sigline today....

2005-04-06 12:37:06

by Andrew Haley

Subject: Re: [BUG mm] "fixed" i386 memcpy inlining buggy

I'm having a little difficulty understanding what this is for. Is it
that gcc's builtin memcpy expander generates bad code, or that older
versions of gcc generate bad code, or what? gcc generates too much
code?

Andrew.

2005-04-06 13:21:36

by linux-os (Dick Johnson)

Subject: RE: [BUG mm] "fixed" i386 memcpy inlining buggy


Attached is an inline ix86 memcpy() plus test code that exercises its
corner cases. The inline code makes no jumps; it uses longword copies,
word copies, and any spare byte copy. It works at all offsets and
doesn't require alignment, but is fastest when both source and
destination are longword aligned.

On Wed, 6 Apr 2005, Dave Korn wrote:

> ----Original Message----
>> From: Dave Korn
>> Sent: 06 April 2005 12:13
>
>> ----Original Message----
>>> From: Dave Korn
>>> Sent: 06 April 2005 12:06
>>
>>
>> Me and my big mouth.
>>
>> OK, that one does work.
>>
>> Sorry for the outburst.
>>
>
>
> .... well, actually, maybe it doesn't after all.
>
>
> What's that uninitialised variable ecx doing there eh?
>
>
> cheers,
> DaveK
> --
> Can't think of a witty .sigline today....
>

Cheers,
Dick Johnson
Penguin : Linux version 2.6.11 on an i686 machine (5537.79 BogoMips).
Notice : All mail here is now cached for review by Dictator Bush.
98.36% of all statistics are fiction.


Attachments:
memcpy.c (1.39 kB)

2005-04-06 14:16:27

by Denis Vlasenko

Subject: Re: [BUG mm] "fixed" i386 memcpy inlining buggy

On Wednesday 06 April 2005 16:18, Richard B. Johnson wrote:
>
> Attached is inline ix86 memcpy() plus test code that tests its
> corner-cases. The in-line code makes no jumps, but uses longword
> copies, word copies and any spare byte copy. It works at all
> offsets, doesn't require alignment but would work fastest if
> both source and destination were longword aligned.

Yours is:

"shr $1, %%ecx\n" \
"pushf\n" \
"shr $1, %%ecx\n" \
"pushf\n" \ <=== not needed
"rep\n" \
"movsl\n" \
"popf\n" \ <=== not needed
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsw\n" \
"popf\n" \
"adcl %%ecx, %%ecx\n" \
"rep\n" \
"movsb\n" \

You struggle too much for that movsw.
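
(For reference, a hedged sketch of the streamlined version; the
wrapper function and its name are hypothetical, since the attachment
is not reproduced here. Each shr moves one low bit of the count into
the carry flag, and since rep movs leaves the flags alone, a single
pushf/popf pair is enough:)

static inline void *memcpy_nojump(void *to, const void *from, unsigned int n)
{
        int d0, d1, d2;
        __asm__ __volatile__(
                "shrl $1, %%ecx\n\t"    /* CF = byte-remainder bit */
                "pushf\n\t"             /* save it */
                "shrl $1, %%ecx\n\t"    /* CF = word bit, ECX = n/4 */
                "rep ; movsl\n\t"       /* flags survive rep movs */
                "adcl %%ecx, %%ecx\n\t" /* ECX = 0 or 1 remaining word */
                "rep ; movsw\n\t"
                "popf\n\t"              /* CF = byte bit again */
                "adcl %%ecx, %%ecx\n\t" /* ECX = 0 or 1 remaining byte */
                "rep ; movsb"
                : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                : "0" (n), "1" ((long) to), "2" ((long) from)
                : "memory");
        return to;
}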

The -mm one (which happens to be mine) is:

        "movl %%ecx, %4\n\t"
        "shr $2, %%ecx\n\t"
        "rep ; movsl\n\t"
        "movl %4, %%ecx\n\t"
        "andl $3, %%ecx\n\t"
        "jz 1f\n\t"     /* pay 2 byte penalty for a chance to skip microcoded rep */
        "rep ; movsb\n\t"
        "1:"

and I can still drop that jz. It is there just for a chance to skip
rep movsb, which was measured to be slow enough to matter; rep movs
is a bit slow to start, and on small blocks that is measurable.

However, maybe it is even better without the jz. I need to benchmark
the 'cold path' (i.e. where the branch predictor has no data to
predict it) somehow.
--
vda

2005-04-06 16:13:18

by Denis Vlasenko

Subject: Re: [BUG mm] "fixed" i386 memcpy inlining buggy

On Tuesday 05 April 2005 19:34, Christophe Saout wrote:
> the new i386 memcpy macro is a ticking timebomb.
>
> I've been debugging a new mISDN crash, just to find out that a memcpy
> was not inlined correctly.
>
> Andrew, you should drop the fix-i386-memcpy.patch (or have it fixed).

Updated patch against 2.6.11 follows. This one, like the original
patch, is run-tested too.

This time I took no chances: esi/edi contents are explicitly
propagated from one asm() block to another. I didn't do it before,
not expecting that gcc could be soooo incredibly clever. Sorry.

Christophe, does this one look/compile ok?
--
vda


Attachments:
(No filename) (620.00 B)
string2.h.diff (3.21 kB)

2005-04-07 07:32:37

by Paolo Bonzini

Subject: Re: [BUG mm] "fixed" i386 memcpy inlining buggy


> The only thing that would avoid this is to either tell the compiler to
> never put esi/edi in memory (which I think is not possibly across
> different versions of gcc) or to always generate a single asm section
> for all the different cases.

Use variables declared __asm__ ("%esi") and __asm__ ("%edi"). It is
not guaranteed that they always live in those registers (you can still
have copy propagation, etcetera); but if your __asm__ statement
constraints match the register you specify, then you can be reasonably
sure that good code is produced.
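
(A minimal sketch of that suggestion, assuming GCC's local register
variable extension; the function name and size are hypothetical:)

static inline void *copy_12_bytes(void *to, const void *from)
{
        /* Pin the C variables to the registers the "D"/"S" constraints
           want, so the spills to the stack between asm sections go away. */
        register long edi __asm__("%edi") = (long) to;
        register long esi __asm__("%esi") = (long) from;

        __asm__ __volatile__("movsl"
                : "=&D" (edi), "=&S" (esi) : "0" (edi), "1" (esi) : "memory");
        __asm__ __volatile__("movsl"
                : "=&D" (edi), "=&S" (esi) : "0" (edi), "1" (esi) : "memory");
        __asm__ __volatile__("movsl"
                : "=&D" (edi), "=&S" (esi) : "0" (edi), "1" (esi) : "memory");
        return to;
}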

Paolo