2005-04-25 07:20:08

by Denis Vlasenko

Subject: [PATCH] i386: optimize swab64

I noticed that swab64 explicitly swaps the 32-bit halves, but this is
not really needed because the CPU is 32-bit anyway and we can
just tell GCC to treat the registers as being swapped.

Example of resulting code:

mov 0x20(%ecx,%edi,8),%eax
mov 0x24(%ecx,%edi,8),%edx
lea 0x1(%edi),%esi
mov %esi,0xfffffdf4(%ebp)
mov %eax,%ebx
mov %edx,%esi
bswap %ebx
bswap %esi
mov %esi,0xffffff74(%ebp,%edi,8)
mov %ebx,0xffffff78(%ebp,%edi,8)

As you can see, the swap is achieved simply by using the
appropriate registers in the last two insns.

(Why gcc does extra register moves just before the bswaps
is another question. There is no regression here; the old code had them too.)

Run-tested.
--
vda

diff -urpN linux-2.6.12-rc2.0.orig/include/asm-i386/byteorder.h linux-2.6.12-rc2.z.cur/include/asm-i386/byteorder.h
--- linux-2.6.12-rc2.0.orig/include/asm-i386/byteorder.h Tue Oct 19 00:54:36 2004
+++ linux-2.6.12-rc2.z.cur/include/asm-i386/byteorder.h Sun Apr 24 22:38:14 2005
@@ -25,6 +25,8 @@ static __inline__ __attribute_const__ __
return x;
}

+/* NB: swap of the 32-bit halves is achieved by asm constraints.
+** This will save an xchgl in many cases */
static __inline__ __attribute_const__ __u64 ___arch__swab64(__u64 val)
{
union {
@@ -33,13 +35,13 @@ static __inline__ __attribute_const__ __
} v;
v.u = val;
#ifdef CONFIG_X86_BSWAP
- asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
- : "=r" (v.s.a), "=r" (v.s.b)
+ asm("bswapl %0 ; bswapl %1"
+ : "=r" (v.s.b), "=r" (v.s.a)
: "0" (v.s.a), "1" (v.s.b));
#else
- v.s.a = ___arch__swab32(v.s.a);
+ v.s.a = ___arch__swab32(v.s.a);
v.s.b = ___arch__swab32(v.s.b);
- asm("xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b));
+ asm("" : "=r" (v.s.b), "=r" (v.s.a) : "0" (v.s.a), "1" (v.s.b));
#endif
return v.u;
}
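
A minimal stand-alone sketch of the same trick for userspace experimentation
(this assumes GCC inline asm and a 32-bit x86 target; swab64_constraints is
an illustrative name, not the kernel API):

#include <stdint.h>
#include <stdio.h>

/* 64-bit byte swap: bswap each 32-bit half and let the operand
 * constraints exchange the halves, so no xchgl is emitted. */
static inline uint64_t swab64_constraints(uint64_t val)
{
	union {
		struct { uint32_t a, b; } s;	/* a = low half, b = high half */
		uint64_t u;
	} v;

	v.u = val;
	asm("bswapl %0 ; bswapl %1"
	    : "=r" (v.s.b), "=r" (v.s.a)
	    : "0" (v.s.a), "1" (v.s.b));
	return v.u;
}

int main(void)
{
	uint64_t x = 0x0102030405060708ULL;

	/* Expected output on little-endian x86: 0807060504030201 */
	printf("%016llx\n", (unsigned long long)swab64_constraints(x));
	return 0;
}

Compile with gcc -m32 -O2 and check the printed value for a quick sanity
check of the constraint trick.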


2005-04-25 16:00:12

by Andi Kleen

Subject: Re: [PATCH] i386: optimize swab64

On Mon, Apr 25, 2005 at 10:19:30AM +0300, Denis Vlasenko wrote:
> I noticed that swab64 explicitly swaps the 32-bit halves, but this is
> not really needed because the CPU is 32-bit anyway and we can
> just tell GCC to treat the registers as being swapped.

No, we went through exactly this when the code was originally done.
gcc puts a long long only into aligned register pairs, and with the
register swap you need at least 4 registers, which blows away nearly
all available registers and completely breaks register
allocation in the function. Don't apply this!
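
For reference, a small stand-alone test case in the spirit of the wp512 loop
discussed in this thread (names invented for illustration; compile with
gcc -m32 -O2 -S on x86 to inspect the register allocation with either
variant of the asm):

#include <stdint.h>

/* Same constraint-swap byteswap as in the patch above. */
static inline uint64_t swab64(uint64_t x)
{
	union { struct { uint32_t a, b; } s; uint64_t u; } v;

	v.u = x;
	asm("bswapl %0 ; bswapl %1"
	    : "=r" (v.s.b), "=r" (v.s.a)
	    : "0" (v.s.a), "1" (v.s.b));
	return v.u;
}

/* Mimics the wp512-style inner loop: load 64-bit words, byteswap, store. */
void load_block(uint64_t *dst, const uint64_t *src)
{
	int i;

	for (i = 0; i < 8; i++)
		dst[i] = swab64(src[i]);
}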

-Andi

>
> Example of resulting code:
>
> mov 0x20(%ecx,%edi,8),%eax
> mov 0x24(%ecx,%edi,8),%edx
> lea 0x1(%edi),%esi
> mov %esi,0xfffffdf4(%ebp)
> mov %eax,%ebx
> mov %edx,%esi
> bswap %ebx
> bswap %esi
> mov %esi,0xffffff74(%ebp,%edi,8)
> mov %ebx,0xffffff78(%ebp,%edi,8)
>
> As you can see, the swap is achieved simply by using the
> appropriate registers in the last two insns.
>
> (Why gcc does extra register moves just before the bswaps
> is another question. There is no regression here; the old code had them too.)

2005-04-26 11:13:31

by Denis Vlasenko

Subject: Re: [PATCH] i386: optimize swab64

On Monday 25 April 2005 18:53, Andi Kleen wrote:
> On Mon, Apr 25, 2005 at 10:19:30AM +0300, Denis Vlasenko wrote:
> > I noticed that swab64 explicitly swaps the 32-bit halves, but this is
> > not really needed because the CPU is 32-bit anyway and we can
> > just tell GCC to treat the registers as being swapped.
>
> No, we went through this exactly when the code was originally done.
> gcc puts long long only into aligned register pairs, and with

I don't see this. However, gcc does indeed emit some unneeded moves,
both with and without the xchgl. I filed a bug report:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21202

> register swap you need at least 4 registers which blows near
> all possible registers away and completely breaks register
> allocation in the function. Dont apply this!

Andi, are you saying that the code gets worse with this patch?
That is not true, at least for gcc 3.4.3 and 4.0.0.
I have just re-checked this.

This is with original code:

# objdump -r -d crypto/wp512.o
00000000 <wp512_process_buffer>:
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 57 push %edi
4: 56 push %esi
5: 53 push %ebx
6: 81 ec 00 02 00 00 sub $0x200,%esp
c: 31 ff xor %edi,%edi
e: 8b 4d 08 mov 0x8(%ebp),%ecx
11: 8b 54 f9 24 mov 0x24(%ecx,%edi,8),%edx
15: 8b 44 f9 20 mov 0x20(%ecx,%edi,8),%eax
19: 8d 77 01 lea 0x1(%edi),%esi
1c: 89 b5 f4 fd ff ff mov %esi,0xfffffdf4(%ebp)
22: 89 c1 mov %eax,%ecx
24: 89 d6 mov %edx,%esi
26: 0f c9 bswap %ecx
28: 0f ce bswap %esi
2a: 87 ce xchg %ecx,%esi <=======
2c: 89 8c fd 74 ff ff ff mov %ecx,0xffffff74(%ebp,%edi,8)
33: 89 b4 fd 78 ff ff ff mov %esi,0xffffff78(%ebp,%edi,8)

Patched:

# objdump -r -d crypto_noswap/wp512.o
00000000 <wp512_process_buffer>:
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 57 push %edi
4: 56 push %esi
5: 53 push %ebx
6: 81 ec 00 02 00 00 sub $0x200,%esp
c: 31 ff xor %edi,%edi
e: 8b 4d 08 mov 0x8(%ebp),%ecx
11: 8b 44 f9 20 mov 0x20(%ecx,%edi,8),%eax
15: 8b 54 f9 24 mov 0x24(%ecx,%edi,8),%edx
19: 8d 77 01 lea 0x1(%edi),%esi
1c: 89 b5 f4 fd ff ff mov %esi,0xfffffdf4(%ebp)
22: 89 c3 mov %eax,%ebx
24: 89 d6 mov %edx,%esi
26: 0f cb bswap %ebx
28: 0f ce bswap %esi
<========= NO xchg
2a: 89 b4 fd 74 ff ff ff mov %esi,0xffffff74(%ebp,%edi,8)
31: 89 9c fd 78 ff ff ff mov %ebx,0xffffff78(%ebp,%edi,8)

It is a win not only for wp512; other crypto modules are a tiny bit smaller too:

# echo crypto*/*.o | xargs -n1 | grep -Fv .mod. | sort -t / -k2,99 | xargs size
text data bss dec hex filename
17743 108 0 17851 45bb crypto/khazad.o
17735 108 0 17843 45b3 crypto_noswap/khazad.o
666 108 0 774 306 crypto/sha1.o
664 108 0 772 304 crypto_noswap/sha1.o
5160 364 0 5524 1594 crypto/sha512.o
5156 364 0 5520 1590 crypto_noswap/sha512.o
10239 364 0 10603 296b crypto/tgr192.o
10233 364 0 10597 2965 crypto_noswap/tgr192.o
22774 364 0 23138 5a62 crypto/wp512.o
22770 364 0 23134 5a5e crypto_noswap/wp512.o
--
vda