This patch improves the performance of the AES x86-64 implementation. The
average gain is more than 6.3% and the maximum gain is more than 10.2%
on an Intel Core 2 CPU. The improvement comes from the following changes:
- Two additional temporary registers are used to hold part of the state,
  which reduces dependencies between instructions.
- The expanded key is loaded via two 64-bit loads instead of four 32-bit
  loads (a C sketch of this follows the list).
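For illustration, a minimal C sketch of the key-load change (the patch does
this inside the assembler macros; the helper names here are made up). On a
little-endian CPU with an 8-byte-aligned expanded key, one 64-bit load
covers two round-key words, so the number of key loads per round is halved:

#include <stdint.h>

/* Old style: the round key is XORed into the state with four 32-bit loads. */
static inline void add_round_key_32(uint32_t s[4], const uint32_t *rk)
{
        s[0] ^= rk[0];
        s[1] ^= rk[1];
        s[2] ^= rk[2];
        s[3] ^= rk[3];
}

/* New style: two 64-bit loads, each split into its 32-bit halves.
 * Assumes a little-endian layout (x86-64) and an 8-byte-aligned key,
 * which is what the _pad1 member below provides. */
static inline void add_round_key_64(uint32_t s[4], const uint64_t *rk)
{
        uint64_t k0 = rk[0];    /* round-key words 0 and 1 */
        uint64_t k1 = rk[1];    /* round-key words 2 and 3 */

        s[0] ^= (uint32_t)k0;
        s[1] ^= (uint32_t)(k0 >> 32);
        s[2] ^= (uint32_t)k1;
        s[3] ^= (uint32_t)(k1 >> 32);
}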
This patch is based on 2.6.25-rc8-mm1.
The attached files are the test data collected via: modprobe tcrypt mode=200
- dmesg_1_core-stockn: stock kernel data
- dmesg_1_core-op4n: patched kernel data
- percent.txt: (time_patched - time_stock) / time_stock * 100
Signed-off-by: Huang Ying <[email protected]>
---
arch/x86/crypto/aes-x86_64-asm_64.S | 101 ++++++++++++++++++++----------------
include/crypto/aes.h | 1
2 files changed, 58 insertions(+), 44 deletions(-)
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -46,70 +46,81 @@
#define R7 %rbp
#define R7E %ebp
#define R8 %r8
+#define R8E %r8d
#define R9 %r9
+#define R9E %r9d
#define R10 %r10
#define R11 %r11
+#define R12 %r12
+#define R12E %r12d
+#define R16 %rsp
#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
.global FUNC; \
.type FUNC,@function; \
.align 8; \
-FUNC: movq r1,r2; \
- movq r3,r4; \
- leaq BASE+KEY+48+4(r8),r9; \
- movq r10,r11; \
- movl (r7),r5 ## E; \
- movl 4(r7),r1 ## E; \
- movl 8(r7),r6 ## E; \
- movl 12(r7),r7 ## E; \
- movl BASE+0(r8),r10 ## E; \
- xorl -48(r9),r5 ## E; \
- xorl -44(r9),r1 ## E; \
- xorl -40(r9),r6 ## E; \
- xorl -36(r9),r7 ## E; \
- cmpl $24,r10 ## E; \
+FUNC: subq $24,r11; \
+ movl (r6),r4 ## E; \
+ leaq BASE+KEY+48+8(r7),r8; \
+ movq r1,(r11); \
+ movq r9,r10; \
+ movl 4(r6),r1 ## E; \
+ movq r2,8(r11); \
+ movl 8(r6),r5 ## E; \
+ movq r3,16(r11); \
+ movl 12(r6),r6 ## E; \
+ movl BASE+0(r7),r9 ## E; \
+ xorl -48(r8),r4 ## E; \
+ xorl -44(r8),r1 ## E; \
+ xorl -40(r8),r5 ## E; \
+ xorl -36(r8),r6 ## E; \
+ cmpl $24,r9 ## E; \
jb B128; \
- leaq 32(r9),r9; \
+ leaq 32(r8),r8; \
je B192; \
- leaq 32(r9),r9;
+ leaq 32(r8),r8;
#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
- movq r1,r2; \
- movq r3,r4; \
- movl r5 ## E,(r9); \
- movl r6 ## E,4(r9); \
- movl r7 ## E,8(r9); \
- movl r8 ## E,12(r9); \
+ movq (r9),r1; \
+ movl r4 ## E,(r8); \
+ movq 8(r9),r2; \
+ movl r5 ## E,4(r8); \
+ movq 16(r9),r3; \
+ movl r6 ## E,8(r8); \
+ addq $24,r9; \
+ movl r7 ## E,12(r8); \
ret;
-#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
+#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,ra,rb,rc,rd) \
movzbl r2 ## H,r5 ## E; \
movzbl r2 ## L,r6 ## E; \
+ movl r4 ## E,r8 ## E; \
+ shrl $16,r4 ## E; \
movl TAB+1024(,r5,4),r5 ## E;\
- movw r4 ## X,r2 ## X; \
movl TAB(,r6,4),r6 ## E; \
- roll $16,r2 ## E; \
- shrl $16,r4 ## E; \
movzbl r4 ## H,r7 ## E; \
movzbl r4 ## L,r4 ## E; \
- xorl OFFSET(r8),ra ## E; \
- xorl OFFSET+4(r8),rb ## E; \
+ movq OFFSET(r11),r10; \
+ shrl $16,r2 ## E; \
+ movl r3 ## E,r9 ## E; \
xorl TAB+3072(,r7,4),r5 ## E;\
xorl TAB+2048(,r4,4),r6 ## E;\
- movzbl r1 ## L,r7 ## E; \
movzbl r1 ## H,r4 ## E; \
- movl TAB+1024(,r4,4),r4 ## E;\
- movw r3 ## X,r1 ## X; \
- roll $16,r1 ## E; \
+ movzbl r1 ## L,r7 ## E; \
shrl $16,r3 ## E; \
+ movl TAB+1024(,r4,4),r4 ## E;\
xorl TAB(,r7,4),r5 ## E; \
+ shrl $16,r1 ## E; \
movzbl r3 ## H,r7 ## E; \
movzbl r3 ## L,r3 ## E; \
xorl TAB+3072(,r7,4),r4 ## E;\
xorl TAB+2048(,r3,4),r5 ## E;\
movzbl r1 ## H,r7 ## E; \
movzbl r1 ## L,r3 ## E; \
- shrl $16,r1 ## E; \
+ xorl r10 ## E,ra ## E; \
+ movl r9 ## E,r1 ## E; \
+ movq OFFSET+8(r11),r9; \
+ shrq $32,r10; \
xorl TAB+3072(,r7,4),r6 ## E;\
movl TAB+2048(,r3,4),r3 ## E;\
movzbl r1 ## H,r7 ## E; \
@@ -118,38 +129,40 @@ FUNC: movq r1,r2; \
xorl TAB(,r1,4),r3 ## E; \
movzbl r2 ## H,r1 ## E; \
movzbl r2 ## L,r7 ## E; \
- shrl $16,r2 ## E; \
+ xorl r9 ## E, rc ## E; \
+ movl r8 ## E,r2 ## E; \
+ shrq $32,r9; \
+ xorl r10 ## E,rb ## E; \
xorl TAB+3072(,r1,4),r3 ## E;\
xorl TAB+2048(,r7,4),r4 ## E;\
movzbl r2 ## H,r1 ## E; \
+ xorl r9 ## E, rd ## E; \
movzbl r2 ## L,r2 ## E; \
- xorl OFFSET+8(r8),rc ## E; \
- xorl OFFSET+12(r8),rd ## E; \
- xorl TAB+1024(,r1,4),r3 ## E;\
- xorl TAB(,r2,4),r4 ## E;
+ xorl TAB(,r2,4),r4 ## E; \
+ xorl TAB+1024(,r1,4),r3 ## E;
#define move_regs(r1,r2,r3,r4) \
movl r3 ## E,r1 ## E; \
movl r4 ## E,r2 ## E;
#define entry(FUNC,KEY,B128,B192) \
- prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
+ prologue(FUNC,KEY,B128,B192,R2,R7,R12,R1,R3,R4,R6,R10,R5,R11,R16)
-#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
+#define return epilogue(R2,R7,R12,R5,R6,R3,R4,R11,R16)
#define encrypt_round(TAB,OFFSET) \
- round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
+ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R8,R9,R12,R10,R5,R6,R3,R4) \
move_regs(R1,R2,R5,R6)
#define encrypt_final(TAB,OFFSET) \
- round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
+ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R8,R9,R12,R10,R5,R6,R3,R4)
#define decrypt_round(TAB,OFFSET) \
- round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
+ round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R8,R9,R12,R10,R5,R6,R3,R4) \
move_regs(R1,R2,R5,R6)
#define decrypt_final(TAB,OFFSET) \
- round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
+ round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R8,R9,R12,R10,R5,R6,R3,R4)
/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
--- a/include/crypto/aes.h
+++ b/include/crypto/aes.h
@@ -19,6 +19,7 @@
struct crypto_aes_ctx {
u32 key_length;
+ u32 _pad1;
u32 key_enc[AES_MAX_KEYLENGTH_U32];
u32 key_dec[AES_MAX_KEYLENGTH_U32];
};
* Huang, Ying | 2008-04-09 14:41:02 [+0800]:
>This patch increases the performance of AES x86-64 implementation. The
>average increment is more than 6.3% and the max increment is
>more than 10.2% on Intel CORE 2 CPU. The performance increment is
>gained via the following methods:
>
>- Two additional temporary registers are used to hold the subset of
> the state, so that the dependency between instructions is reduced.
>
>- The expanded key is loaded via 2 64bit load instead of 4 32-bit load.
>
>From your description I would assume that the performance can only
increase. However, on my
|model name : AMD Athlon(tm) 64 Processor 3200+
the opposite is the case [1], [2]. I don't know why, and I didn't mix up
patched & unpatched :). I checked this patch on
|model name : Intel(R) Core(TM)2 CPU T7200 @ 2.00GHz
and the performance really increases [3], [4].
[1] http://download.breakpoint.cc/aes_patch/patched.txt
[2] http://download.breakpoint.cc/aes_patch/unpatched.txt
[3] http://download.breakpoint.cc/aes_patch/perf_patched.txt
[4] http://download.breakpoint.cc/aes_patch/perf_originall.txt
>---
> arch/x86/crypto/aes-x86_64-asm_64.S | 101 ++++++++++++++++++++----------------
> include/crypto/aes.h | 1
> 2 files changed, 58 insertions(+), 44 deletions(-)
>
>--- a/include/crypto/aes.h
>+++ b/include/crypto/aes.h
>@@ -19,6 +19,7 @@
>
> struct crypto_aes_ctx {
> u32 key_length;
>+ u32 _pad1;
Why is this pad required? Do you want special alignment of the keys?
> u32 key_enc[AES_MAX_KEYLENGTH_U32];
> u32 key_dec[AES_MAX_KEYLENGTH_U32];
> };
>
Sebastian
On Wed, 2008-04-16 at 09:31 +0200, Sebastian Siewior wrote:
> * Huang, Ying | 2008-04-09 14:41:02 [+0800]:
>
> >This patch increases the performance of AES x86-64 implementation. The
> >average increment is more than 6.3% and the max increment is
> >more than 10.2% on Intel CORE 2 CPU. The performance increment is
> >gained via the following methods:
> >
> >- Two additional temporary registers are used to hold the subset of
> > the state, so that the dependency between instructions is reduced.
> >
> >- The expanded key is loaded via 2 64bit load instead of 4 32-bit load.
> >
>
> From your description I would assume that the performance can only
> increase. However, on my
> |model name : AMD Athlon(tm) 64 Processor 3200+
> the opposite is the case [1], [2]. I dunno why and I didn't mixup
> patched & unpached :). I checked this patch on
Hmm, I have no AMD machine, so I have not tested the patch on one. Maybe
there are some pipeline or load/store unit differences between Intel and
AMD CPUs. Tomorrow I can split the patch into a set of small patches,
one patch per small step. Can you help me test these patches
to find out the reason for the degradation on the AMD CPU?
> |model name : Intel(R) Core(TM)2 CPU T7200 @ 2.00GHz
> and the performance really increases [3], [4].
>
> [1] http://download.breakpoint.cc/aes_patch/patched.txt
> [2] http://download.breakpoint.cc/aes_patch/unpatched.txt
> [3] http://download.breakpoint.cc/aes_patch/perf_patched.txt
> [4] http://download.breakpoint.cc/aes_patch/perf_originall.txt
>
> >---
> > arch/x86/crypto/aes-x86_64-asm_64.S | 101 ++++++++++++++++++++----------------
> > include/crypto/aes.h | 1
> > 2 files changed, 58 insertions(+), 44 deletions(-)
> >
> >--- a/include/crypto/aes.h
> >+++ b/include/crypto/aes.h
> >@@ -19,6 +19,7 @@
> >
> > struct crypto_aes_ctx {
> > u32 key_length;
> >+ u32 _pad1;
>
> Why is this pad required? Do you want special alignment of the keys?
Because the key is loaded in 64-bit chunks in this patch, I want the key to
be aligned on a 64-bit boundary.
> > u32 key_enc[AES_MAX_KEYLENGTH_U32];
> > u32 key_dec[AES_MAX_KEYLENGTH_U32];
> > };
> >
Best Regards,
Huang Ying
"Huang, Ying" <[email protected]> writes:
>
> En. I have no AMD machine. So I have not tested the patch on it. Maybe
> there are some pipeline or load/store unit difference between Intel and
> AMD CPUs. Tomorrow I can split the patch into a set of small patches,
> with one patch for one small step. Can you help me to test these patches
> to find out the reason for degradation on AMD CPU.
It would also be quite possible to use two different implementations,
one for AMD and another for Intel. The crypto framework should have no
problem dealing with that.
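Roughly something like this in the glue module (just a sketch, the symbol
names are made up; only boot_cpu_data/X86_VENDOR_AMD, crypto_register_alg()
and crypto_unregister_alg() are existing interfaces):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/crypto.h>
#include <asm/processor.h>

/* Assumed to be the crypto_alg already registered by the x86-64 glue code. */
extern struct crypto_alg aes_alg;

static int __init aes_x86_64_init(void)
{
        /* Hypothetical policy: leave AMD CPUs, where the reordered code
         * measured slower, to the generic C implementation. */
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                return -ENODEV;

        return crypto_register_alg(&aes_alg);
}

static void __exit aes_x86_64_exit(void)
{
        crypto_unregister_alg(&aes_alg);
}

module_init(aes_x86_64_init);
module_exit(aes_x86_64_exit);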
-Andi
On Wed, Apr 16, 2008 at 10:23:04AM +0200, Andi Kleen wrote:
>
> It would be also quite possible to use two different implementations,
> one for AMD another for Intel. crypto frame work should have no
> problems dealing with that.
Yes that would definitely be an option.
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Cut Alexander Kjeldaas <[email protected]> from CC because his mails bounce.
* Huang, Ying | 2008-04-16 16:19:09 [+0800]:
>Can you help me to test these patches
>to find out the reason for degradation on AMD CPU.
Sure.
>> >--- a/include/crypto/aes.h
>> >+++ b/include/crypto/aes.h
>> >@@ -19,6 +19,7 @@
>> >
>> > struct crypto_aes_ctx {
>> > u32 key_length;
>> >+ u32 _pad1;
>>
>> Why is this pad required? Do you want special alignment of the keys?
>
>Because the key is loaded in 64bit in this patch, I want to align the
>key with 64bit address.
Then this won't work all the time. To make it bulletproof:
- set .cra_alignmask in the glue code properly
- use the aligned attribute
- retrieve your private struct via crypto_tfm_ctx_aligned()
You might want to take a look at padlock-aes.c. The same thing is done
there, but instead of crypto_tfm_ctx_aligned() a private function is
used (to let the compiler optimize most of the code away). Depending on
Herbert's mood, you might get away with this as well (which would
probably be the case, since you might prefer to do it in asm) :)
>> > u32 key_enc[AES_MAX_KEYLENGTH_U32];
>> > u32 key_dec[AES_MAX_KEYLENGTH_U32];
>> > };
>> >
>
>Best Regards,
>Huang Ying
>
Sebastian
On Wed, 2008-04-16 at 20:40 +0200, Sebastian Siewior wrote:
[...]
> >> >--- a/include/crypto/aes.h
> >> >+++ b/include/crypto/aes.h
> >> >@@ -19,6 +19,7 @@
> >> >
> >> > struct crypto_aes_ctx {
> >> > u32 key_length;
> >> >+ u32 _pad1;
> >>
> >> Why is this pad required? Do you want special alignment of the keys?
> >
> >Because the key is loaded in 64bit in this patch, I want to align the
> >key with 64bit address.
>
> Than this won't work all the time. To make it bulletproof
> - set .cra_alignmask in the glue code properly
> - use the attribute aligned thing
> - retrieve your private struct via crypto_tfm_ctx_aligned()
As far as I know, CRYPTO_MINALIGN is defined in include/linux/crypto.h
as __alignof__(unsigned long long), and __crt_ctx in crypto_tfm is
aligned to CRYPTO_MINALIGN. So I think adding a pad is sufficient for
the x86_64 implementation.
Best Regards,
Huang Ying
Hi, Sebastian,
The attached files are the split-up patches, step 1 through step 7. Thank
you very much for your help.
Best Regards,
Huang Ying
On Wed, 2008-04-16 at 20:40 +0200, Sebastian Siewior wrote:
> cut Alexander Kjeldaas <[email protected]> from CC coz his mails bounce.
>
> * Huang, Ying | 2008-04-16 16:19:09 [+0800]:
>
> >Can you help me to test these patches
> >to find out the reason for degradation on AMD CPU.
> Sure.
>
> >> >--- a/include/crypto/aes.h
> >> >+++ b/include/crypto/aes.h
> >> >@@ -19,6 +19,7 @@
> >> >
> >> > struct crypto_aes_ctx {
> >> > u32 key_length;
> >> >+ u32 _pad1;
> >>
> >> Why is this pad required? Do you want special alignment of the keys?
> >
> >Because the key is loaded in 64bit in this patch, I want to align the
> >key with 64bit address.
>
> Than this won't work all the time. To make it bulletproof
> - set .cra_alignmask in the glue code properly
> - use the attribute aligned thing
> - retrieve your private struct via crypto_tfm_ctx_aligned()
>
> You might want to take a look on padlock-aes.c. The same thing is done
> there but instead of crypto_tfm_ctx_aligned() a private function is
> used (to let the compiler optimize most of the code away). Depending on
> Herbert's mood you might get away with this as well (what would be
> probably the case since you might prefer to do it asm) :)
>
> >> > u32 key_enc[AES_MAX_KEYLENGTH_U32];
> >> > u32 key_dec[AES_MAX_KEYLENGTH_U32];
> >> > };
> >> >
> >
> >Best Regards,
> >Huang Ying
> >
>
> Sebastian
On Thu, Apr 17, 2008 at 09:52:03AM +0800, Huang, Ying wrote:
>
> As far as I know, the CRYPTO_MINALIGN is defined in
> include/linux/crypto.h as __alignof__(unsigned long long), and the
> __crt_ctx in crypto_tfm is aligned in CRYPTO_MINALIGN. So I think adding
> a pad is sufficient for x86_64 implementation.
It should be sufficient but it would be better to use an align
attribute to better document the intention.
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Thu, 2008-04-17 at 11:34 +0800, Herbert Xu wrote:
> On Thu, Apr 17, 2008 at 09:52:03AM +0800, Huang, Ying wrote:
> >
> > As far as I know, the CRYPTO_MINALIGN is defined in
> > include/linux/crypto.h as __alignof__(unsigned long long), and the
> > __crt_ctx in crypto_tfm is aligned in CRYPTO_MINALIGN. So I think adding
> > a pad is sufficient for x86_64 implementation.
>
> It should be sufficient but it would be better to use an align
> attribute to better document the intention.
OK. I will use the align attribute.
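For concreteness, one possible shape of that change (just a sketch on my
side, not the updated patch): drop the pad and state the alignment
explicitly on the key,

struct crypto_aes_ctx {
        u32 key_length;
        /* Force 8-byte alignment of the expanded keys so the x86-64
         * assembler code can use 64-bit loads; replaces _pad1. */
        u32 key_enc[AES_MAX_KEYLENGTH_U32] __attribute__((__aligned__(8)));
        u32 key_dec[AES_MAX_KEYLENGTH_U32];
};

key_dec then starts on an 8-byte boundary as well, since key_enc is 240
bytes long.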
Best Regards,
Huang Ying
* Herbert Xu | 2008-04-17 11:34:02 [+0800]:
>On Thu, Apr 17, 2008 at 09:52:03AM +0800, Huang, Ying wrote:
>>
>> As far as I know, the CRYPTO_MINALIGN is defined in
>> include/linux/crypto.h as __alignof__(unsigned long long), and the
>> __crt_ctx in crypto_tfm is aligned in CRYPTO_MINALIGN. So I think adding
>> a pad is sufficient for x86_64 implementation.
>
>It should be sufficient but it would be better to use an align
Doesn't this imply that kmalloc() returns memory that is always pointer
aligned what isn't the case AFAIK?
>Thanks,
Sebastian
* Huang, Ying | 2008-04-17 11:36:43 [+0800]:
>Hi, Sebastian,
Hi Huang,
>The files attached is the separated patches, from step1 to step 7. Thank
>you very much for your help.
I've run the following script:
|#!/bin/bash
|check_error()
|{
| r=$?
| if [ ! $r -eq 0 ]
| then
| exit 1
| fi
|}
|
|modprobe tcrypt mode=200
|modprobe tcrypt mode=200
|dmesg -c > step-0.txt
|
|for ((i=1; i<=7; i++))
|do
| quilt push step${i}.patch
| check_error
|
| make
| check_error
|
| rmmod aes_x86_64
| check_error
|
| insmod arch/x86/crypto/aes-x86_64.ko
| check_error
|
| modprobe tcrypt mode=200
| modprobe tcrypt mode=200
| dmesg -c > step-${i}.txt
|done
and the result is attached.
>Best Regards,
>Huang Ying
Sebastian
On Thu, Apr 24, 2008 at 12:28:43AM +0200, Sebastian Siewior wrote:
> >> __crt_ctx in crypto_tfm is aligned in CRYPTO_MINALIGN. So I think adding
> >> a pad is sufficient for x86_64 implementation.
> >
> >It should be sufficient but it would be better to use an align
> Doesn't this imply that kmalloc() returns memory that is always pointer
> aligned what isn't the case AFAIK?
Parse error :)
kmalloc returns memory that should always be aligned to
CRYPTO_MINALIGN and in particular it's always pointer aligned.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Hi, Sebastian,
Thank you very much for your help. From the results you sent, the biggest
performance degradation is between step 4 and step 5. In that step, one
more register is saved before and restored after encryption/decryption.
So I think the reason may be the CPU's read/write port throughput.
I changed the patches to group the reads and writes together instead of
interleaving them. Can you help me test these new patches? The new patches
are attached to this mail.
Best Regards,
Huang Ying
On Thu, 2008-04-24 at 00:32 +0200, Sebastian Siewior wrote:
> * Huang, Ying | 2008-04-17 11:36:43 [+0800]:
>
> >Hi, Sebastian,
> Hi Huang,
>
> >The files attached is the separated patches, from step1 to step 7. Thank
> >you very much for your help.
> I've run the following script:
>
> |#!/bin/bash
> |check_error()
> |{
> | r=$?
> | if [ ! $r -eq 0 ]
> | then
> | exit 1
> | fi
> |}
> |
> |modprobe tcrypt mode=200
> |modprobe tcrypt mode=200
> |dmesg -c > step-0.txt
> |
> |for ((i=1; i<=7; i++))
> |do
> | quilt push step${i}.patch
> | check_error
> |
> | make
> | check_error
> |
> | rmmod aes_x86_64
> | check_error
> |
> | insmod arch/x86/crypto/aes-x86_64.ko
> | check_error
> |
> | modprobe tcrypt mode=200
> | modprobe tcrypt mode=200
> | dmesg -c > step-${i}.txt
> |done
>
> and the result is attached.
>
> >Best Regards,
> >Huang Ying
>
> Sebastian
* Huang, Ying | 2008-04-25 11:11:17 [+0800]:
>Hi, Sebastian,
Hi Huang,
>So I think the reason maybe the read/write port throughput of CPU.
Ah, so it is just a local problem, you say? I may get my fingers on
another AMD box....
>I changed the patches to group the read or write together instead of
>interleaving. Can you help me to test these new patches? The new patches
>is attached with the mail.
Sure.
>Best Regards,
>Huang Ying
Sebastian
Hi, Sebastian,
On Fri, 2008-04-25 at 09:12 +0200, Sebastian Siewior wrote:
> * Huang, Ying | 2008-04-25 11:11:17 [+0800]:
>
> >Hi, Sebastian,
> Hi Huang,
>
> >So I think the reason maybe the read/write port throughput of CPU.
> Ah so it is just a local problem you say? I may get my fingers on
> another amd box....
I mean a difference in read/write port design between CPU
micro-architectures. It is not a local problem. Sorry for my English.
> >I changed the patches to group the read or write together instead of
> >interleaving. Can you help me to test these new patches? The new patches
> >is attached with the mail.
> Sure.
Thank you very much.
Best Regards,
Huang Ying
* Huang, Ying | 2008-04-25 15:21:27 [+0800]:
>Hi, Sebastian,
Hi Huang,
>> >So I think the reason maybe the read/write port throughput of CPU.
>> Ah so it is just a local problem you say? I may get my fingers on
>> another amd box....
>
>I mean the read/write port design difference between CPU
>micro-architecture. It is not a local problem. Sorry for my English.
No, that is more or less what I meant. It is possible that AMD improved this
in a later CPU generation. The AMD box I have is a pretty old one in
comparison to Intel's dual core. It is also possible that they are
aiming at a different target and nothing has changed :)
>Best Regards,
>Huang Ying
Sebastian
* Huang, Ying | 2008-04-25 11:11:17 [+0800]:
>Hi, Sebastian,
Hi Huang,
sorry for the delay.
>I changed the patches to group the read or write together instead of
>interleaving. Can you help me to test these new patches? The new patches
>is attached with the mail.
The new results are attached.
>
>Best Regards,
>Huang Ying
Sebastian
one of the more important details in evaluating these changes would be the
family/model/stepping of the processors being microbenchmarked... could
you folks include /proc/cpuinfo with the results?
also -- please drop the #define for R16 to %rsp ... it obfuscates more
than it helps anything.
thanks
-dean
On Wed, 30 Apr 2008, Sebastian Siewior wrote:
> * Huang, Ying | 2008-04-25 11:11:17 [+0800]:
>
> >Hi, Sebastian,
> Hi Huang,
>
> sorry for the delay.
>
> >I changed the patches to group the read or write together instead of
> >interleaving. Can you help me to test these new patches? The new patches
> >is attached with the mail.
> The new results are attached.
>
> >
> >Best Regards,
> >Huang Ying
>
> Sebastian
>
Hi,
On Sat, 2008-05-03 at 23:25 -0700, dean gaudet wrote:
> one of the more important details in evaluating these changes would be the
> family/model/stepping of the processors being microbenchmarked... could
> you folks include /proc/cpuinfo with the results?
The attached file is the /proc/cpuinfo of my test machine.
Best Regards,
Huang Ying
> also -- please drop the #define for R16 to %rsp ... it obfuscates more
> than it helps anything.
>
> thanks
> -dean
>
> On Wed, 30 Apr 2008, Sebastian Siewior wrote:
>
> > * Huang, Ying | 2008-04-25 11:11:17 [+0800]:
> >
> > >Hi, Sebastian,
> > Hi Huang,
> >
> > sorry for the delay.
> >
> > >I changed the patches to group the read or write together instead of
> > >interleaving. Can you help me to test these new patches? The new patches
> > >is attached with the mail.
> > The new results are attached.
> >
> > >
> > >Best Regards,
> > >Huang Ying
> >
> > Sebastian
> >
Hi, Sebastian,
On Wed, 2008-04-30 at 00:12 +0200, Sebastian Siewior wrote:
> * Huang, Ying | 2008-04-25 11:11:17 [+0800]:
>
> >Hi, Sebastian,
> Hi Huang,
>
> sorry for the delay.
>
> >I changed the patches to group the read or write together instead of
> >interleaving. Can you help me to test these new patches? The new patches
> >is attached with the mail.
> The new results are attached.
It seems that the performance degradation between step 4 and step 5 has
decreased, but the overall performance degradation between step 0 and
step 7 is still about 5%.
I also tested the patches on Pentium 4 CPUs, and the performance decreased
there too. So I think this optimization is CPU micro-architecture dependent.
While the dependencies between instructions are reduced, more registers
(at most 3) are saved/restored before/after encryption/decryption. If
the CPU has no spare execution units for the newly independent instructions
but more registers are saved/restored, the performance will decrease.
Maybe we should select different implementations based on the
micro-architecture.
Best Regards,
Huang Ying