2010-11-03 21:14:45

by Mathias Krause

Subject: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.

To illustrate the performance gain, here's a short summary of the tcrypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:

x86: i586 aes-ni delta
256 bit, 8kB blocks, ECB: 125.94 MB/s 187.09 MB/s +48.6%
256 bit, 8kB blocks, CBC: 118.04 MB/s 171.61 MB/s +45.4%
320 bit, 8kB blocks, LRW: 128.20 MB/s 168.32 MB/s +31.3%
512 bit, 8kB blocks, XTS: 118.35 MB/s 166.49 MB/s +40.7%

Additionally, due to some minor optimizations, the 64-bit version also
got a performance gain of up to 20%, as seen below:

x86-64: old impl. new impl. delta
256 bit, 8kB blocks, ECB: 152.25 MB/s 183.84 MB/s +20.7%
256 bit, 8kB blocks, CBC: 144.12 MB/s 170.03 MB/s +18.0%
320 bit, 8kB blocks, LRW: 159.13 MB/s 169.69 MB/s +6.6%
512 bit, 8kB blocks, XTS: 144.27 MB/s 172.14 MB/s +19.3%

Signed-off-by: Mathias Krause <[email protected]>
---
v3 changes:
* fixed 32-bit implementation of aesni_ecb_enc (a hunk somehow moved to the end
of another function)

Sorry for the noise. I should have reviewed the patch more carefully :/

v2 changes:
* hide almost all register names in macros so the same code base can be shared
between x86 and x86_64
* unified Kconfig documentation again
* added alignment constraints for internal functions.
---
arch/x86/crypto/aesni-intel_asm.S | 149 ++++++++++++++++++++++++++++-------
arch/x86/crypto/aesni-intel_glue.c | 22 ++++-
crypto/Kconfig | 8 +-
3 files changed, 141 insertions(+), 38 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756..48d6f7c 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,9 @@
* Vinodh Gopal <[email protected]>
* Kahraman Akdemir
*
+ * Ported x86_64 version to x86:
+ * Author: Mathias Krause <[email protected]>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -32,12 +35,16 @@
#define IN IN1
#define KEY %xmm2
#define IV %xmm3
+
#define BSWAP_MASK %xmm10
#define CTR %xmm11
#define INC %xmm12

+#ifdef __x86_64__
+#define AREG %rax
#define KEYP %rdi
#define OUTP %rsi
+#define UKEYP OUTP
#define INP %rdx
#define LEN %rcx
#define IVP %r8
@@ -46,6 +53,18 @@
#define TKEYP T1
#define T2 %r11
#define TCTR_LOW T2
+#else
+#define AREG %eax
+#define KEYP %edi
+#define OUTP AREG
+#define UKEYP OUTP
+#define INP %edx
+#define LEN %esi
+#define IVP %ebp
+#define KLEN %ebx
+#define T1 %ecx
+#define TKEYP T1
+#endif

_key_expansion_128:
_key_expansion_256a:
@@ -55,10 +74,11 @@ _key_expansion_256a:
shufps $0b10001100, %xmm0, %xmm4
pxor %xmm4, %xmm0
pxor %xmm1, %xmm0
- movaps %xmm0, (%rcx)
- add $0x10, %rcx
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
ret

+.align 4
_key_expansion_192a:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +96,13 @@ _key_expansion_192a:

movaps %xmm0, %xmm1
shufps $0b01000100, %xmm0, %xmm6
- movaps %xmm6, (%rcx)
+ movaps %xmm6, (TKEYP)
shufps $0b01001110, %xmm2, %xmm1
- movaps %xmm1, 16(%rcx)
- add $0x20, %rcx
+ movaps %xmm1, 0x10(TKEYP)
+ add $0x20, TKEYP
ret

+.align 4
_key_expansion_192b:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +117,11 @@ _key_expansion_192b:
pxor %xmm3, %xmm2
pxor %xmm5, %xmm2

- movaps %xmm0, (%rcx)
- add $0x10, %rcx
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
ret

+.align 4
_key_expansion_256b:
pshufd $0b10101010, %xmm1, %xmm1
shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +129,8 @@ _key_expansion_256b:
shufps $0b10001100, %xmm2, %xmm4
pxor %xmm4, %xmm2
pxor %xmm1, %xmm2
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
ret

/*
@@ -116,17 +138,23 @@ _key_expansion_256b:
* unsigned int key_len)
*/
ENTRY(aesni_set_key)
- movups (%rsi), %xmm0 # user key (first 16 bytes)
- movaps %xmm0, (%rdi)
- lea 0x10(%rdi), %rcx # key addr
- movl %edx, 480(%rdi)
+#ifndef __x86_64__
+ pushl KEYP
+ movl 8(%esp), KEYP # ctx
+ movl 12(%esp), UKEYP # in_key
+ movl 16(%esp), %edx # key_len
+#endif
+ movups (UKEYP), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (KEYP)
+ lea 0x10(KEYP), TKEYP # key addr
+ movl %edx, 480(KEYP)
pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
cmp $24, %dl
jb .Lenc_key128
je .Lenc_key192
- movups 0x10(%rsi), %xmm2 # other user key
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
+ movups 0x10(UKEYP), %xmm2 # other user key
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_256a
AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +183,7 @@ ENTRY(aesni_set_key)
call _key_expansion_256a
jmp .Ldec_key
.Lenc_key192:
- movq 0x10(%rsi), %xmm2 # other user key
+ movq 0x10(UKEYP), %xmm2 # other user key
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_192a
AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
@@ -195,33 +223,47 @@ ENTRY(aesni_set_key)
AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
call _key_expansion_128
.Ldec_key:
- sub $0x10, %rcx
- movaps (%rdi), %xmm0
- movaps (%rcx), %xmm1
- movaps %xmm0, 240(%rcx)
- movaps %xmm1, 240(%rdi)
- add $0x10, %rdi
- lea 240-16(%rcx), %rsi
+ sub $0x10, TKEYP
+ movaps (KEYP), %xmm0
+ movaps (TKEYP), %xmm1
+ movaps %xmm0, 240(TKEYP)
+ movaps %xmm1, 240(KEYP)
+ add $0x10, KEYP
+ lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
- movaps (%rdi), %xmm0
+ movaps (KEYP), %xmm0
AESIMC %xmm0 %xmm1
- movaps %xmm1, (%rsi)
- add $0x10, %rdi
- sub $0x10, %rsi
- cmp %rcx, %rdi
+ movaps %xmm1, (UKEYP)
+ add $0x10, KEYP
+ sub $0x10, UKEYP
+ cmp TKEYP, KEYP
jb .Ldec_key_loop
- xor %rax, %rax
+ xor AREG, AREG
+#ifndef __x86_64__
+ popl KEYP
+#endif
ret

/*
* void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_enc)
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl 12(%esp), KEYP
+ movl 16(%esp), OUTP
+ movl 20(%esp), INP
+#endif
movl 480(KEYP), KLEN # key length
movups (INP), STATE # input
call _aesni_enc1
movups STATE, (OUTP) # output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
ret

/*
@@ -236,6 +278,7 @@ ENTRY(aesni_enc)
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_enc1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -298,6 +341,7 @@ _aesni_enc1:
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_enc4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -391,11 +435,22 @@ _aesni_enc4:
* void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_dec)
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl 12(%esp), KEYP
+ movl 16(%esp), OUTP
+ movl 20(%esp), INP
+#endif
mov 480(KEYP), KLEN # key length
add $240, KEYP
movups (INP), STATE # input
call _aesni_dec1
movups STATE, (OUTP) #output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
ret

/*
@@ -410,6 +465,7 @@ ENTRY(aesni_dec)
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_dec1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -472,6 +528,7 @@ _aesni_dec1:
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_dec4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -566,6 +623,15 @@ _aesni_dec4:
* size_t len)
*/
ENTRY(aesni_ecb_enc)
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 16(%esp), KEYP
+ movl 20(%esp), OUTP
+ movl 24(%esp), INP
+ movl 28(%esp), LEN
+#endif
test LEN, LEN # check length
jz .Lecb_enc_ret
mov 480(KEYP), KLEN
@@ -602,6 +668,11 @@ ENTRY(aesni_ecb_enc)
cmp $16, LEN
jge .Lecb_enc_loop1
.Lecb_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
ret

/*
@@ -609,6 +680,15 @@ ENTRY(aesni_ecb_enc)
* size_t len);
*/
ENTRY(aesni_ecb_dec)
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 16(%esp), KEYP
+ movl 20(%esp), OUTP
+ movl 24(%esp), INP
+ movl 28(%esp), LEN
+#endif
test LEN, LEN
jz .Lecb_dec_ret
mov 480(KEYP), KLEN
@@ -646,8 +726,14 @@ ENTRY(aesni_ecb_dec)
cmp $16, LEN
jge .Lecb_dec_loop1
.Lecb_dec_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
ret

+#ifdef __x86_64__
/*
* void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
* size_t len, u8 *iv)
@@ -744,6 +830,7 @@ ENTRY(aesni_cbc_dec)
* INC: == 1, in little endian
* BSWAP_MASK == endian swapping mask
*/
+.align 4
_aesni_inc_init:
movaps .Lbswap_mask, BSWAP_MASK
movaps IV, CTR
@@ -768,6 +855,7 @@ _aesni_inc_init:
* CTR: == output IV, in little endian
* TCTR_LOW: == lower qword of CTR
*/
+.align 4
_aesni_inc:
paddq INC, CTR
add $1, TCTR_LOW
@@ -839,3 +927,4 @@ ENTRY(aesni_ctr_enc)
movups IV, (IVP)
.Lctr_enc_just_ret:
ret
+#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..d0f0e7b 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -55,12 +55,14 @@ asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len);
asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len);
+#ifdef CONFIG_X86_64
asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+#endif

static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
{
@@ -254,6 +256,7 @@ static struct crypto_alg blk_ecb_alg = {
},
};

+#ifdef CONFIG_X86_64
static int cbc_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
@@ -389,6 +392,7 @@ static struct crypto_alg blk_ctr_alg = {
},
},
};
+#endif

static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
unsigned int key_len)
@@ -500,6 +504,7 @@ static struct crypto_alg ablk_ecb_alg = {
},
};

+#ifdef CONFIG_X86_64
static int ablk_cbc_init(struct crypto_tfm *tfm)
{
struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +617,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
},
};
#endif
+#endif

#ifdef HAS_LRW
static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -744,12 +750,13 @@ static int __init aesni_init(void)
goto __aes_err;
if ((err = crypto_register_alg(&blk_ecb_alg)))
goto blk_ecb_err;
+ if ((err = crypto_register_alg(&ablk_ecb_alg)))
+ goto ablk_ecb_err;
+#ifdef CONFIG_X86_64
if ((err = crypto_register_alg(&blk_cbc_alg)))
goto blk_cbc_err;
if ((err = crypto_register_alg(&blk_ctr_alg)))
goto blk_ctr_err;
- if ((err = crypto_register_alg(&ablk_ecb_alg)))
- goto ablk_ecb_err;
if ((err = crypto_register_alg(&ablk_cbc_alg)))
goto ablk_cbc_err;
if ((err = crypto_register_alg(&ablk_ctr_alg)))
@@ -758,6 +765,7 @@ static int __init aesni_init(void)
if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
goto ablk_rfc3686_ctr_err;
#endif
+#endif
#ifdef HAS_LRW
if ((err = crypto_register_alg(&ablk_lrw_alg)))
goto ablk_lrw_err;
@@ -784,6 +792,7 @@ ablk_pcbc_err:
crypto_unregister_alg(&ablk_lrw_alg);
ablk_lrw_err:
#endif
+#ifdef CONFIG_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
ablk_rfc3686_ctr_err:
@@ -792,12 +801,13 @@ ablk_rfc3686_ctr_err:
ablk_ctr_err:
crypto_unregister_alg(&ablk_cbc_alg);
ablk_cbc_err:
- crypto_unregister_alg(&ablk_ecb_alg);
-ablk_ecb_err:
crypto_unregister_alg(&blk_ctr_alg);
blk_ctr_err:
crypto_unregister_alg(&blk_cbc_alg);
blk_cbc_err:
+#endif
+ crypto_unregister_alg(&ablk_ecb_alg);
+ablk_ecb_err:
crypto_unregister_alg(&blk_ecb_alg);
blk_ecb_err:
crypto_unregister_alg(&__aesni_alg);
@@ -818,14 +828,16 @@ static void __exit aesni_exit(void)
#ifdef HAS_LRW
crypto_unregister_alg(&ablk_lrw_alg);
#endif
+#ifdef CONFIG_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
#endif
crypto_unregister_alg(&ablk_ctr_alg);
crypto_unregister_alg(&ablk_cbc_alg);
- crypto_unregister_alg(&ablk_ecb_alg);
crypto_unregister_alg(&blk_ctr_alg);
crypto_unregister_alg(&blk_cbc_alg);
+#endif
+ crypto_unregister_alg(&ablk_ecb_alg);
crypto_unregister_alg(&blk_ecb_alg);
crypto_unregister_alg(&__aesni_alg);
crypto_unregister_alg(&aesni_alg);
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e4bac29..459fd35 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -539,8 +539,9 @@ config CRYPTO_AES_X86_64

config CRYPTO_AES_NI_INTEL
tristate "AES cipher algorithms (AES-NI)"
- depends on (X86 || UML_X86) && 64BIT
- select CRYPTO_AES_X86_64
+ depends on (X86 || UML_X86)
+ select CRYPTO_AES_X86_64 if 64BIT
+ select CRYPTO_AES_586 if !64BIT
select CRYPTO_CRYPTD
select CRYPTO_ALGAPI
select CRYPTO_FPU
@@ -565,7 +566,8 @@ config CRYPTO_AES_NI_INTEL

In addition to AES cipher algorithm support, the
acceleration for some popular block cipher mode is supported
- too, including ECB, CBC, CTR, LRW, PCBC, XTS.
+ too, including ECB, LRW, PCBC, XTS. The 64 bit version has
+ additional acceleration for CBC and CTR.

config CRYPTO_ANUBIS
tristate "Anubis cipher algorithm"
--
1.5.6.5


2010-11-03 22:27:13

by Huang, Ying

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

On Wed, 2010-11-03 at 14:14 -0700, Mathias Krause wrote:
> The AES-NI instructions are also available in legacy mode so the 32-bit
> architecture may profit from those, too.
>
> To illustrate the performance gain here's a short summary of the tcrypt
> speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
> implementations:
>
> x86: i586 aes-ni delta
> 256 bit, 8kB blocks, ECB: 125.94 MB/s 187.09 MB/s +48.6%

Which method did you use for speed testing?

modprobe tcrypt mode=200 sec=<?>

That actually does not work very well for AES-NI, because the AES-NI
blkcipher is tested in synchronous mode, and in that mode
kernel_fpu_begin/end() must be called for every block, and
kernel_fpu_begin/end() is quite slow. At the same time, some further
optimizations for AES-NI (such as the "ecb-aes-aesni" driver) cannot be
tested in that mode, because they are only available in asynchronous
mode.

When developing AES-NI for x86_64, I used dm-crypt + AES-NI for speed
testing; there the AES-NI blkcipher is tested in asynchronous mode, and
kernel_fpu_begin/end() is called only once per page. Could you use that
to test?

Or you could add a test_acipher_speed (similar to test_ahash_speed) to
test ciphers in asynchronous mode.
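
A rough, untested sketch of what the core of such an asynchronous
measurement could look like is below. The acipher_result struct, the
completion callback and the two helper functions are made-up names for
this sketch only; the rest is the plain ablkcipher API:

#include <linux/completion.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

struct acipher_result {
	struct completion completion;
	int err;
};

static void acipher_complete(struct crypto_async_request *req, int err)
{
	struct acipher_result *res = req->data;

	if (err == -EINPROGRESS)
		return;
	res->err = err;
	complete(&res->completion);
}

/* Issue one encryption through the async interface and wait for it. */
static int do_one_acipher_op(struct ablkcipher_request *req,
			     struct acipher_result *res)
{
	int ret = crypto_ablkcipher_encrypt(req);

	if (ret == -EINPROGRESS || ret == -EBUSY) {
		wait_for_completion(&res->completion);
		INIT_COMPLETION(res->completion);
		ret = res->err;
	}
	return ret;
}

static int acipher_speed_one(const char *algo, const u8 *key,
			     unsigned int keylen, void *buf,
			     unsigned int buflen, unsigned int rounds)
{
	struct crypto_ablkcipher *tfm;
	struct ablkcipher_request *req;
	struct acipher_result res;
	struct scatterlist sg;
	unsigned int i;
	int ret;

	/* asking for the async tfm gives us the cryptd-backed driver */
	tfm = crypto_alloc_ablkcipher(algo, 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	init_completion(&res.completion);
	req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		ret = -ENOMEM;
		goto out_tfm;
	}
	ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
					acipher_complete, &res);

	ret = crypto_ablkcipher_setkey(tfm, key, keylen);
	if (ret)
		goto out_req;

	sg_init_one(&sg, buf, buflen);
	ablkcipher_request_set_crypt(req, &sg, &sg, buflen, NULL);

	/* time this loop (e.g. with get_cycles()) to compute MB/s */
	for (i = 0; i < rounds && !ret; i++)
		ret = do_one_acipher_op(req, &res);

out_req:
	ablkcipher_request_free(req);
out_tfm:
	crypto_free_ablkcipher(tfm);
	return ret;
}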

Best Regards,
Huang Ying

2010-11-04 07:38:48

by Mathias Krause

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

On 03.11.2010, 23:27 Huang Ying wrote:
> On Wed, 2010-11-03 at 14:14 -0700, Mathias Krause wrote:
>> The AES-NI instructions are also available in legacy mode so the 32-bit
>> architecture may profit from those, too.
>>
>> To illustrate the performance gain here's a short summary of the tcrypt
>> speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
>> implementations:
>>
>> x86: i586 aes-ni delta
>> 256 bit, 8kB blocks, ECB: 125.94 MB/s 187.09 MB/s +48.6%
>
> Which method do you used for speed testing?
>
> modprobe tcrypt mode=200 sec=<?>

Yes. I used: modprobe tcrypt mode=200 sec=1

> That actually does not work very well for AES-NI. Because AES-NI
> blkcipher is tested in synchronous mode, and in that mode,
> kernel_fpu_begin/end() must be called for every block, and
> kernel_fpu_begin/end() is quite slow.

That's what I figured, too. Can this slowdown be avoided by saving and
restoring the used FPU registers within the assembler implementation or
would this be even slower?

> At the same time, some further
> optimization for AES-NI can not be tested (such as "ecb-aes-aesni"
> driver) in that mode, because they are only available in asynchronous
> mode.

After finding the bug in the second version of the patch I noticed this,
too.

> When developing AES-NI for x86_64, I uses dm-crypt + AES-NI for speed
> testing, where AES-NI blkcipher will be tested in asynchronous mode, and
> kernel_fpu_begin/end() is called for every page. Can you use that to
> test?

But wouldn't this be even slower than the above measurement? I took the
results for 8kB blocks and a page would only be 4kB ... well, depends on
what kind of pages you took. IIRC x86-64 not only supports 2MB but also
1GB pages ;)

> Or you can add test_acipher_speed (similar with test_ahash_speed) to
> test cipher in asynchronous mode.

Maybe I'll try this approach, since it looks like just a minor
modification of the tcrypt module.
Thanks for the hints!

Best regards,
Mathias

>
> Best Regards,
> Huang Ying

2010-11-04 12:24:38

by Huang, Ying

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

On Thu, 2010-11-04 at 00:38 -0700, Mathias Krause wrote:
> On 03.11.2010, 23:27 Huang Ying wrote:
> > On Wed, 2010-11-03 at 14:14 -0700, Mathias Krause wrote:
> >> The AES-NI instructions are also available in legacy mode so the 32-bit
> >> architecture may profit from those, too.
> >>
> >> To illustrate the performance gain here's a short summary of the tcrypt
> >> speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
> >> implementations:
> >>
> >> x86: i586 aes-ni delta
> >> 256 bit, 8kB blocks, ECB: 125.94 MB/s 187.09 MB/s +48.6%
> >
> > Which method do you used for speed testing?
> >
> > modprobe tcrypt mode=200 sec=<?>
>
> Yes. I used: modprobe tcrypt mode=200 sec=1
>
> > That actually does not work very well for AES-NI. Because AES-NI
> > blkcipher is tested in synchronous mode, and in that mode,
> > kernel_fpu_begin/end() must be called for every block, and
> > kernel_fpu_begin/end() is quite slow.
>
> That's what I figured, too. Can this slowdown be avoided by saving and
> restoring the used FPU registers within the assembler implementation or
> would this be even slower?

That would be a customized version of kernel_fpu_begin/end(); I think
the x86 maintainers will not like it, and the benefit may be small,
too.

> > At the same time, some further
> > optimization for AES-NI can not be tested (such as "ecb-aes-aesni"
> > driver) in that mode, because they are only available in asynchronous
> > mode.
>
> After finding the bug in the second version of the patch I noticed this,
> too.
>
> > When developing AES-NI for x86_64, I uses dm-crypt + AES-NI for speed
> > testing, where AES-NI blkcipher will be tested in asynchronous mode, and
> > kernel_fpu_begin/end() is called for every page. Can you use that to
> > test?
>
> But wouldn't this be even slower than the above measurement? I took the
> results for 8kB blocks and a page would only be 4kB ... well, depends on
> what kind of pages you took. IIRC x86-64 not only supports 2MB but also
> 1GB pages ;)

There is another difference between them. In synchronous mode
kernel_fpu_begin/end() is called for every block, while in asynchronous
mode with dm-crypt it is called only once per page. So although the
chunk size is smaller, the result will be better.
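
Roughly speaking (this is only an illustration of the cost model, not
the actual glue code; the prototypes are the ones exported by
aesni-intel_asm.S in your patch):

#include <crypto/aes.h>
#include <linux/linkage.h>
#include <linux/types.h>
#include <asm/i387.h>		/* kernel_fpu_begin()/kernel_fpu_end() */

asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst,
			  const u8 *src);
asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
			      const u8 *in, unsigned int len);

/* Synchronous, block-at-a-time usage: the fixed cost of saving and
 * restoring the FPU/SSE state is paid once per 16-byte block. */
static void ecb_enc_per_block(struct crypto_aes_ctx *ctx, u8 *dst,
			      const u8 *src, unsigned int nbytes)
{
	unsigned int i;

	for (i = 0; i < nbytes; i += AES_BLOCK_SIZE) {
		kernel_fpu_begin();
		aesni_enc(ctx, dst + i, src + i);
		kernel_fpu_end();
	}
}

/* Asynchronous/dm-crypt style usage: one kernel_fpu_begin/end() pair
 * covers a whole chunk (e.g. one page), so its cost is amortized over
 * nbytes/16 blocks. */
static void ecb_enc_per_chunk(struct crypto_aes_ctx *ctx, u8 *dst,
			      const u8 *src, unsigned int nbytes)
{
	kernel_fpu_begin();
	aesni_ecb_enc(ctx, dst, src, nbytes);
	kernel_fpu_end();
}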

> > Or you can add test_acipher_speed (similar with test_ahash_speed) to
> > test cipher in asynchronous mode.
>
> Maybe I'll try this approach, since it looks like just a minor
> modification of the tcrypt module.

Thanks!

Best Regards,
Huang Ying

2010-11-11 22:18:43

by Mathias Krause

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

Hello Huang Ying,

On 03.11.2010, 23:27 Huang Ying wrote:
> On Wed, 2010-11-03 at 14:14 -0700, Mathias Krause wrote:
>> The AES-NI instructions are also available in legacy mode so the 32-bit
>> architecture may profit from those, too.
>>
>> To illustrate the performance gain here's a short summary of the tcrypt
>> speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
>> implementations:
>>
>> x86: i586 aes-ni delta
>> 256 bit, 8kB blocks, ECB: 125.94 MB/s 187.09 MB/s +48.6%
>
> Which method do you used for speed testing?
>
> modprobe tcrypt mode=200 sec=<?>
>
> That actually does not work very well for AES-NI. Because AES-NI
> blkcipher is tested in synchronous mode, and in that mode,
> kernel_fpu_begin/end() must be called for every block, and
> kernel_fpu_begin/end() is quite slow. At the same time, some further
> optimization for AES-NI can not be tested (such as "ecb-aes-aesni"
> driver) in that mode, because they are only available in asynchronous
> mode.
>
> When developing AES-NI for x86_64, I uses dm-crypt + AES-NI for speed
> testing, where AES-NI blkcipher will be tested in asynchronous mode, and
> kernel_fpu_begin/end() is called for every page. Can you use that to
> test?
>
> Or you can add test_acipher_speed (similar with test_ahash_speed) to
> test cipher in asynchronous mode.

here are the numbers for dm-crypt. I ran the test again on the Core i7
M620, 2.67GHz. During the test I noticed that not porting the CBC
variant to x86 was a bad idea, so I did that too and got pretty nice
numbers (see v3 vs. v4 of the patch).

All tests were run five times in a row, using a 256 bit key and doing
I/O to the block device in chunks of 1 MB. The numbers are MB/s.

x86 (i586 variant):
1. run 2. run 3. run 4. run 5. run mean
ECB: 93.9 93.9 94.0 93.5 93.8 93.8
CBC: 84.9 84.8 84.9 84.9 84.8 84.8
XTS: 108.2 108.3 109.6 108.3 108.9 108.6
LRW: 105.0 105.0 105.1 105.1 105.1 105.0

x86 (AES-NI), v3 of the patch:
1. run 2. run 3. run 4. run 5. run mean
ECB: 124.8 120.8 124.5 120.6 124.5 123.0
CBC: 112.6 109.6 112.6 110.7 109.4 110.9
XTS: 221.6 221.1 220.9 223.5 224.4 222.3
LRW: 206.2 209.7 207.4 203.7 209.3 207.2

x86 (AES-NI), v4 of the patch:
1. run 2. run 3. run 4. run 5. run mean
ECB: 122.5 121.2 121.6 125.7 125.5 123.3
CBC: 259.5 259.2 261.2 264.0 267.6 262.3
XTS: 225.1 230.7 220.6 217.9 216.3 222.1
LRW: 202.7 202.8 210.6 208.9 202.7 205.5

Comparing the values for the CBC variant between v3 and v4 of the patch
shows that porting the CBC variant to x86 more than doubled the
performance, so the slightly ugly #ifdef'ed code is worth the effort.

x86-64 (old):
1. run 2. run 3. run 4. run 5. run mean
ECB: 121.4 120.9 121.1 121.2 120.9 121.1
CBC: 282.5 286.3 281.5 282.0 294.5 285.3
XTS: 263.6 260.3 263.0 267.0 264.6 263.7
LRW: 249.6 249.8 250.5 253.4 252.2 251.1

x86-64 (new):
1. run 2. run 3. run 4. run 5. run mean
ECB: 122.1 122.0 122.0 127.0 121.9 123.0
CBC: 291.2 286.2 295.6 291.4 289.9 290.8
XTS: 263.3 264.4 264.5 264.2 270.4 265.3
LRW: 254.9 252.3 253.6 258.2 257.5 255.3

Comparing the mean values gives us:

x86: i586 aes-ni delta
ECB: 93.8 123.3 +31.4%
CBC: 84.8 262.3 +209.3%
XTS: 108.6 222.1 +104.5%
LRW: 105.0 205.5 +95.7%

x86-64: old new delta
ECB: 121.1 123.0 +1.5%
CBC: 285.3 290.8 +1.9%
XTS: 263.7 265.3 +0.6%
LRW: 251.1 255.3 +1.7%

The improvement of the new x86-64 version over the old one is not as
drastic as for the synchronous variant (see the tcrypt tests in the
previous email), but it is nevertheless an improvement. The improvement
for the x86 case, however, should be clearly noticeable: it's almost as
fast as the x86-64 version.

I'll post the new version of the patch in a follow-up email.


Regards,
Mathias

2010-11-12 00:33:13

by Huang, Ying

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

Hi, Mathias,

On Fri, 2010-11-12 at 06:18 +0800, Mathias Krause wrote:
> All test were run five times in a row using a 256 bit key and doing i/o
> to the block device in chunks of 1MB. The numbers are MB/s.
>
> x86 (i586 variant):
> 1. run 2. run 3. run 4. run 5. run mean
> ECB: 93.9 93.9 94.0 93.5 93.8 93.8
> CBC: 84.9 84.8 84.9 84.9 84.8 84.8
> XTS: 108.2 108.3 109.6 108.3 108.9 108.6
> LRW: 105.0 105.0 105.1 105.1 105.1 105.0
>
> x86 (AES-NI), v3 of the patch:
> 1. run 2. run 3. run 4. run 5. run mean
> ECB: 124.8 120.8 124.5 120.6 124.5 123.0
> CBC: 112.6 109.6 112.6 110.7 109.4 110.9
> XTS: 221.6 221.1 220.9 223.5 224.4 222.3
> LRW: 206.2 209.7 207.4 203.7 209.3 207.2
>
> x86 (AES-NI), v4 of the patch:
> 1. run 2. run 3. run 4. run 5. run mean
> ECB: 122.5 121.2 121.6 125.7 125.5 123.3
> CBC: 259.5 259.2 261.2 264.0 267.6 262.3
> XTS: 225.1 230.7 220.6 217.9 216.3 222.1
> LRW: 202.7 202.8 210.6 208.9 202.7 205.5
>
> Comparing the values for the CBC variant between v3 and v4 of the patch
> shows that porting the CBC variant to x86 more then doubled the
> performance so the little bit ugly #ifdefed code is worth the effort.
>
> x86-64 (old):
> 1. run 2. run 3. run 4. run 5. run mean
> ECB: 121.4 120.9 121.1 121.2 120.9 121.1
> CBC: 282.5 286.3 281.5 282.0 294.5 285.3
> XTS: 263.6 260.3 263.0 267.0 264.6 263.7
> LRW: 249.6 249.8 250.5 253.4 252.2 251.1
>
> x86-64 (new):
> 1. run 2. run 3. run 4. run 5. run mean
> ECB: 122.1 122.0 122.0 127.0 121.9 123.0
> CBC: 291.2 286.2 295.6 291.4 289.9 290.8
> XTS: 263.3 264.4 264.5 264.2 270.4 265.3
> LRW: 254.9 252.3 253.6 258.2 257.5 255.3
>
> Comparing the mean values gives us:
>
> x86: i586 aes-ni delta
> ECB: 93.8 123.3 +31.4%

Why is the improvement for ECB so small? I cannot understand it. It
should be as big as the one for CBC.

Best Regards,
Huang Ying

> CBC: 84.8 262.3 +209.3%
> LRW: 108.6 222.1 +104.5%
> XTS: 105.0 205.5 +95.7%
>
> x86-64: old new delta
> ECB: 121.1 123.0 +1.5%
> CBC: 285.3 290.8 +1.9%
> LRW: 263.7 265.3 +0.6%
> XTS: 251.1 255.3 +1.7%
>
> The improvement for the old vs. the new x86-64 version is not as
> drastically as for the synchronous variant (see the tcrypt tests in the
> previous email), but nevertheless an improvement. The improvement for
> the x86 case, albeit, should be noticeable. It's almost as fast as the
> x86-64 version.
>
> I'll post the new version of the patch in a follow-up email.
>
>
> Regards,
> Mathias
>

2010-11-12 07:30:53

by Mathias Krause

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

On 12.11.2010, 01:33 Huang Ying wrote:
> Hi, Mathias,
>
> On Fri, 2010-11-12 at 06:18 +0800, Mathias Krause wrote:
>> All test were run five times in a row using a 256 bit key and doing i/o
>> to the block device in chunks of 1MB. The numbers are MB/s.
>>
>> x86 (i586 variant):
>> 1. run 2. run 3. run 4. run 5. run mean
>> ECB: 93.9 93.9 94.0 93.5 93.8 93.8
>> CBC: 84.9 84.8 84.9 84.9 84.8 84.8
>> XTS: 108.2 108.3 109.6 108.3 108.9 108.6
>> LRW: 105.0 105.0 105.1 105.1 105.1 105.0
>>
>> x86 (AES-NI), v3 of the patch:
>> 1. run 2. run 3. run 4. run 5. run mean
>> ECB: 124.8 120.8 124.5 120.6 124.5 123.0
>> CBC: 112.6 109.6 112.6 110.7 109.4 110.9
>> XTS: 221.6 221.1 220.9 223.5 224.4 222.3
>> LRW: 206.2 209.7 207.4 203.7 209.3 207.2
>>
>> x86 (AES-NI), v4 of the patch:
>> 1. run 2. run 3. run 4. run 5. run mean
>> ECB: 122.5 121.2 121.6 125.7 125.5 123.3
>> CBC: 259.5 259.2 261.2 264.0 267.6 262.3
>> XTS: 225.1 230.7 220.6 217.9 216.3 222.1
>> LRW: 202.7 202.8 210.6 208.9 202.7 205.5
>>
>> Comparing the values for the CBC variant between v3 and v4 of the patch
>> shows that porting the CBC variant to x86 more then doubled the
>> performance so the little bit ugly #ifdefed code is worth the effort.
>>
>> x86-64 (old):
>> 1. run 2. run 3. run 4. run 5. run mean
>> ECB: 121.4 120.9 121.1 121.2 120.9 121.1
>> CBC: 282.5 286.3 281.5 282.0 294.5 285.3
>> XTS: 263.6 260.3 263.0 267.0 264.6 263.7
>> LRW: 249.6 249.8 250.5 253.4 252.2 251.1
>>
>> x86-64 (new):
>> 1. run 2. run 3. run 4. run 5. run mean
>> ECB: 122.1 122.0 122.0 127.0 121.9 123.0
>> CBC: 291.2 286.2 295.6 291.4 289.9 290.8
>> XTS: 263.3 264.4 264.5 264.2 270.4 265.3
>> LRW: 254.9 252.3 253.6 258.2 257.5 255.3
>>
>> Comparing the mean values gives us:
>>
>> x86: i586 aes-ni delta
>> ECB: 93.8 123.3 +31.4%
>
> Why the improvement of ECB is so small? I can not understand it. It
> should be as big as CBC.

I don't know why the ECB variant is so slow compared to the other variants.
But it is so even for the current x86-64 version. See the above values for
"x86-64 (old)". I setup dm-crypt for this test like this:
# cryptsetup -c aes-ecb-plain -d /dev/urandom create cfs /dev/loop0

What were the numbers you measured in your tests while developing the
x86-64 version?

Best regards,
Mathias

>
> Best Regards,
> Huang Ying
>
>> CBC: 84.8 262.3 +209.3%
>> LRW: 108.6 222.1 +104.5%
>> XTS: 105.0 205.5 +95.7%
>>
>> x86-64: old new delta
>> ECB: 121.1 123.0 +1.5%
>> CBC: 285.3 290.8 +1.9%
>> LRW: 263.7 265.3 +0.6%
>> XTS: 251.1 255.3 +1.7%
>>
>> The improvement for the old vs. the new x86-64 version is not as
>> drastically as for the synchronous variant (see the tcrypt tests in the
>> previous email), but nevertheless an improvement. The improvement for
>> the x86 case, albeit, should be noticeable. It's almost as fast as the
>> x86-64 version.
>>
>> I'll post the new version of the patch in a follow-up email.
>>
>>
>> Regards,
>> Mathias
>>
>

2010-11-12 07:34:38

by Huang, Ying

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

On Fri, 2010-11-12 at 15:30 +0800, Mathias Krause wrote:
> On 12.11.2010, 01:33 Huang Ying wrote:
> > Hi, Mathias,
> >
> > On Fri, 2010-11-12 at 06:18 +0800, Mathias Krause wrote:
> >> All test were run five times in a row using a 256 bit key and doing i/o
> >> to the block device in chunks of 1MB. The numbers are MB/s.
> >>
> >> x86 (i586 variant):
> >> 1. run 2. run 3. run 4. run 5. run mean
> >> ECB: 93.9 93.9 94.0 93.5 93.8 93.8
> >> CBC: 84.9 84.8 84.9 84.9 84.8 84.8
> >> XTS: 108.2 108.3 109.6 108.3 108.9 108.6
> >> LRW: 105.0 105.0 105.1 105.1 105.1 105.0
> >>
> >> x86 (AES-NI), v3 of the patch:
> >> 1. run 2. run 3. run 4. run 5. run mean
> >> ECB: 124.8 120.8 124.5 120.6 124.5 123.0
> >> CBC: 112.6 109.6 112.6 110.7 109.4 110.9
> >> XTS: 221.6 221.1 220.9 223.5 224.4 222.3
> >> LRW: 206.2 209.7 207.4 203.7 209.3 207.2
> >>
> >> x86 (AES-NI), v4 of the patch:
> >> 1. run 2. run 3. run 4. run 5. run mean
> >> ECB: 122.5 121.2 121.6 125.7 125.5 123.3
> >> CBC: 259.5 259.2 261.2 264.0 267.6 262.3
> >> XTS: 225.1 230.7 220.6 217.9 216.3 222.1
> >> LRW: 202.7 202.8 210.6 208.9 202.7 205.5
> >>
> >> Comparing the values for the CBC variant between v3 and v4 of the patch
> >> shows that porting the CBC variant to x86 more then doubled the
> >> performance so the little bit ugly #ifdefed code is worth the effort.
> >>
> >> x86-64 (old):
> >> 1. run 2. run 3. run 4. run 5. run mean
> >> ECB: 121.4 120.9 121.1 121.2 120.9 121.1
> >> CBC: 282.5 286.3 281.5 282.0 294.5 285.3
> >> XTS: 263.6 260.3 263.0 267.0 264.6 263.7
> >> LRW: 249.6 249.8 250.5 253.4 252.2 251.1
> >>
> >> x86-64 (new):
> >> 1. run 2. run 3. run 4. run 5. run mean
> >> ECB: 122.1 122.0 122.0 127.0 121.9 123.0
> >> CBC: 291.2 286.2 295.6 291.4 289.9 290.8
> >> XTS: 263.3 264.4 264.5 264.2 270.4 265.3
> >> LRW: 254.9 252.3 253.6 258.2 257.5 255.3
> >>
> >> Comparing the mean values gives us:
> >>
> >> x86: i586 aes-ni delta
> >> ECB: 93.8 123.3 +31.4%
> >
> > Why the improvement of ECB is so small? I can not understand it. It
> > should be as big as CBC.
>
> I don't know why the ECB variant is so slow compared to the other variants.
> But it is so even for the current x86-64 version. See the above values for
> "x86-64 (old)". I setup dm-crypt for this test like this:
> # cryptsetup -c aes-ecb-plain -d /dev/urandom create cfs /dev/loop0
>
> What where the numbers you measured in your tests while developing the
> x86-64 version?

I can't remember the numbers. Are you interested in digging into the
issue?

Best Regards,
Huang Ying

2010-11-12 07:42:51

by Mathias Krause

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

On 12.11.2010, 08:34 Huang Ying wrote:
> On Fri, 2010-11-12 at 15:30 +0800, Mathias Krause wrote:
>> On 12.11.2010, 01:33 Huang Ying wrote:
>>> Hi, Mathias,
>>>
>>> On Fri, 2010-11-12 at 06:18 +0800, Mathias Krause wrote:
>>>> All test were run five times in a row using a 256 bit key and doing i/o
>>>> to the block device in chunks of 1MB. The numbers are MB/s.
>>>>
>>>> x86 (i586 variant):
>>>> 1. run 2. run 3. run 4. run 5. run mean
>>>> ECB: 93.9 93.9 94.0 93.5 93.8 93.8
>>>> CBC: 84.9 84.8 84.9 84.9 84.8 84.8
>>>> XTS: 108.2 108.3 109.6 108.3 108.9 108.6
>>>> LRW: 105.0 105.0 105.1 105.1 105.1 105.0
>>>>
>>>> x86 (AES-NI), v3 of the patch:
>>>> 1. run 2. run 3. run 4. run 5. run mean
>>>> ECB: 124.8 120.8 124.5 120.6 124.5 123.0
>>>> CBC: 112.6 109.6 112.6 110.7 109.4 110.9
>>>> XTS: 221.6 221.1 220.9 223.5 224.4 222.3
>>>> LRW: 206.2 209.7 207.4 203.7 209.3 207.2
>>>>
>>>> x86 (AES-NI), v4 of the patch:
>>>> 1. run 2. run 3. run 4. run 5. run mean
>>>> ECB: 122.5 121.2 121.6 125.7 125.5 123.3
>>>> CBC: 259.5 259.2 261.2 264.0 267.6 262.3
>>>> XTS: 225.1 230.7 220.6 217.9 216.3 222.1
>>>> LRW: 202.7 202.8 210.6 208.9 202.7 205.5
>>>>
>>>> Comparing the values for the CBC variant between v3 and v4 of the patch
>>>> shows that porting the CBC variant to x86 more then doubled the
>>>> performance so the little bit ugly #ifdefed code is worth the effort.
>>>>
>>>> x86-64 (old):
>>>> 1. run 2. run 3. run 4. run 5. run mean
>>>> ECB: 121.4 120.9 121.1 121.2 120.9 121.1
>>>> CBC: 282.5 286.3 281.5 282.0 294.5 285.3
>>>> XTS: 263.6 260.3 263.0 267.0 264.6 263.7
>>>> LRW: 249.6 249.8 250.5 253.4 252.2 251.1
>>>>
>>>> x86-64 (new):
>>>> 1. run 2. run 3. run 4. run 5. run mean
>>>> ECB: 122.1 122.0 122.0 127.0 121.9 123.0
>>>> CBC: 291.2 286.2 295.6 291.4 289.9 290.8
>>>> XTS: 263.3 264.4 264.5 264.2 270.4 265.3
>>>> LRW: 254.9 252.3 253.6 258.2 257.5 255.3
>>>>
>>>> Comparing the mean values gives us:
>>>>
>>>> x86: i586 aes-ni delta
>>>> ECB: 93.8 123.3 +31.4%
>>>
>>> Why the improvement of ECB is so small? I can not understand it. It
>>> should be as big as CBC.
>>
>> I don't know why the ECB variant is so slow compared to the other variants.
>> But it is so even for the current x86-64 version. See the above values for
>> "x86-64 (old)". I setup dm-crypt for this test like this:
>> # cryptsetup -c aes-ecb-plain -d /dev/urandom create cfs /dev/loop0
>>
>> What where the numbers you measured in your tests while developing the
>> x86-64 version?
>
> Can't remember the number. Do you have interest to dig into the issue?

Sure. Increasing performance is always a good thing to do. :)

Best regards,
Mathias

2010-11-12 23:25:08

by Mathias Krause

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

On 12.11.2010, 08:34 Huang Ying wrote:
> On Fri, 2010-11-12 at 15:30 +0800, Mathias Krause wrote:
>> On 12.11.2010, 01:33 Huang Ying wrote:
>>> Why the improvement of ECB is so small? I can not understand it. It
>>> should be as big as CBC.
>>
>> I don't know why the ECB variant is so slow compared to the other variants.
>> But it is so even for the current x86-64 version. See the above values for
>> "x86-64 (old)". I setup dm-crypt for this test like this:
>> # cryptsetup -c aes-ecb-plain -d /dev/urandom create cfs /dev/loop0
>>
>> What where the numbers you measured in your tests while developing the
>> x86-64 version?
>
> Can't remember the number. Do you have interest to dig into the issue?

I looked at /proc/crypto while doing the tests again and noticed that ECB
isn't handled using cryptd, while all other modes, e.g. CBC and CTR, are.
The reason for that seems to be that for ECB, and only for ECB, the kernel
is using the synchronous block algorithm instead of the asynchronous one.
So the question is: Why is the ECB variant handled using the synchronous
cipher -- because of the missing IV handling in this mode?
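
Instead of digging through /proc/crypto one can also check which
implementation the asynchronous interface resolves to with a small
throwaway snippet like this (untested sketch; the allocation mirrors
roughly what dm-crypt does):

#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/kernel.h>

/* Print which driver backs "ecb(aes)" when it is allocated through the
 * asynchronous interface. */
static void show_ecb_aes_driver(void)
{
	struct crypto_ablkcipher *tfm;

	tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, 0);
	if (IS_ERR(tfm)) {
		pr_err("ecb(aes): allocation failed: %ld\n", PTR_ERR(tfm));
		return;
	}
	pr_info("ecb(aes) -> %s\n",
		crypto_tfm_alg_driver_name(crypto_ablkcipher_tfm(tfm)));
	crypto_free_ablkcipher(tfm);
}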

Best regards,
Mathias

2010-11-18 07:38:45

by Mathias Krause

Subject: Re: [PATCH v3] x86, crypto: ported aes-ni implementation to x86

On 13.11.2010, 00:25 Mathias Krause wrote:
> On 12.11.2010, 08:34 Huang Ying wrote:
>> On Fri, 2010-11-12 at 15:30 +0800, Mathias Krause wrote:
>>> On 12.11.2010, 01:33 Huang Ying wrote:
>>>> Why the improvement of ECB is so small? I can not understand it. It
>>>> should be as big as CBC.
>>>
>>> I don't know why the ECB variant is so slow compared to the other variants.
>>> But it is so even for the current x86-64 version. See the above values for
>>> "x86-64 (old)". I setup dm-crypt for this test like this:
>>> # cryptsetup -c aes-ecb-plain -d /dev/urandom create cfs /dev/loop0
>>>
>>> What where the numbers you measured in your tests while developing the
>>> x86-64 version?
>>
>> Can't remember the number. Do you have interest to dig into the issue?
>
> I looked at /proc/crypto while doing the tests again and noticed that ECB
> isn't handled using cryptd, while all other modes, e.g. CBC and CTR, are.
> The reason for that seems to be that for ECB, and only for ECB, the kernel
> is using the synchronous block algorithm instead of the asynchronous one.
> So the question is: Why is the ECB variant handled using the synchronous
> cipher -- because of the missing iv handling in this mode?

Herbert, any idea why this is the case?

Regards,
Mathias