2010-10-29 21:19:58

by Mathias Krause

Subject: [PATCH] x86, crypto: ported aes-ni implementation to x86

The AES-NI instructions are also available in legacy (32-bit) mode, so the
x86 architecture can benefit from them, too.

To illustrate the performance gain, here's a short summary of the tcrypt
speed test on a Core i5 M 520 running at 2.40GHz, comparing both assembler
implementations:

                           aes-i586     aes-ni-i586    delta
256 bit, 8kB blocks, ECB:  46.81 MB/s   164.46 MB/s   +251%
256 bit, 8kB blocks, CBC:  43.89 MB/s    62.18 MB/s    +41%
384 bit, 8kB blocks, LRW:  42.24 MB/s   142.90 MB/s   +238%
512 bit, 8kB blocks, XTS:  43.41 MB/s   148.67 MB/s   +242%

Signed-off-by: Mathias Krause <[email protected]>
---
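Notes: the figures above are from tcrypt's AES cipher speed tests; something
along these lines should reproduce them (example invocation only -- adjust
the module parameters as needed):

	# example invocation: mode=200 selects the AES cipher speed tests
	# (ECB, CBC, LRW, XTS), sec=1 runs each test for one second
	modprobe tcrypt mode=200 sec=1

The results show up in the kernel log; comparing the two implementations
means running with only one of aes-i586 / aesni-intel-i586 loaded at a time
so the corresponding driver gets picked.
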
arch/x86/crypto/Makefile | 7 +-
arch/x86/crypto/aesni-intel_asm-i586.S | 773 +++++++++++++++++++++++++++
arch/x86/crypto/aesni-intel_asm-x86_64.S | 841 ++++++++++++++++++++++++++++++
arch/x86/crypto/aesni-intel_asm.S | 841 ------------------------------
arch/x86/crypto/aesni-intel_glue.c | 18 +
crypto/Kconfig | 32 ++-
6 files changed, 1667 insertions(+), 845 deletions(-)
create mode 100644 arch/x86/crypto/aesni-intel_asm-i586.S
create mode 100644 arch/x86/crypto/aesni-intel_asm-x86_64.S
delete mode 100644 arch/x86/crypto/aesni-intel_asm.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1a58ad8..949e7e5 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,25 +5,26 @@
obj-$(CONFIG_CRYPTO_FPU) += fpu.o

obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
+obj-$(CONFIG_CRYPTO_AES_NI_INTEL_586) += aesni-intel-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o

obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_AES_NI_INTEL_X86_64) += aesni-intel-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
-obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o

obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o

aes-i586-y := aes-i586-asm_32.o aes_glue.o
+aesni-intel-i586-y := aesni-intel_asm-i586.o aesni-intel_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o

aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
+aesni-intel-x86_64-y := aesni-intel_asm-x86_64.o aesni-intel_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o

-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm-i586.S b/arch/x86/crypto/aesni-intel_asm-i586.S
new file mode 100644
index 0000000..e2bdb5a
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_asm-i586.S
@@ -0,0 +1,773 @@
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <[email protected]>
+ * Vinodh Gopal <[email protected]>
+ * Kahraman Akdemir
+ * Copyright (C) 2010 secunet Security Networks AG
+ * Author: Mathias Krause <[email protected]>
+ * ported x86_64 version to x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+.text
+
+#define STATE1 %xmm0
+#define STATE2 %xmm4
+#define STATE3 %xmm5
+#define STATE4 %xmm6
+#define STATE STATE1
+#define IN1 %xmm1
+#define IN2 %xmm7
+#define IN IN1
+#define KEY %xmm2
+#define IV %xmm3
+
+#define KEYP %edi
+#define OUTP %eax
+#define INP %edx
+#define LEN %esi
+#define IVP %ebp
+#define EKLEN 480(KEYP)
+#define DKLEN 240(KEYP)
+#define T1 %ecx
+#define TKEYP T1
+
+_key_expansion_128:
+_key_expansion_256a:
+ pshufd $0b11111111, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+ movaps %xmm0, (%ecx)
+ add $0x10, %ecx
+ ret
+
+_key_expansion_192a:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ movaps %xmm2, %xmm6
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, %xmm1
+ shufps $0b01000100, %xmm0, %xmm6
+ movaps %xmm6, (%ecx)
+ shufps $0b01001110, %xmm2, %xmm1
+ movaps %xmm1, 0x10(%ecx)
+ add $0x20, %ecx
+ ret
+
+_key_expansion_192b:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, (%ecx)
+ add $0x10, %ecx
+ ret
+
+_key_expansion_256b:
+ pshufd $0b10101010, %xmm1, %xmm1
+ shufps $0b00010000, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ shufps $0b10001100, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ pxor %xmm1, %xmm2
+ movaps %xmm2, (%ecx)
+ add $0x10, %ecx
+ ret
+
+/*
+ * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
+ */
+ENTRY(aesni_set_key)
+ pushl %edi
+ movl 8(%esp), %edi # ctx
+ movl 12(%esp), %edx # in_key
+ movl 16(%esp), %eax # key_len
+
+ movups (%edx), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (%edi)
+ lea 0x10(%edi), %ecx # key addr
+ movl %eax, 480(%edi)
+ pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
+ cmp $24, %al
+ jb .Lenc_key128
+ je .Lenc_key192
+ movups 0x10(%edx), %xmm2 # other user key
+ movaps %xmm2, (%ecx)
+ add $0x10, %ecx
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ call _key_expansion_256a
+ jmp .Ldec_key
+.Lenc_key192:
+ movq 0x10(%edx), %xmm2 # other user key
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
+ call _key_expansion_192b
+ jmp .Ldec_key
+.Lenc_key128:
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
+ call _key_expansion_128
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
+ call _key_expansion_128
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
+ call _key_expansion_128
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
+ call _key_expansion_128
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
+ call _key_expansion_128
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
+ call _key_expansion_128
+ AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
+ call _key_expansion_128
+ AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
+ call _key_expansion_128
+ AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
+ call _key_expansion_128
+ AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
+ call _key_expansion_128
+.Ldec_key:
+ sub $0x10, %ecx
+ movaps (%edi), %xmm0
+ movaps (%ecx), %xmm1
+ movaps %xmm0, 240(%ecx)
+ movaps %xmm1, 240(%edi)
+ add $0x10, %edi
+ lea 240-16(%ecx), %edx
+.align 4
+.Ldec_key_loop:
+ movaps (%edi), %xmm0
+ AESIMC %xmm0 %xmm1
+ movaps %xmm1, (%edx)
+ add $0x10, %edi
+ sub $0x10, %edx
+ cmp %ecx, %edi
+ jb .Ldec_key_loop
+ xor %eax, %eax
+ popl %edi
+ ret
+
+/*
+ * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_enc)
+ pushl KEYP
+ movl 8(%esp), KEYP
+ movl 12(%esp), OUTP
+ movl 16(%esp), INP
+ movups (INP), STATE # input
+ call _aesni_enc1
+ movups STATE, (OUTP) # output
+ popl KEYP
+ ret
+
+/*
+ * _aesni_enc1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * EKLEN: round count
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_enc1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, EKLEN
+ jb .Lenc128
+ lea 0x20(TKEYP), TKEYP
+ je .Lenc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x50(TKEYP), KEY
+ AESENC KEY STATE
+.align 4
+.Lenc192:
+ movaps -0x40(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x30(TKEYP), KEY
+ AESENC KEY STATE
+.align 4
+.Lenc128:
+ movaps -0x20(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x10(TKEYP), KEY
+ AESENC KEY STATE
+ movaps (TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x10(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x20(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x30(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x40(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x50(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x60(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x70(TKEYP), KEY
+ AESENCLAST KEY STATE
+ ret
+
+/*
+ * _aesni_enc4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * EKLEN: round count
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_enc4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, EKLEN
+ jb .L4enc128
+ lea 0x20(TKEYP), TKEYP
+ je .L4enc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x50(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+#.align 4
+.L4enc192:
+ movaps -0x40(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x30(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+#.align 4
+.L4enc128:
+ movaps -0x20(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x10(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps (TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x10(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x20(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x30(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x40(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x50(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x60(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x70(TKEYP), KEY
+ AESENCLAST KEY STATE1 # last round
+ AESENCLAST KEY STATE2
+ AESENCLAST KEY STATE3
+ AESENCLAST KEY STATE4
+ ret
+
+/*
+ * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_dec)
+ pushl KEYP
+ movl 8(%esp), KEYP
+ movl 12(%esp), OUTP
+ movl 16(%esp), INP
+ add $240, KEYP
+ movups (INP), STATE # input
+ call _aesni_dec1
+ movups STATE, (OUTP) #output
+ popl KEYP
+ ret
+
+/*
+ * _aesni_dec1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * DKLEN: key length
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_dec1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, DKLEN
+ jb .Ldec128
+ lea 0x20(TKEYP), TKEYP
+ je .Ldec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x50(TKEYP), KEY
+ AESDEC KEY STATE
+.align 4
+.Ldec192:
+ movaps -0x40(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x30(TKEYP), KEY
+ AESDEC KEY STATE
+.align 4
+.Ldec128:
+ movaps -0x20(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x10(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps (TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x10(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x20(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x30(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x40(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x50(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x60(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x70(TKEYP), KEY
+ AESDECLAST KEY STATE
+ ret
+
+/*
+ * _aesni_dec4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * DKLEN: key length
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_dec4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, DKLEN
+ jb .L4dec128
+ lea 0x20(TKEYP), TKEYP
+ je .L4dec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x50(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+.align 4
+.L4dec192:
+ movaps -0x40(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x30(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+.align 4
+.L4dec128:
+ movaps -0x20(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x10(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps (TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x10(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x20(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x30(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x40(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x50(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x60(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x70(TKEYP), KEY
+ AESDECLAST KEY STATE1 # last round
+ AESDECLAST KEY STATE2
+ AESDECLAST KEY STATE3
+ AESDECLAST KEY STATE4
+ ret
+
+/*
+ * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len)
+ */
+ENTRY(aesni_ecb_enc)
+ pushl LEN
+ pushl KEYP
+ movl 12(%esp), KEYP
+ movl 16(%esp), OUTP
+ movl 20(%esp), INP
+ movl 24(%esp), LEN
+ test LEN, LEN # check length
+ jz .Lecb_enc_ret
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+ cmp $64, LEN
+ jb .Lecb_enc_loop1
+.align 4
+.Lecb_enc_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_enc4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_enc_loop4
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+.align 4
+.Lecb_enc_loop1:
+ movups (INP), STATE1
+ call _aesni_enc1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_enc_loop1
+.Lecb_enc_ret:
+ popl KEYP
+ popl LEN
+ ret
+
+/*
+ * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len);
+ */
+ENTRY(aesni_ecb_dec)
+ pushl LEN
+ pushl KEYP
+ movl 12(%esp), KEYP
+ movl 16(%esp), OUTP
+ movl 20(%esp), INP
+ movl 24(%esp), LEN
+ test LEN, LEN
+ jz .Lecb_dec_ret
+ add $240, KEYP
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+ cmp $64, LEN
+ jb .Lecb_dec_loop1
+.align 4
+.Lecb_dec_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_dec4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_dec_loop4
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+.align 4
+.Lecb_dec_loop1:
+ movups (INP), STATE1
+ call _aesni_dec1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_dec_loop1
+.Lecb_dec_ret:
+ popl KEYP
+ popl LEN
+ ret
+
+/*
+ * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_enc)
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ movl 16(%esp), KEYP
+ movl 20(%esp), OUTP
+ movl 24(%esp), INP
+ movl 28(%esp), LEN
+ cmp $16, LEN
+ jb .Lcbc_enc_ret
+ movups (IVP), STATE # load iv as initial state
+.align 4
+.Lcbc_enc_loop:
+ movups (INP), IN # load input
+ pxor IN, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP) # store output
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_enc_loop
+ movups STATE, (IVP)
+.Lcbc_enc_ret:
+ popl KEYP
+ popl LEN
+ popl IVP
+ ret
+
+/*
+ * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_dec)
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ movl 16(%esp), KEYP
+ movl 20(%esp), OUTP
+ movl 24(%esp), INP
+ movl 28(%esp), LEN
+ cmp $16, LEN
+ jb .Lcbc_dec_just_ret
+ add $240, KEYP
+ movups (IVP), IV
+ cmp $64, LEN
+ jb .Lcbc_dec_loop1
+.align 4
+.Lcbc_dec_loop4:
+ movups (INP), IN1
+ movaps IN1, STATE1
+ movups 0x10(INP), IN2
+ movaps IN2, STATE2
+ movups 0x20(INP), IN1
+ movaps IN1, STATE3
+ movups 0x30(INP), IN2
+ movaps IN2, STATE4
+ call _aesni_dec4
+ pxor IV, STATE1
+ pxor (INP), STATE2
+ pxor 0x10(INP), STATE3
+ pxor IN1, STATE4
+ movaps IN2, IV
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lcbc_dec_loop4
+ cmp $16, LEN
+ jb .Lcbc_dec_ret
+.align 4
+.Lcbc_dec_loop1:
+ movups (INP), IN
+ movaps IN, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+ movups STATE, (OUTP)
+ movaps IN, IV
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_dec_loop1
+.Lcbc_dec_ret:
+ movups IV, (IVP)
+.Lcbc_dec_just_ret:
+ popl KEYP
+ popl LEN
+ popl IVP
+ ret
diff --git a/arch/x86/crypto/aesni-intel_asm-x86_64.S b/arch/x86/crypto/aesni-intel_asm-x86_64.S
new file mode 100644
index 0000000..ff16756
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_asm-x86_64.S
@@ -0,0 +1,841 @@
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <[email protected]>
+ * Vinodh Gopal <[email protected]>
+ * Kahraman Akdemir
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+.text
+
+#define STATE1 %xmm0
+#define STATE2 %xmm4
+#define STATE3 %xmm5
+#define STATE4 %xmm6
+#define STATE STATE1
+#define IN1 %xmm1
+#define IN2 %xmm7
+#define IN3 %xmm8
+#define IN4 %xmm9
+#define IN IN1
+#define KEY %xmm2
+#define IV %xmm3
+#define BSWAP_MASK %xmm10
+#define CTR %xmm11
+#define INC %xmm12
+
+#define KEYP %rdi
+#define OUTP %rsi
+#define INP %rdx
+#define LEN %rcx
+#define IVP %r8
+#define KLEN %r9d
+#define T1 %r10
+#define TKEYP T1
+#define T2 %r11
+#define TCTR_LOW T2
+
+_key_expansion_128:
+_key_expansion_256a:
+ pshufd $0b11111111, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+ movaps %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+
+_key_expansion_192a:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ movaps %xmm2, %xmm6
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, %xmm1
+ shufps $0b01000100, %xmm0, %xmm6
+ movaps %xmm6, (%rcx)
+ shufps $0b01001110, %xmm2, %xmm1
+ movaps %xmm1, 16(%rcx)
+ add $0x20, %rcx
+ ret
+
+_key_expansion_192b:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+
+_key_expansion_256b:
+ pshufd $0b10101010, %xmm1, %xmm1
+ shufps $0b00010000, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ shufps $0b10001100, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ pxor %xmm1, %xmm2
+ movaps %xmm2, (%rcx)
+ add $0x10, %rcx
+ ret
+
+/*
+ * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
+ */
+ENTRY(aesni_set_key)
+ movups (%rsi), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (%rdi)
+ lea 0x10(%rdi), %rcx # key addr
+ movl %edx, 480(%rdi)
+ pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
+ cmp $24, %dl
+ jb .Lenc_key128
+ je .Lenc_key192
+ movups 0x10(%rsi), %xmm2 # other user key
+ movaps %xmm2, (%rcx)
+ add $0x10, %rcx
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ call _key_expansion_256a
+ jmp .Ldec_key
+.Lenc_key192:
+ movq 0x10(%rsi), %xmm2 # other user key
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
+ call _key_expansion_192b
+ jmp .Ldec_key
+.Lenc_key128:
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
+ call _key_expansion_128
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
+ call _key_expansion_128
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
+ call _key_expansion_128
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
+ call _key_expansion_128
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
+ call _key_expansion_128
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
+ call _key_expansion_128
+ AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
+ call _key_expansion_128
+ AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
+ call _key_expansion_128
+ AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
+ call _key_expansion_128
+ AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
+ call _key_expansion_128
+.Ldec_key:
+ sub $0x10, %rcx
+ movaps (%rdi), %xmm0
+ movaps (%rcx), %xmm1
+ movaps %xmm0, 240(%rcx)
+ movaps %xmm1, 240(%rdi)
+ add $0x10, %rdi
+ lea 240-16(%rcx), %rsi
+.align 4
+.Ldec_key_loop:
+ movaps (%rdi), %xmm0
+ AESIMC %xmm0 %xmm1
+ movaps %xmm1, (%rsi)
+ add $0x10, %rdi
+ sub $0x10, %rsi
+ cmp %rcx, %rdi
+ jb .Ldec_key_loop
+ xor %rax, %rax
+ ret
+
+/*
+ * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_enc)
+ movl 480(KEYP), KLEN # key length
+ movups (INP), STATE # input
+ call _aesni_enc1
+ movups STATE, (OUTP) # output
+ ret
+
+/*
+ * _aesni_enc1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: round count
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_enc1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .Lenc128
+ lea 0x20(TKEYP), TKEYP
+ je .Lenc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x50(TKEYP), KEY
+ AESENC KEY STATE
+.align 4
+.Lenc192:
+ movaps -0x40(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x30(TKEYP), KEY
+ AESENC KEY STATE
+.align 4
+.Lenc128:
+ movaps -0x20(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x10(TKEYP), KEY
+ AESENC KEY STATE
+ movaps (TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x10(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x20(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x30(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x40(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x50(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x60(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x70(TKEYP), KEY
+ AESENCLAST KEY STATE
+ ret
+
+/*
+ * _aesni_enc4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: round count
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_enc4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .L4enc128
+ lea 0x20(TKEYP), TKEYP
+ je .L4enc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x50(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+#.align 4
+.L4enc192:
+ movaps -0x40(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x30(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+#.align 4
+.L4enc128:
+ movaps -0x20(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x10(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps (TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x10(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x20(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x30(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x40(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x50(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x60(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x70(TKEYP), KEY
+ AESENCLAST KEY STATE1 # last round
+ AESENCLAST KEY STATE2
+ AESENCLAST KEY STATE3
+ AESENCLAST KEY STATE4
+ ret
+
+/*
+ * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_dec)
+ mov 480(KEYP), KLEN # key length
+ add $240, KEYP
+ movups (INP), STATE # input
+ call _aesni_dec1
+ movups STATE, (OUTP) #output
+ ret
+
+/*
+ * _aesni_dec1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: key length
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_dec1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .Ldec128
+ lea 0x20(TKEYP), TKEYP
+ je .Ldec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x50(TKEYP), KEY
+ AESDEC KEY STATE
+.align 4
+.Ldec192:
+ movaps -0x40(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x30(TKEYP), KEY
+ AESDEC KEY STATE
+.align 4
+.Ldec128:
+ movaps -0x20(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x10(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps (TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x10(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x20(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x30(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x40(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x50(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x60(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x70(TKEYP), KEY
+ AESDECLAST KEY STATE
+ ret
+
+/*
+ * _aesni_dec4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: key length
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+_aesni_dec4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .L4dec128
+ lea 0x20(TKEYP), TKEYP
+ je .L4dec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x50(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+.align 4
+.L4dec192:
+ movaps -0x40(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x30(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+.align 4
+.L4dec128:
+ movaps -0x20(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x10(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps (TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x10(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x20(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x30(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x40(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x50(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x60(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x70(TKEYP), KEY
+ AESDECLAST KEY STATE1 # last round
+ AESDECLAST KEY STATE2
+ AESDECLAST KEY STATE3
+ AESDECLAST KEY STATE4
+ ret
+
+/*
+ * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len)
+ */
+ENTRY(aesni_ecb_enc)
+ test LEN, LEN # check length
+ jz .Lecb_enc_ret
+ mov 480(KEYP), KLEN
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+ cmp $64, LEN
+ jb .Lecb_enc_loop1
+.align 4
+.Lecb_enc_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_enc4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_enc_loop4
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+.align 4
+.Lecb_enc_loop1:
+ movups (INP), STATE1
+ call _aesni_enc1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_enc_loop1
+.Lecb_enc_ret:
+ ret
+
+/*
+ * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len);
+ */
+ENTRY(aesni_ecb_dec)
+ test LEN, LEN
+ jz .Lecb_dec_ret
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+ cmp $64, LEN
+ jb .Lecb_dec_loop1
+.align 4
+.Lecb_dec_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_dec4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_dec_loop4
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+.align 4
+.Lecb_dec_loop1:
+ movups (INP), STATE1
+ call _aesni_dec1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_dec_loop1
+.Lecb_dec_ret:
+ ret
+
+/*
+ * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_enc)
+ cmp $16, LEN
+ jb .Lcbc_enc_ret
+ mov 480(KEYP), KLEN
+ movups (IVP), STATE # load iv as initial state
+.align 4
+.Lcbc_enc_loop:
+ movups (INP), IN # load input
+ pxor IN, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP) # store output
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_enc_loop
+ movups STATE, (IVP)
+.Lcbc_enc_ret:
+ ret
+
+/*
+ * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_dec)
+ cmp $16, LEN
+ jb .Lcbc_dec_just_ret
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ movups (IVP), IV
+ cmp $64, LEN
+ jb .Lcbc_dec_loop1
+.align 4
+.Lcbc_dec_loop4:
+ movups (INP), IN1
+ movaps IN1, STATE1
+ movups 0x10(INP), IN2
+ movaps IN2, STATE2
+ movups 0x20(INP), IN3
+ movaps IN3, STATE3
+ movups 0x30(INP), IN4
+ movaps IN4, STATE4
+ call _aesni_dec4
+ pxor IV, STATE1
+ pxor IN1, STATE2
+ pxor IN2, STATE3
+ pxor IN3, STATE4
+ movaps IN4, IV
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lcbc_dec_loop4
+ cmp $16, LEN
+ jb .Lcbc_dec_ret
+.align 4
+.Lcbc_dec_loop1:
+ movups (INP), IN
+ movaps IN, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+ movups STATE, (OUTP)
+ movaps IN, IV
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_dec_loop1
+.Lcbc_dec_ret:
+ movups IV, (IVP)
+.Lcbc_dec_just_ret:
+ ret
+
+.align 16
+.Lbswap_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * _aesni_inc_init: internal ABI
+ * setup registers used by _aesni_inc
+ * input:
+ * IV
+ * output:
+ * CTR: == IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ * INC: == 1, in little endian
+ * BSWAP_MASK == endian swapping mask
+ */
+_aesni_inc_init:
+ movaps .Lbswap_mask, BSWAP_MASK
+ movaps IV, CTR
+ PSHUFB_XMM BSWAP_MASK CTR
+ mov $1, TCTR_LOW
+ MOVQ_R64_XMM TCTR_LOW INC
+ MOVQ_R64_XMM CTR TCTR_LOW
+ ret
+
+/*
+ * _aesni_inc: internal ABI
+ * Increase IV by 1, IV is in big endian
+ * input:
+ * IV
+ * CTR: == IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ * INC: == 1, in little endian
+ * BSWAP_MASK == endian swapping mask
+ * output:
+ * IV: Increase by 1
+ * changed:
+ * CTR: == output IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ */
+_aesni_inc:
+ paddq INC, CTR
+ add $1, TCTR_LOW
+ jnc .Linc_low
+ pslldq $8, INC
+ paddq INC, CTR
+ psrldq $8, INC
+.Linc_low:
+ movaps CTR, IV
+ PSHUFB_XMM BSWAP_MASK IV
+ ret
+
+/*
+ * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_ctr_enc)
+ cmp $16, LEN
+ jb .Lctr_enc_just_ret
+ mov 480(KEYP), KLEN
+ movups (IVP), IV
+ call _aesni_inc_init
+ cmp $64, LEN
+ jb .Lctr_enc_loop1
+.align 4
+.Lctr_enc_loop4:
+ movaps IV, STATE1
+ call _aesni_inc
+ movups (INP), IN1
+ movaps IV, STATE2
+ call _aesni_inc
+ movups 0x10(INP), IN2
+ movaps IV, STATE3
+ call _aesni_inc
+ movups 0x20(INP), IN3
+ movaps IV, STATE4
+ call _aesni_inc
+ movups 0x30(INP), IN4
+ call _aesni_enc4
+ pxor IN1, STATE1
+ movups STATE1, (OUTP)
+ pxor IN2, STATE2
+ movups STATE2, 0x10(OUTP)
+ pxor IN3, STATE3
+ movups STATE3, 0x20(OUTP)
+ pxor IN4, STATE4
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lctr_enc_loop4
+ cmp $16, LEN
+ jb .Lctr_enc_ret
+.align 4
+.Lctr_enc_loop1:
+ movaps IV, STATE
+ call _aesni_inc
+ movups (INP), IN
+ call _aesni_enc1
+ pxor IN, STATE
+ movups STATE, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lctr_enc_loop1
+.Lctr_enc_ret:
+ movups IV, (IVP)
+.Lctr_enc_just_ret:
+ ret
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
deleted file mode 100644
index ff16756..0000000
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
- * Implement AES algorithm in Intel AES-NI instructions.
- *
- * The white paper of AES-NI instructions can be downloaded from:
- * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
- *
- * Copyright (C) 2008, Intel Corp.
- * Author: Huang Ying <[email protected]>
- * Vinodh Gopal <[email protected]>
- * Kahraman Akdemir
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-#include <asm/inst.h>
-
-.text
-
-#define STATE1 %xmm0
-#define STATE2 %xmm4
-#define STATE3 %xmm5
-#define STATE4 %xmm6
-#define STATE STATE1
-#define IN1 %xmm1
-#define IN2 %xmm7
-#define IN3 %xmm8
-#define IN4 %xmm9
-#define IN IN1
-#define KEY %xmm2
-#define IV %xmm3
-#define BSWAP_MASK %xmm10
-#define CTR %xmm11
-#define INC %xmm12
-
-#define KEYP %rdi
-#define OUTP %rsi
-#define INP %rdx
-#define LEN %rcx
-#define IVP %r8
-#define KLEN %r9d
-#define T1 %r10
-#define TKEYP T1
-#define T2 %r11
-#define TCTR_LOW T2
-
-_key_expansion_128:
-_key_expansion_256a:
- pshufd $0b11111111, %xmm1, %xmm1
- shufps $0b00010000, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- shufps $0b10001100, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- pxor %xmm1, %xmm0
- movaps %xmm0, (%rcx)
- add $0x10, %rcx
- ret
-
-_key_expansion_192a:
- pshufd $0b01010101, %xmm1, %xmm1
- shufps $0b00010000, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- shufps $0b10001100, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- pxor %xmm1, %xmm0
-
- movaps %xmm2, %xmm5
- movaps %xmm2, %xmm6
- pslldq $4, %xmm5
- pshufd $0b11111111, %xmm0, %xmm3
- pxor %xmm3, %xmm2
- pxor %xmm5, %xmm2
-
- movaps %xmm0, %xmm1
- shufps $0b01000100, %xmm0, %xmm6
- movaps %xmm6, (%rcx)
- shufps $0b01001110, %xmm2, %xmm1
- movaps %xmm1, 16(%rcx)
- add $0x20, %rcx
- ret
-
-_key_expansion_192b:
- pshufd $0b01010101, %xmm1, %xmm1
- shufps $0b00010000, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- shufps $0b10001100, %xmm0, %xmm4
- pxor %xmm4, %xmm0
- pxor %xmm1, %xmm0
-
- movaps %xmm2, %xmm5
- pslldq $4, %xmm5
- pshufd $0b11111111, %xmm0, %xmm3
- pxor %xmm3, %xmm2
- pxor %xmm5, %xmm2
-
- movaps %xmm0, (%rcx)
- add $0x10, %rcx
- ret
-
-_key_expansion_256b:
- pshufd $0b10101010, %xmm1, %xmm1
- shufps $0b00010000, %xmm2, %xmm4
- pxor %xmm4, %xmm2
- shufps $0b10001100, %xmm2, %xmm4
- pxor %xmm4, %xmm2
- pxor %xmm1, %xmm2
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
- ret
-
-/*
- * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- * unsigned int key_len)
- */
-ENTRY(aesni_set_key)
- movups (%rsi), %xmm0 # user key (first 16 bytes)
- movaps %xmm0, (%rdi)
- lea 0x10(%rdi), %rcx # key addr
- movl %edx, 480(%rdi)
- pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
- cmp $24, %dl
- jb .Lenc_key128
- je .Lenc_key192
- movups 0x10(%rsi), %xmm2 # other user key
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
- AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
- call _key_expansion_256a
- AESKEYGENASSIST 0x1 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
- call _key_expansion_256a
- AESKEYGENASSIST 0x2 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
- call _key_expansion_256a
- AESKEYGENASSIST 0x4 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
- call _key_expansion_256a
- AESKEYGENASSIST 0x8 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
- call _key_expansion_256a
- AESKEYGENASSIST 0x10 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
- call _key_expansion_256a
- AESKEYGENASSIST 0x20 %xmm0 %xmm1
- call _key_expansion_256b
- AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
- call _key_expansion_256a
- jmp .Ldec_key
-.Lenc_key192:
- movq 0x10(%rsi), %xmm2 # other user key
- AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
- call _key_expansion_192a
- AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
- call _key_expansion_192b
- AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
- call _key_expansion_192a
- AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
- call _key_expansion_192b
- AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
- call _key_expansion_192a
- AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
- call _key_expansion_192b
- AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
- call _key_expansion_192a
- AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
- call _key_expansion_192b
- jmp .Ldec_key
-.Lenc_key128:
- AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
- call _key_expansion_128
- AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
- call _key_expansion_128
- AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
- call _key_expansion_128
- AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
- call _key_expansion_128
- AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
- call _key_expansion_128
- AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
- call _key_expansion_128
- AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
- call _key_expansion_128
- AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
- call _key_expansion_128
- AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
- call _key_expansion_128
- AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
- call _key_expansion_128
-.Ldec_key:
- sub $0x10, %rcx
- movaps (%rdi), %xmm0
- movaps (%rcx), %xmm1
- movaps %xmm0, 240(%rcx)
- movaps %xmm1, 240(%rdi)
- add $0x10, %rdi
- lea 240-16(%rcx), %rsi
-.align 4
-.Ldec_key_loop:
- movaps (%rdi), %xmm0
- AESIMC %xmm0 %xmm1
- movaps %xmm1, (%rsi)
- add $0x10, %rdi
- sub $0x10, %rsi
- cmp %rcx, %rdi
- jb .Ldec_key_loop
- xor %rax, %rax
- ret
-
-/*
- * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
- */
-ENTRY(aesni_enc)
- movl 480(KEYP), KLEN # key length
- movups (INP), STATE # input
- call _aesni_enc1
- movups STATE, (OUTP) # output
- ret
-
-/*
- * _aesni_enc1: internal ABI
- * input:
- * KEYP: key struct pointer
- * KLEN: round count
- * STATE: initial state (input)
- * output:
- * STATE: finial state (output)
- * changed:
- * KEY
- * TKEYP (T1)
- */
-_aesni_enc1:
- movaps (KEYP), KEY # key
- mov KEYP, TKEYP
- pxor KEY, STATE # round 0
- add $0x30, TKEYP
- cmp $24, KLEN
- jb .Lenc128
- lea 0x20(TKEYP), TKEYP
- je .Lenc192
- add $0x20, TKEYP
- movaps -0x60(TKEYP), KEY
- AESENC KEY STATE
- movaps -0x50(TKEYP), KEY
- AESENC KEY STATE
-.align 4
-.Lenc192:
- movaps -0x40(TKEYP), KEY
- AESENC KEY STATE
- movaps -0x30(TKEYP), KEY
- AESENC KEY STATE
-.align 4
-.Lenc128:
- movaps -0x20(TKEYP), KEY
- AESENC KEY STATE
- movaps -0x10(TKEYP), KEY
- AESENC KEY STATE
- movaps (TKEYP), KEY
- AESENC KEY STATE
- movaps 0x10(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x20(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x30(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x40(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x50(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x60(TKEYP), KEY
- AESENC KEY STATE
- movaps 0x70(TKEYP), KEY
- AESENCLAST KEY STATE
- ret
-
-/*
- * _aesni_enc4: internal ABI
- * input:
- * KEYP: key struct pointer
- * KLEN: round count
- * STATE1: initial state (input)
- * STATE2
- * STATE3
- * STATE4
- * output:
- * STATE1: finial state (output)
- * STATE2
- * STATE3
- * STATE4
- * changed:
- * KEY
- * TKEYP (T1)
- */
-_aesni_enc4:
- movaps (KEYP), KEY # key
- mov KEYP, TKEYP
- pxor KEY, STATE1 # round 0
- pxor KEY, STATE2
- pxor KEY, STATE3
- pxor KEY, STATE4
- add $0x30, TKEYP
- cmp $24, KLEN
- jb .L4enc128
- lea 0x20(TKEYP), TKEYP
- je .L4enc192
- add $0x20, TKEYP
- movaps -0x60(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps -0x50(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
-#.align 4
-.L4enc192:
- movaps -0x40(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps -0x30(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
-#.align 4
-.L4enc128:
- movaps -0x20(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps -0x10(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps (TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x10(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x20(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x30(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x40(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x50(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x60(TKEYP), KEY
- AESENC KEY STATE1
- AESENC KEY STATE2
- AESENC KEY STATE3
- AESENC KEY STATE4
- movaps 0x70(TKEYP), KEY
- AESENCLAST KEY STATE1 # last round
- AESENCLAST KEY STATE2
- AESENCLAST KEY STATE3
- AESENCLAST KEY STATE4
- ret
-
-/*
- * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
- */
-ENTRY(aesni_dec)
- mov 480(KEYP), KLEN # key length
- add $240, KEYP
- movups (INP), STATE # input
- call _aesni_dec1
- movups STATE, (OUTP) #output
- ret
-
-/*
- * _aesni_dec1: internal ABI
- * input:
- * KEYP: key struct pointer
- * KLEN: key length
- * STATE: initial state (input)
- * output:
- * STATE: finial state (output)
- * changed:
- * KEY
- * TKEYP (T1)
- */
-_aesni_dec1:
- movaps (KEYP), KEY # key
- mov KEYP, TKEYP
- pxor KEY, STATE # round 0
- add $0x30, TKEYP
- cmp $24, KLEN
- jb .Ldec128
- lea 0x20(TKEYP), TKEYP
- je .Ldec192
- add $0x20, TKEYP
- movaps -0x60(TKEYP), KEY
- AESDEC KEY STATE
- movaps -0x50(TKEYP), KEY
- AESDEC KEY STATE
-.align 4
-.Ldec192:
- movaps -0x40(TKEYP), KEY
- AESDEC KEY STATE
- movaps -0x30(TKEYP), KEY
- AESDEC KEY STATE
-.align 4
-.Ldec128:
- movaps -0x20(TKEYP), KEY
- AESDEC KEY STATE
- movaps -0x10(TKEYP), KEY
- AESDEC KEY STATE
- movaps (TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x10(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x20(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x30(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x40(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x50(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x60(TKEYP), KEY
- AESDEC KEY STATE
- movaps 0x70(TKEYP), KEY
- AESDECLAST KEY STATE
- ret
-
-/*
- * _aesni_dec4: internal ABI
- * input:
- * KEYP: key struct pointer
- * KLEN: key length
- * STATE1: initial state (input)
- * STATE2
- * STATE3
- * STATE4
- * output:
- * STATE1: finial state (output)
- * STATE2
- * STATE3
- * STATE4
- * changed:
- * KEY
- * TKEYP (T1)
- */
-_aesni_dec4:
- movaps (KEYP), KEY # key
- mov KEYP, TKEYP
- pxor KEY, STATE1 # round 0
- pxor KEY, STATE2
- pxor KEY, STATE3
- pxor KEY, STATE4
- add $0x30, TKEYP
- cmp $24, KLEN
- jb .L4dec128
- lea 0x20(TKEYP), TKEYP
- je .L4dec192
- add $0x20, TKEYP
- movaps -0x60(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps -0x50(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
-.align 4
-.L4dec192:
- movaps -0x40(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps -0x30(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
-.align 4
-.L4dec128:
- movaps -0x20(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps -0x10(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps (TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x10(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x20(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x30(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x40(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x50(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x60(TKEYP), KEY
- AESDEC KEY STATE1
- AESDEC KEY STATE2
- AESDEC KEY STATE3
- AESDEC KEY STATE4
- movaps 0x70(TKEYP), KEY
- AESDECLAST KEY STATE1 # last round
- AESDECLAST KEY STATE2
- AESDECLAST KEY STATE3
- AESDECLAST KEY STATE4
- ret
-
-/*
- * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- * size_t len)
- */
-ENTRY(aesni_ecb_enc)
- test LEN, LEN # check length
- jz .Lecb_enc_ret
- mov 480(KEYP), KLEN
- cmp $16, LEN
- jb .Lecb_enc_ret
- cmp $64, LEN
- jb .Lecb_enc_loop1
-.align 4
-.Lecb_enc_loop4:
- movups (INP), STATE1
- movups 0x10(INP), STATE2
- movups 0x20(INP), STATE3
- movups 0x30(INP), STATE4
- call _aesni_enc4
- movups STATE1, (OUTP)
- movups STATE2, 0x10(OUTP)
- movups STATE3, 0x20(OUTP)
- movups STATE4, 0x30(OUTP)
- sub $64, LEN
- add $64, INP
- add $64, OUTP
- cmp $64, LEN
- jge .Lecb_enc_loop4
- cmp $16, LEN
- jb .Lecb_enc_ret
-.align 4
-.Lecb_enc_loop1:
- movups (INP), STATE1
- call _aesni_enc1
- movups STATE1, (OUTP)
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lecb_enc_loop1
-.Lecb_enc_ret:
- ret
-
-/*
- * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- * size_t len);
- */
-ENTRY(aesni_ecb_dec)
- test LEN, LEN
- jz .Lecb_dec_ret
- mov 480(KEYP), KLEN
- add $240, KEYP
- cmp $16, LEN
- jb .Lecb_dec_ret
- cmp $64, LEN
- jb .Lecb_dec_loop1
-.align 4
-.Lecb_dec_loop4:
- movups (INP), STATE1
- movups 0x10(INP), STATE2
- movups 0x20(INP), STATE3
- movups 0x30(INP), STATE4
- call _aesni_dec4
- movups STATE1, (OUTP)
- movups STATE2, 0x10(OUTP)
- movups STATE3, 0x20(OUTP)
- movups STATE4, 0x30(OUTP)
- sub $64, LEN
- add $64, INP
- add $64, OUTP
- cmp $64, LEN
- jge .Lecb_dec_loop4
- cmp $16, LEN
- jb .Lecb_dec_ret
-.align 4
-.Lecb_dec_loop1:
- movups (INP), STATE1
- call _aesni_dec1
- movups STATE1, (OUTP)
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lecb_dec_loop1
-.Lecb_dec_ret:
- ret
-
-/*
- * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- * size_t len, u8 *iv)
- */
-ENTRY(aesni_cbc_enc)
- cmp $16, LEN
- jb .Lcbc_enc_ret
- mov 480(KEYP), KLEN
- movups (IVP), STATE # load iv as initial state
-.align 4
-.Lcbc_enc_loop:
- movups (INP), IN # load input
- pxor IN, STATE
- call _aesni_enc1
- movups STATE, (OUTP) # store output
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lcbc_enc_loop
- movups STATE, (IVP)
-.Lcbc_enc_ret:
- ret
-
-/*
- * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- * size_t len, u8 *iv)
- */
-ENTRY(aesni_cbc_dec)
- cmp $16, LEN
- jb .Lcbc_dec_just_ret
- mov 480(KEYP), KLEN
- add $240, KEYP
- movups (IVP), IV
- cmp $64, LEN
- jb .Lcbc_dec_loop1
-.align 4
-.Lcbc_dec_loop4:
- movups (INP), IN1
- movaps IN1, STATE1
- movups 0x10(INP), IN2
- movaps IN2, STATE2
- movups 0x20(INP), IN3
- movaps IN3, STATE3
- movups 0x30(INP), IN4
- movaps IN4, STATE4
- call _aesni_dec4
- pxor IV, STATE1
- pxor IN1, STATE2
- pxor IN2, STATE3
- pxor IN3, STATE4
- movaps IN4, IV
- movups STATE1, (OUTP)
- movups STATE2, 0x10(OUTP)
- movups STATE3, 0x20(OUTP)
- movups STATE4, 0x30(OUTP)
- sub $64, LEN
- add $64, INP
- add $64, OUTP
- cmp $64, LEN
- jge .Lcbc_dec_loop4
- cmp $16, LEN
- jb .Lcbc_dec_ret
-.align 4
-.Lcbc_dec_loop1:
- movups (INP), IN
- movaps IN, STATE
- call _aesni_dec1
- pxor IV, STATE
- movups STATE, (OUTP)
- movaps IN, IV
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lcbc_dec_loop1
-.Lcbc_dec_ret:
- movups IV, (IVP)
-.Lcbc_dec_just_ret:
- ret
-
-.align 16
-.Lbswap_mask:
- .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-/*
- * _aesni_inc_init: internal ABI
- * setup registers used by _aesni_inc
- * input:
- * IV
- * output:
- * CTR: == IV, in little endian
- * TCTR_LOW: == lower qword of CTR
- * INC: == 1, in little endian
- * BSWAP_MASK == endian swapping mask
- */
-_aesni_inc_init:
- movaps .Lbswap_mask, BSWAP_MASK
- movaps IV, CTR
- PSHUFB_XMM BSWAP_MASK CTR
- mov $1, TCTR_LOW
- MOVQ_R64_XMM TCTR_LOW INC
- MOVQ_R64_XMM CTR TCTR_LOW
- ret
-
-/*
- * _aesni_inc: internal ABI
- * Increase IV by 1, IV is in big endian
- * input:
- * IV
- * CTR: == IV, in little endian
- * TCTR_LOW: == lower qword of CTR
- * INC: == 1, in little endian
- * BSWAP_MASK == endian swapping mask
- * output:
- * IV: Increase by 1
- * changed:
- * CTR: == output IV, in little endian
- * TCTR_LOW: == lower qword of CTR
- */
-_aesni_inc:
- paddq INC, CTR
- add $1, TCTR_LOW
- jnc .Linc_low
- pslldq $8, INC
- paddq INC, CTR
- psrldq $8, INC
-.Linc_low:
- movaps CTR, IV
- PSHUFB_XMM BSWAP_MASK IV
- ret
-
-/*
- * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- * size_t len, u8 *iv)
- */
-ENTRY(aesni_ctr_enc)
- cmp $16, LEN
- jb .Lctr_enc_just_ret
- mov 480(KEYP), KLEN
- movups (IVP), IV
- call _aesni_inc_init
- cmp $64, LEN
- jb .Lctr_enc_loop1
-.align 4
-.Lctr_enc_loop4:
- movaps IV, STATE1
- call _aesni_inc
- movups (INP), IN1
- movaps IV, STATE2
- call _aesni_inc
- movups 0x10(INP), IN2
- movaps IV, STATE3
- call _aesni_inc
- movups 0x20(INP), IN3
- movaps IV, STATE4
- call _aesni_inc
- movups 0x30(INP), IN4
- call _aesni_enc4
- pxor IN1, STATE1
- movups STATE1, (OUTP)
- pxor IN2, STATE2
- movups STATE2, 0x10(OUTP)
- pxor IN3, STATE3
- movups STATE3, 0x20(OUTP)
- pxor IN4, STATE4
- movups STATE4, 0x30(OUTP)
- sub $64, LEN
- add $64, INP
- add $64, OUTP
- cmp $64, LEN
- jge .Lctr_enc_loop4
- cmp $16, LEN
- jb .Lctr_enc_ret
-.align 4
-.Lctr_enc_loop1:
- movaps IV, STATE
- call _aesni_inc
- movups (INP), IN
- call _aesni_enc1
- pxor IN, STATE
- movups STATE, (OUTP)
- sub $16, LEN
- add $16, INP
- add $16, OUTP
- cmp $16, LEN
- jge .Lctr_enc_loop1
-.Lctr_enc_ret:
- movups IV, (IVP)
-.Lctr_enc_just_ret:
- ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..39f6238 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -59,8 +59,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+#endif

static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
{
@@ -324,6 +326,7 @@ static struct crypto_alg blk_cbc_alg = {
},
};

+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
struct blkcipher_walk *walk)
{
@@ -389,6 +392,7 @@ static struct crypto_alg blk_ctr_alg = {
},
},
};
+#endif

static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
unsigned int key_len)
@@ -536,6 +540,7 @@ static struct crypto_alg ablk_cbc_alg = {
},
};

+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
static int ablk_ctr_init(struct crypto_tfm *tfm)
{
struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +617,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
},
};
#endif
+#endif

#ifdef HAS_LRW
static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -746,18 +752,22 @@ static int __init aesni_init(void)
goto blk_ecb_err;
if ((err = crypto_register_alg(&blk_cbc_alg)))
goto blk_cbc_err;
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
if ((err = crypto_register_alg(&blk_ctr_alg)))
goto blk_ctr_err;
+#endif
if ((err = crypto_register_alg(&ablk_ecb_alg)))
goto ablk_ecb_err;
if ((err = crypto_register_alg(&ablk_cbc_alg)))
goto ablk_cbc_err;
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
if ((err = crypto_register_alg(&ablk_ctr_alg)))
goto ablk_ctr_err;
#ifdef HAS_CTR
if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
goto ablk_rfc3686_ctr_err;
#endif
+#endif
#ifdef HAS_LRW
if ((err = crypto_register_alg(&ablk_lrw_alg)))
goto ablk_lrw_err;
@@ -784,18 +794,22 @@ ablk_pcbc_err:
crypto_unregister_alg(&ablk_lrw_alg);
ablk_lrw_err:
#endif
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
ablk_rfc3686_ctr_err:
#endif
crypto_unregister_alg(&ablk_ctr_alg);
ablk_ctr_err:
+#endif
crypto_unregister_alg(&ablk_cbc_alg);
ablk_cbc_err:
crypto_unregister_alg(&ablk_ecb_alg);
ablk_ecb_err:
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
crypto_unregister_alg(&blk_ctr_alg);
blk_ctr_err:
+#endif
crypto_unregister_alg(&blk_cbc_alg);
blk_cbc_err:
crypto_unregister_alg(&blk_ecb_alg);
@@ -818,13 +832,17 @@ static void __exit aesni_exit(void)
#ifdef HAS_LRW
crypto_unregister_alg(&ablk_lrw_alg);
#endif
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
#endif
crypto_unregister_alg(&ablk_ctr_alg);
+#endif
crypto_unregister_alg(&ablk_cbc_alg);
crypto_unregister_alg(&ablk_ecb_alg);
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
crypto_unregister_alg(&blk_ctr_alg);
+#endif
crypto_unregister_alg(&blk_cbc_alg);
crypto_unregister_alg(&blk_ecb_alg);
crypto_unregister_alg(&__aesni_alg);
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e4bac29..7f917c6 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -537,7 +537,37 @@ config CRYPTO_AES_X86_64

See <http://csrc.nist.gov/encryption/aes/> for more information.

-config CRYPTO_AES_NI_INTEL
+config CRYPTO_AES_NI_INTEL_586
+ tristate "AES cipher algorithms (AES-NI)"
+ depends on (X86 || UML_X86) && !64BIT
+ select CRYPTO_AES_586
+ select CRYPTO_CRYPTD
+ select CRYPTO_ALGAPI
+ select CRYPTO_FPU
+ help
+ Use Intel AES-NI instructions for AES algorithm.
+
+ AES cipher algorithms (FIPS-197). AES uses the Rijndael
+ algorithm.
+
+ Rijndael appears to be consistently a very good performer in
+ both hardware and software across a wide range of computing
+ environments regardless of its use in feedback or non-feedback
+ modes. Its key setup time is excellent, and its key agility is
+ good. Rijndael's very low memory requirements make it very well
+ suited for restricted-space environments, in which it also
+ demonstrates excellent performance. Rijndael's operations are
+ among the easiest to defend against power and timing attacks.
+
+ The AES specifies three key sizes: 128, 192 and 256 bits
+
+ See <http://csrc.nist.gov/encryption/aes/> for more information.
+
+ In addition to AES cipher algorithm support, the
+ acceleration for some popular block cipher mode is supported
+ too, including ECB, CBC, CTR, LRW, PCBC, XTS.
+
+config CRYPTO_AES_NI_INTEL_X86_64
tristate "AES cipher algorithms (AES-NI)"
depends on (X86 || UML_X86) && 64BIT
select CRYPTO_AES_X86_64
--
1.5.6.5


2010-10-29 22:15:44

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] x86, crypto: ported aes-ni implementation to x86

Mathias Krause <[email protected]> wrote:
> The AES-NI instructions are also available in legacy mode so the x86
> architecture may profit from those, too.
>
> To illustrate the performance gain here's a short summary of the tcrypt
> speed test on a Core i5 M 520 running at 2.40GHz comparing both
> assembler implementations:
>
> aes-i586 aes-ni-i586 delta
> 256 bit, 8kB blocks, ECB: 46.81 MB/s 164.46 MB/s +251%
> 256 bit, 8kB blocks, CBC: 43.89 MB/s 62.18 MB/s +41%
> 384 bit, 8kB blocks, LRW: 42.24 MB/s 142.90 MB/s +238%
> 512 bit, 8kB blocks, XTS: 43.41 MB/s 148.67 MB/s +242%
>
> Signed-off-by: Mathias Krause <[email protected]>

Nice work :)

I have to say though that I'd love this even more if we could
avoid duplicating those assembly files somehow. Is this possible?

Oh and those CBC numbers look out of whack. I'd expect CBC to be
way faster as it's done directly by the hardware unlike the
other modes. What numbers do you get in 64-bit before/after
your patch?

If the hardware CBC is really so much slower then maybe we should
stop using it.

Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2010-10-29 22:51:50

by Mathias Krause

[permalink] [raw]
Subject: Re: [PATCH] x86, crypto: ported aes-ni implementation to x86

On 30.10.2010, 00:15 Herbert Xu wrote:
> Mathias Krause <[email protected]> wrote:
>> The AES-NI instructions are also available in legacy mode so the x86
>> architecture may profit from those, too.
>>
>> To illustrate the performance gain here's a short summary of the tcrypt
>> speed test on a Core i5 M 520 running at 2.40GHz comparing both
>> assembler implementations:
>>
>> aes-i586 aes-ni-i586 delta
>> 256 bit, 8kB blocks, ECB: 46.81 MB/s 164.46 MB/s +251%
>> 256 bit, 8kB blocks, CBC: 43.89 MB/s 62.18 MB/s +41%
>> 384 bit, 8kB blocks, LRW: 42.24 MB/s 142.90 MB/s +238%
>> 512 bit, 8kB blocks, XTS: 43.41 MB/s 148.67 MB/s +242%
>>
>> Signed-off-by: Mathias Krause <[email protected]>
>
> Nice work :)
>
> I have to say though that I'd love this even more if we could
> avoid duplicating those assembly files somehow. Is this possible?

I thought about that too but found it easier to split those files.
The different calling conventions of the two architectures and the
limited register set in 32-bit mode forced some not so nice changes on
the code to make it work with fewer registers, and in a combined file
those would all have to hide behind #ifdefs.
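
Just to sketch what a combined file would have to carry -- this is an
illustration of mine with made-up register assignments, not code from
the patch: on x86-64 the arguments already arrive in registers, while
the asmlinkage convention on i386 puts them all on the stack, and
there are far fewer registers to map them to:

#ifdef __x86_64__
#define KEYP	%rdi		/* struct crypto_aes_ctx *ctx */
#define OUTP	%rsi		/* u8 *dst */
#define INP	%rdx		/* const u8 *src */
#else
#define KEYP	%eax		/* filled from the stack below */
#define OUTP	%edx
#define INP	%ecx
#endif

ENTRY(aesni_enc)
#ifndef __x86_64__
	movl 4(%esp), KEYP	/* asmlinkage: arguments on the stack */
	movl 8(%esp), OUTP
	movl 12(%esp), INP
#endif
	...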

> Oh and those CBC numbers look out of whack. I'd expect CBC to be
> way faster as it's done directly by the hardware unlike the
> other modes.

Well, actually the 32-bit assembler implementation has specialized
code paths for ECB and CBC, too. But the latter had to be implemented
a little differently from the 64-bit version because I didn't have
enough xmm registers for a 1:1 port. So I reused some registers for
loading values from memory and used direct memory references to make
aesni_cbc_dec() work with the limited number of registers.
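
In other words, the four-block loop of aesni_cbc_dec() ends up looking
roughly like the sketch below (mine, for illustration only -- not the
actual code): the saved ciphertext copies don't fit into the eight xmm
registers any more, so the chaining values are re-read from the still
untouched source buffer:

	/* STATE1..STATE4 hold the four freshly decrypted blocks */
	pxor IV, STATE1		/* chain with the previous IV */
	movups (INP), IV	/* re-read ciphertext block 1 ... */
	pxor IV, STATE2		/* ... it chains into block 2 */
	movups 0x10(INP), IV
	pxor IV, STATE3
	movups 0x20(INP), IV
	pxor IV, STATE4
	movups 0x30(INP), IV	/* last ciphertext becomes the next IV */
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)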

I'll look into whether we can do better, but if not, leaving this one
out of the 32-bit version might be the best option. Doing so may even
make it easier to combine the two assembler files again.

Btw., because of the limited register set I wasn't able to port the
CTR mode version yet. It uses even more registers -- xmm and general
purpose. :(

> What numbers do you get in 64-bit before/after
> your patch?

Haven't built a 64-bit kernel yet but will try that tomorrow.

> If the hardware CBC is really so much slower then maybe we should
> stop using it.

This must be related to the changes I made to the code. My guess is
that the processor doesn't like the additional memory loads.

There's even more potential for optimization since I still have a
general purpose register left. ;)

Please regard this as a first version, posted to get feedback,
especially from Huang Ying. But it's already quite fast. :)


Regards,
Mathias

>
> Thanks,
> --
> Email: Herbert Xu <[email protected]>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2010-10-31 19:33:11

by Mathias Krause

[permalink] [raw]
Subject: Re: [PATCH] x86, crypto: ported aes-ni implementation to x86

On 30.10.2010, 00:15 Herbert Xu wrote:
> Mathias Krause <[email protected]> wrote:
>> To illustrate the performance gain here's a short summary of the tcrypt
>> speed test on a Core i5 M 520 running at 2.40GHz comparing both
>> assembler implementations:
>>
>> aes-i586 aes-ni-i586 delta
>> 256 bit, 8kB blocks, ECB: 46.81 MB/s 164.46 MB/s +251%
>> 256 bit, 8kB blocks, CBC: 43.89 MB/s 62.18 MB/s +41%
>> 384 bit, 8kB blocks, LRW: 42.24 MB/s 142.90 MB/s +238%
>> 512 bit, 8kB blocks, XTS: 43.41 MB/s 148.67 MB/s +242%
>>
>> Signed-off-by: Mathias Krause <[email protected]>
>
> Oh and those CBC numbers look out of whack. I'd expect CBC to be
> way faster as it's done directly by the hardware unlike the
> other modes. What numbers do you get in 64-bit before/after
> your patch?
>
> If the hardware CBC is really so much slower then maybe we should
> stop using it.

Today I built and measured a 64-bit version without my changes and got
results for the above tests of around 60 to 66 MB/s, which is
ridiculous! So I ran the test again and again and noticed that
_sometimes_ I got results of 150 to 160 MB/s for _some_ algorithms.
That's weird!

Testing the 32-bit version again (with my patch) I even got 151 MB/s
for CBC mode, although now other algorithms were down to 58 - 67 MB/s.
Strange. Looks like I was just lucky with my first measurement. :/

I don't know why the numbers vary that much. Maybe it's some power
management magic in the processor deactivating cores and the kernel
scheduling work to the wrong one. In any case, the system under test
was otherwise idle: I booted a minimal initramfs based system with no
services at all beyond the ability to load the tcrypt module.

Maybe Huang Ying can give us some insight into why the numbers vary
that much? My test case was 'modprobe tcrypt mode=200 sec=10' (for
later tests I reduced the sec parameter to 1 in favor of doing
multiple runs). If that's an inappropriate test for the Intel AES
instructions, maybe somebody can tell me how to do better? Maybe dd to
a cryptoloop device?

Regards,
Mathias

2010-11-03 12:47:29

by Mathias Krause

[permalink] [raw]
Subject: Re: [PATCH] x86, crypto: ported aes-ni implementation to x86

Hi,

I modified the patch so it no longer introduces a copy of the existing
assembler implementation but instead makes the existing one usable for
both 64 and 32 bit. Additionally, I added alignment constraints for
the internal functions, which resulted in a noticeable speed-up.
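
The "alignment constraints" are nothing fancy -- just aligned entry
points for the internal helper routines, along the lines of this
sketch (not a literal quote from the new patch):

	.align 4
_aesni_enc1:
	...

	.align 4
_aesni_enc4:
	...

The inner loop labels already get the same .align 4 treatment, so this
merely extends it to the helpers' entry points.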

I reran the tests on another machine, a Core i7 M620 at 2.67GHz. I
also took the "low-end" numbers for the AES-NI variants because I
didn't want to wait for the occasional high numbers to show up any
more ;) So here is the comparison of 5 consecutive tcrypt test runs
for some selected algorithms, in MiB/s:

x86-64 (old): 1. run 2. run 3. run 4. run 5. run mean
ECB, 256 bit, 8kB: 152.49 152.58 152.51 151.80 151.87 152.25
CBC, 256 bit, 8kB: 144.32 144.44 144.35 143.75 143.75 144.12
LRW, 320 bit, 8kB: 159.41 159.21 159.21 158.55 159.28 159.13
XTS, 512 bit, 8kB: 144.87 142.88 144.75 144.11 144.75 144.27

x86-64 (new): 1. run 2. run 3. run 4. run 5. run mean
ECB, 256 bit, 8kB: 184.07 184.07 183.50 183.50 184.07 183.84
CBC, 256 bit, 8kB: 170.25 170.24 169.71 169.71 170.25 170.03
LRW, 320 bit, 8kB: 169.91 169.91 169.39 169.37 169.91 169.69
XTS, 512 bit, 8kB: 172.39 172.35 171.82 171.82 172.35 172.14

i586: 1. run 2. run 3. run 4. run 5. run mean
ECB, 256 bit, 8kB: 125.98 126.03 126.03 125.64 126.03 125.94
CBC, 256 bit, 8kB: 118.18 118.19 117.84 117.84 118.19 118.04
LRW, 320 bit, 8kB: 128.37 128.35 127.97 127.98 128.35 128.20
XTS, 512 bit, 8kB: 118.52 118.50 118.14 118.14 118.49 118.35

x86 (AES-NI): 1. run 2. run 3. run 4. run 5. run mean
ECB, 256 bit, 8kB: 187.33 187.34 187.33 186.75 186.74 187.09
CBC, 256 bit, 8kB: 171.84 171.84 171.84 171.28 171.28 171.61
LRW, 320 bit, 8kB: 168.54 168.54 168.53 168.00 168.02 168.32
XTS, 512 bit, 8kB: 166.61 166.60 166.60 166.08 166.60 166.49

Comparing the mean values gives us:

x86-64: old new delta
ECB, 256 bit, 8kB: 152.25 183.84 +20.7%
CBC, 256 bit, 8kB: 144.12 170.03 +18.0%
LRW, 320 bit, 8kB: 159.13 169.69 +6.6%
XTS, 512 bit, 8kB: 144.27 172.14 +19.3%

x86: i586 aes-ni delta
ECB, 256 bit, 8kB: 125.94 187.09 +48.6%
CBC, 256 bit, 8kB: 118.04 171.61 +45.4%
LRW, 320 bit, 8kB: 128.20 168.32 +31.3%
XTS, 512 bit, 8kB: 118.35 166.49 +40.7%

The funny thing is that the 32-bit implementation is sometimes even
faster than the 64-bit one. Nevertheless, the minor optimization of
aligning the function entries gave the 64-bit version quite a big
performance gain (up to 20%).

I'll post the new version of the patch in a follow-up email.

Regards,
Mathias