Add support for Intel AES-NI instructions on the x86_64 platform.

Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
instructions that will be introduced in the next generation of Intel
processors, as of 2009. These instructions enable fast and secure data
encryption and decryption using the Advanced Encryption Standard (AES),
as defined by FIPS Publication 197. The architecture introduces six
instructions that offer full hardware support for AES. Four of them
support high-performance data encryption and decryption, and the other
two support the AES key expansion procedure.

The white paper can be downloaded from:

http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf

AES may be used in soft_irq context, but the MMX/SSE state cannot be
touched safely there. So in_interrupt() is checked; if in IRQ or
soft_irq context, the general x86_64 implementation is used instead.
Signed-off-by: Huang Ying <[email protected]>
---
arch/x86/crypto/aes_glue.c | 10 -
arch/x86/include/asm/aes.h | 9 +
arch/x86/include/asm/cpufeature.h | 1
drivers/crypto/Kconfig | 11 +
drivers/crypto/Makefile | 3
drivers/crypto/intel-aes_asm.S | 341 ++++++++++++++++++++++++++++++++++++++
drivers/crypto/intel-aes_glue.c | 132 ++++++++++++++
7 files changed, 503 insertions(+), 4 deletions(-)
--- /dev/null
+++ b/drivers/crypto/intel-aes_glue.c
@@ -0,0 +1,132 @@
+/*
+ * Support for Intel AES-NI instructions. This file contains glue
+ * code, the real AES implementation is in intel-aes_asm.S.
+ *
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/aes.h>
+#include <linux/hardirq.h>
+#include <asm/i387.h>
+#include <asm/aes.h>
+
+#define INTEL_AES_ALIGN 16
+
+struct aes_ctx {
+ u32 key_enc[60];
+ u32 key_dec[60];
+ u32 key_length;
+};
+
+asmlinkage int intel_aes_set_key(struct aes_ctx *ctx, const u8 *in_key,
+ unsigned int key_len);
+asmlinkage void intel_aes_enc(struct aes_ctx *ctx, u8 *out, const u8 *in);
+asmlinkage void intel_aes_dec(struct aes_ctx *ctx, u8 *out, const u8 *in);
+
+/* Return the tfm context aligned to INTEL_AES_ALIGN; the alignment
+ * guaranteed by the crypto layer may be smaller. */
+static inline struct aes_ctx *aes_ctx(struct crypto_tfm *tfm)
+{
+ unsigned long addr = (unsigned long)crypto_tfm_ctx(tfm);
+ unsigned long align = INTEL_AES_ALIGN;
+
+ if (align <= crypto_tfm_ctx_alignment())
+ align = 1;
+ return (struct aes_ctx *)ALIGN(addr, align);
+}
+
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aes_ctx *ctx = aes_ctx(tfm);
+ u32 *tfm_flags = &tfm->crt_flags;
+ int ret;
+
+ if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
+     key_len != AES_KEYSIZE_256) {
+ *tfm_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ if (in_interrupt())
+ ret = crypto_aes_set_key(tfm, in_key, key_len);
+ else {
+ kernel_fpu_begin();
+ ret = intel_aes_set_key(ctx, in_key, key_len);
+ kernel_fpu_end();
+ }
+
+ return ret;
+}
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct aes_ctx *ctx = aes_ctx(tfm);
+
+ if (in_interrupt())
+ crypto_aes_encrypt_x86(tfm, dst, src);
+ else {
+ kernel_fpu_begin();
+ intel_aes_enc(ctx, dst, src);
+ kernel_fpu_end();
+ }
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct aes_ctx *ctx = aes_ctx(tfm);
+
+ if (in_interrupt())
+ crypto_aes_decrypt_x86(tfm, dst, src);
+ else {
+ kernel_fpu_begin();
+ intel_aes_dec(ctx, dst, src);
+ kernel_fpu_end();
+ }
+}
+
+static struct crypto_alg intel_aes_alg = {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-intel",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct aes_ctx),
+ .cra_alignmask = INTEL_AES_ALIGN - 1,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(intel_aes_alg.cra_list),
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = aes_set_key,
+ .cia_encrypt = aes_encrypt,
+ .cia_decrypt = aes_decrypt
+ }
+ }
+};
+
+static int __init intel_aes_init(void)
+{
+ if (!cpu_has_aes) {
+ printk(KERN_ERR "Intel AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+ return crypto_register_alg(&intel_aes_alg);
+}
+
+static void __exit intel_aes_fini(void)
+{
+ crypto_unregister_alg(&intel_aes_alg);
+}
+
+module_init(intel_aes_init);
+module_exit(intel_aes_fini);
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("aes");
+MODULE_ALIAS("aes-intel");
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -200,4 +200,15 @@ config CRYPTO_DEV_IXP4XX
help
Driver for the IXP4xx NPE crypto engine.
+config CRYPTO_DEV_INTEL_AES
+ tristate "Support for Intel AES-NI instructions"
+ depends on X86_64 && !UML
+ select CRYPTO_AES_X86_64
+ select CRYPTO_ALGAPI
+ help
+ Use Intel AES-NI instructions for the AES algorithm.
+
+ The instructions are used only when the CPU supports them.
+ Otherwise software encryption is used.
+
endif # CRYPTO_HW
--- a/drivers/crypto/Makefile
+++ b/drivers/crypto/Makefile
@@ -4,3 +4,6 @@ obj-$(CONFIG_CRYPTO_DEV_GEODE) += geode-
obj-$(CONFIG_CRYPTO_DEV_HIFN_795X) += hifn_795x.o
obj-$(CONFIG_CRYPTO_DEV_TALITOS) += talitos.o
obj-$(CONFIG_CRYPTO_DEV_IXP4XX) += ixp4xx_crypto.o
+obj-$(CONFIG_CRYPTO_DEV_INTEL_AES) += intel-aes.o
+
+intel-aes-y := intel-aes_asm.o intel-aes_glue.o
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -210,6 +210,7 @@ extern const char * const x86_power_flag
#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
+#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES)
#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
--- /dev/null
+++ b/drivers/crypto/intel-aes_asm.S
@@ -0,0 +1,341 @@
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+.text
+
+#include <linux/linkage.h>
+
+# Expand one AES-128 round key from the aeskeygenassist result in
+# %xmm1 and the previous round key in %xmm0.
+key_expansion_128:
+ movaps %xmm1, %xmm4
+ psrldq $12, %xmm1
+ pxor %xmm0, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm0, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm0, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm1, %xmm0
+
+ movaps %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+
+# Expand the AES-192 key schedule by one aeskeygenassist step; emits
+# one or two round keys depending on %r9.
+key_expansion_192:
+ pshufd $0b01010101, %xmm1, %xmm1
+ movaps %xmm1, %xmm4
+ pxor %xmm0, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm0, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm0, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm1, %xmm0
+
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm2, %xmm3
+ palignr $12, %xmm0, %xmm3
+ pxor %xmm2, %xmm3
+
+ test %r9, %r9
+ not %r9
+ jnz 1f
+
+ movaps %xmm0, %xmm1
+ pslldq $8, %xmm2
+ palignr $8, %xmm2, %xmm1
+ movaps %xmm1, (%rcx)
+ add $0x10, %rcx
+ movaps %xmm3, %xmm2
+ palignr $8, %xmm0, %xmm3
+ movaps %xmm3, (%rcx)
+ add $0x10, %rcx
+ ret
+1:
+ movaps %xmm0, (%rcx)
+ add $0x10, %rcx
+ movaps %xmm3, %xmm2
+ ret
+
+# Expand the AES-256 key schedule by one aeskeygenassist step; unless
+# %r9 is set, also derive the following odd round key.
+key_expansion_256:
+ movaps %xmm1, %xmm4
+ psrldq $12, %xmm1
+ pxor %xmm0, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm0, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm0, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm1, %xmm0
+
+ movaps %xmm0, (%rcx)
+ add $0x10, %rcx
+
+ test %r9, %r9
+ jnz 1f
+
+ # aeskeygenassist $0x1, %xmm0, %xmm1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+
+ pshufd $0b10101010, %xmm1, %xmm1
+ movaps %xmm1, %xmm4
+ pxor %xmm2, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm2, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm2, %xmm1
+ palignr $12, %xmm4, %xmm1
+ pxor %xmm1, %xmm2
+
+ movaps %xmm2, (%rcx)
+ add $0x10, %rcx
+1:
+ ret
+
+ENTRY(intel_aes_set_key)
+ movups (%rsi), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (%rdi)
+ lea 0x10(%rdi), %rcx # key addr
+ mov %edx, 480(%rdi) # key len
+ cmp $24, %dl
+ jb 2f
+ je 1f
+ movups 0x10(%rsi), %xmm2 # other user key
+ movaps %xmm2, (%rcx)
+ lea 0x10(%rcx), %rcx
+ xor %r9, %r9
+ # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+ call key_expansion_256
+ # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+ call key_expansion_256
+ # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+ call key_expansion_256
+ # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+ call key_expansion_256
+ # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+ call key_expansion_256
+ # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+ call key_expansion_256
+ # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+ not %r9
+ call key_expansion_256
+ lea 224(%rdi), %rdx
+ jmp 3f
+1:
+ mov $192, %edx
+ movq 0x10(%rsi), %xmm2 # other user key
+ xor %r9, %r9
+ # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
+ call key_expansion_192
+ # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
+ call key_expansion_192
+ # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
+ call key_expansion_192
+ # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
+ call key_expansion_192
+ # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
+ call key_expansion_192
+ # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
+ call key_expansion_192
+ # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
+ call key_expansion_192
+ # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
+ call key_expansion_192
+ lea 192(%rdi), %rdx
+ jmp 3f
+2:
+ # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
+ call key_expansion_128
+ # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
+ call key_expansion_128
+ # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
+ call key_expansion_128
+ # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
+ call key_expansion_128
+ # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
+ call key_expansion_128
+ # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
+ call key_expansion_128
+ # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
+ call key_expansion_128
+ # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
+ call key_expansion_128
+ # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
+ call key_expansion_128
+ # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
+ .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
+ call key_expansion_128
+ lea 160(%rdi), %rdx
+3:
+ movaps (%rdi), %xmm0
+ movaps (%rdx), %xmm1
+ movaps %xmm0, 240(%rdx)
+ movaps %xmm1, 240(%rdi)
+ lea 0x10(%rdi), %rdi
+ lea 224(%rdx), %rsi
+4:
+ movaps (%rdi), %xmm0
+ # aesimc %xmm0, %xmm1
+ .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
+ movaps %xmm1, (%rsi)
+ lea 0x10(%rdi), %rdi
+ lea -0x10(%rsi), %rsi
+ cmp %rdx, %rdi
+ jb 4b
+ xor %rax, %rax
+ ret
+END(intel_aes_set_key)
+
+ENTRY(intel_aes_enc)
+ movups (%rdx), %xmm0 # input
+ mov 480(%rdi), %ecx # key length
+ movaps (%rdi), %xmm1 # key
+ pxor %xmm1, %xmm0 # round 0
+ lea 0x30(%rdi), %rdi
+ cmp $24, %cl
+ jb 2f
+ lea 0x20(%rdi), %rdi
+ je 1f
+ lea 0x20(%rdi), %rdi
+ movaps -0x60(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps -0x50(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+1:
+ movaps -0x40(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps -0x30(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+2:
+ movaps -0x20(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps -0x10(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps (%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps 0x10(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps 0x20(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps 0x30(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps 0x40(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps 0x50(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps 0x60(%rdi), %xmm1
+ # aesenc %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xdc, 0xc1
+ movaps 0x70(%rdi), %xmm1
+ # aesenclast %xmm1, %xmm0 # last round
+ .byte 0x66, 0x0f, 0x38, 0xdd, 0xc1
+ movups %xmm0, (%rsi) # output
+ ret
+END(intel_aes_enc)
+
+ENTRY(intel_aes_dec)
+ movups (%rdx), %xmm0 # input
+ mov 480(%rdi), %ecx # key length
+ lea 240(%rdi), %rdi
+ movaps (%rdi), %xmm1 # key
+ pxor %xmm1, %xmm0 # round 0
+ lea 0x30(%rdi), %rdi
+ cmp $24, %cl
+ jb 2f
+ lea 0x20(%rdi), %rdi
+ je 1f
+ lea 0x20(%rdi), %rdi
+ movaps -0x60(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps -0x50(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+1:
+ movaps -0x40(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps -0x30(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+2:
+ movaps -0x20(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps -0x10(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps (%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps 0x10(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps 0x20(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps 0x30(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps 0x40(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps 0x50(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps 0x60(%rdi), %xmm1
+ # aesdec %xmm1, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0xde, 0xc1
+ movaps 0x70(%rdi), %xmm1
+ # aesdeclast %xmm1, %xmm0 # last round
+ .byte 0x66, 0x0f, 0x38, 0xdf, 0xc1
+ movups %xmm0, (%rsi) # output
+ ret
+END(intel_aes_dec)
--- /dev/null
+++ b/arch/x86/include/asm/aes.h
@@ -0,0 +1,9 @@
+#ifndef ASM_X86_AES_H
+#define ASM_X86_AES_H
+
+#include <linux/crypto.h>
+
+void crypto_aes_encrypt_x86(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+void crypto_aes_decrypt_x86(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+
+#endif
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -8,15 +8,17 @@
asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+void crypto_aes_encrypt_x86(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
aes_enc_blk(tfm, dst, src);
}
+EXPORT_SYMBOL_GPL(crypto_aes_encrypt_x86);
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+void crypto_aes_decrypt_x86(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
aes_dec_blk(tfm, dst, src);
}
+EXPORT_SYMBOL_GPL(crypto_aes_decrypt_x86);
static struct crypto_alg aes_alg = {
.cra_name = "aes",
@@ -32,8 +34,8 @@ static struct crypto_alg aes_alg = {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = crypto_aes_set_key,
- .cia_encrypt = aes_encrypt,
- .cia_decrypt = aes_decrypt
+ .cia_encrypt = crypto_aes_encrypt_x86,
+ .cia_decrypt = crypto_aes_decrypt_x86
}
}
};
* Huang Ying | 2008-12-12 12:08:46 [+0800]:
>Add support for Intel AES-NI instructions on the x86_64 platform.
>
>Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
>instructions that will be introduced in the next generation of Intel
>processors, as of 2009. These instructions enable fast and secure data
>encryption and decryption using the Advanced Encryption Standard (AES),
>as defined by FIPS Publication 197. The architecture introduces six
>instructions that offer full hardware support for AES. Four of them
>support high-performance data encryption and decryption, and the other
>two support the AES key expansion procedure.
>
>The white paper can be downloaded from:
>
>http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
>
>AES may be used in soft_irq context, but the MMX/SSE state cannot be
>touched safely there. So in_interrupt() is checked; if in IRQ or
>soft_irq context, the general x86_64 implementation is used instead.
Nice work. A few things:
- Did you rename the "old" x86 functions to avoid a clash?
  So you bypass the crypto layer in case you can't handle the operation.
  Does this improve the performance or just save keystrokes? Not sure
  what the best solution could be....
- Unless I've read the code too fast, it does not work if someone sets
  the key in user context and starts an encryption in softirq context.
- aes_ctx is somewhat bad. You are using this for both a function and
  a struct. An Intel prefix would be nice (in the case of the struct).
  On second thought, any reason why you can't use crypto_aes_ctx?
- Is this an Intel thing, or is it going to be part of x86 and also
  available to others (like MMX)? In that case the Intel prefix may be
  "wrong".
- Does the CPU support more than just pure AES, e.g. block modes? In
  case it does not, does the performance improve if you implement, let's
  say, cbc(aes) and do the XOR with SSE in order to save a few
  kernel_fpu_begin() calls? I'm just asking because I saw a similar
  thing on PowerPC and the AltiVec unit. Maybe it is cheap on x86 :)
- I can't see why the intel-aes alias is required.
>Signed-off-by: Huang Ying <[email protected]>
Sebastian
On Sat, 2008-12-13 at 03:57 +0800, Sebastian Andrzej Siewior wrote:
> * Huang Ying | 2008-12-12 12:08:46 [+0800]:
>
> >Add support for Intel AES-NI instructions on the x86_64 platform.
> >
> >Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
> >instructions that will be introduced in the next generation of Intel
> >processors, as of 2009. These instructions enable fast and secure data
> >encryption and decryption using the Advanced Encryption Standard (AES),
> >as defined by FIPS Publication 197. The architecture introduces six
> >instructions that offer full hardware support for AES. Four of them
> >support high-performance data encryption and decryption, and the other
> >two support the AES key expansion procedure.
> >
> >The white paper can be downloaded from:
> >
> >http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
> >
> >AES may be used in soft_irq context, but the MMX/SSE state cannot be
> >touched safely there. So in_interrupt() is checked; if in IRQ or
> >soft_irq context, the general x86_64 implementation is used instead.
>
> Nice work. A few things:
> - Did you rename the "old" x86 functions to avoid a clash?
> So you bypass the crypto layer in case you can't handle the operation.
> Does this improve the performance or just save keystrokes? Not sure
> what the best solution could be....
The general x86 implementation is used as the fallback for the new
AES-NI based implementation, because AES-NI cannot be used in kernel
soft_irq context. If the crypto layer is used to access the general x86
implementation, we will have a tfm_ctx alignment issue, because AES-NI
needs tfm_ctx to be 16-byte aligned.
> - unless I've read the code too fast, it does not work if someone sets the
> key in user context and starts an encryption in softirq context.
Oh, I should use struct crypto_aes_ctx instead of defining struct
aes_ctx; the tfm_ctx definitions are different. I will fix this. Apart
from that, is there any other issue?
> - aes_ctx is somewhat bad. You are using this for both a function and
> a struct. An Intel prefix would be nice (in the case of the struct).
> On second thought, any reason why you can't use crypto_aes_ctx?
Yes, I will use that. The only issue is that AES-NI needs the scheduled
key to be 16-byte aligned, so we need to move key_length in struct
crypto_aes_ctx to the end of the struct.
> - Is this an Intel thing, or is it going to be part of x86 and also
> available to others (like MMX)? In that case the Intel prefix may be
> "wrong".
Right now it is an Intel thing, but in the future it may become part of
x86. Intel named it AES-NI (AES New Instructions), but a name like
aes_ni_aes_set_key is not good either. Does aes_ni_set_key sound better?
> - Does the CPU support more than just pure AES, e.g. block modes? In
> case it does not, does the performance improve if you implement, let's
> say, cbc(aes) and do the XOR with SSE in order to save a few
> kernel_fpu_begin() calls? I'm just asking because I saw a similar
> thing on PowerPC and the AltiVec unit. Maybe it is cheap on x86 :)
Yes, AES-NI can benefit block modes as well. And the pipelined
implementation of AES-NI can benefit cbc(aes) decryption and ctr(aes)
even more (described in detail in the white paper). We will work on that.
> - I can't see why the intel-aes alias is required.
Yes. I will remove it.
Best Regards,
Huang Ying
On Mon, Dec 15, 2008 at 10:19:02AM +0800, Huang Ying wrote:
>
> The general x86 implementation is used as the fallback for the new
> AES-NI based implementation, because AES-NI cannot be used in kernel
> soft_irq context. If the crypto layer is used to access the general x86
> implementation,
Why is that? The VIA PadLock also "touches" the SSE state but we still
use it on softirq paths.
In fact Suresh told me earlier that your AES instructions weren't
going to have the SSE problems that VIA had; is this not the case?
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Mon, 2008-12-15 at 11:38 +0800, Herbert Xu wrote:
> On Mon, Dec 15, 2008 at 10:19:02AM +0800, Huang Ying wrote:
> >
> > The general x86 implementation is used as the fallback for the new
> > AES-NI based implementation, because AES-NI cannot be used in kernel
> > soft_irq context. If the crypto layer is used to access the general
> > x86 implementation,
>
> Why is that? The VIA PadLock also "touches" the SSE state but we still
> use it on softirq paths.
>
> In fact Suresh told me earlier that your AES instruction wasn't
> going to have the SSE problems that VIA had, is this not the case?
The PadLock instructions don't use or touch SSE registers, but they
might cause a DNA fault when CR0.TS is set, so it is sufficient just to
clear CR0.TS before executing them.

The AES-NI instructions do use SSE registers. Consider the following
situation:

1. In the kernel, a code path using SSE registers is executed; the user
space SSE state is saved if necessary.

2. An interrupt/soft_irq comes, and encryption/decryption with AES-NI is
executed. The SSE state of code path 1 is destroyed.

To solve the above issue, the following methods can be used:

a. Do not touch SSE state in soft_irq
b. Disable/restore soft_irq in kernel_fpu_begin/kernel_fpu_end
c. Use a per-CPU data structure to save kernel FPU state during
soft_irq.

Method a is used in this patch.
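For comparison, method b would be just a thin wrapper around the
existing FPU API. A minimal sketch (the _bh names below are made up;
local_bh_disable()/local_bh_enable() and
kernel_fpu_begin()/kernel_fpu_end() are the real kernel interfaces):

#include <linux/interrupt.h>
#include <asm/i387.h>

/* Method b, sketched: keep softirqs from running (and from clobbering
 * the SSE registers) for as long as the kernel owns the FPU. */
static inline void kernel_fpu_begin_bh(void)
{
	local_bh_disable();	/* no softirq can interrupt the SSE usage */
	kernel_fpu_begin();	/* save user FPU state, clear CR0.TS */
}

static inline void kernel_fpu_end_bh(void)
{
	kernel_fpu_end();
	local_bh_enable();
}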
Best Regards,
Huang Ying
On Mon, Dec 15, 2008 at 01:14:59PM +0800, Huang Ying wrote:
>
> The PadLock instructions don't use or touch SSE registers, but they
> might cause a DNA fault when CR0.TS is set, so it is sufficient just to
> clear CR0.TS before executing them.
>
> The AES-NI instructions do use SSE registers. Consider the following
This really sucks as more than half of the kernel AES users are
in softirq context. Someone hit the guy who designed this with
a clue-bat please!
> To solve the above issue, the following methods can be used:
>
> a. Do not touch SSE state in soft_irq
> b. Disable/restore soft_irq in kernel_fpu_begin/kernel_fpu_end
> c. Use a per-CPU data structure to save kernel FPU state during
> soft_irq.
>
> Method a is used in this patch.
Could you run the tcrypt speed test on this and measure the
difference between the native AES vs. the fallback? Depending
on the difference I think we'd want to consider b) or c).
Of course the best solution would be to fix the hardware.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Mon, 2008-12-15 at 13:21 +0800, Herbert Xu wrote:
> On Mon, Dec 15, 2008 at 01:14:59PM +0800, Huang Ying wrote:
> >
> > The PadLock instructions don't use or touch SSE registers, but they
> > might cause a DNA fault when CR0.TS is set, so it is sufficient just
> > to clear CR0.TS before executing them.
> >
> > The AES-NI instructions do use SSE registers. Consider the following
>
> This really sucks as more than half of the kernel AES users are
> in softirq context. Someone hit the guy who designed this with
> a clue-bat please!
>
> > To solve the above issue, the following methods can be used:
> >
> > a. Do not touch SSE state in soft_irq
> > b. Disable/restore soft_irq in kernel_fpu_begin/kernel_fpu_end
> > c. Use a per-CPU data structure to save kernel FPU state during
> > soft_irq.
> >
> > Method a is used in this patch.
>
> Could you run the tcrypt speed test on this and measure the
> difference between the native AES vs. the fallback? Depending
> on the difference I think we'd want to consider b) or c).
I do not have an appropriate machine at hand; I will contact my
colleague for testing and post the results later.
Best Regards,
Huang Ying
* Huang Ying | 2008-12-15 10:19:02 [+0800]:
>> Nice work. A few things:
>> - Did you rename the "old" x86 functions to avoid a clash?
>> So you bypass the crypto layer in case you can't handle the operation.
>> Does this improve the performance or just save keystrokes? Not sure
>> what the best solution could be....
>
>The general x86 implementation is used as the fallback for the new
>AES-NI based implementation, because AES-NI cannot be used in kernel
>soft_irq context. If the crypto layer is used to access the general x86
>implementation, we will have a tfm_ctx alignment issue, because AES-NI
>needs tfm_ctx to be 16-byte aligned.
>
>> - unless I've read the code too fast, it does not work if someone sets the
>> key in user context and starts an encryption in softirq context.
>
>Oh, I should use struct crypto_aes_ctx instead of defining struct
>aes_ctx; the tfm_ctx definitions are different. I will fix this. Apart
>from that, is there any other issue?
Now I see what you have done. You are sharing the same tfm between your
AES version and the "old" asm version. Both assume different private
data (aes_ctx vs. crypto_aes_ctx), so this cannot work.
>> - aes_ctx is somewhat bad. You are using this for both a function and
>> a struct. An Intel prefix would be nice (in the case of the struct).
>> On second thought, any reason why you can't use crypto_aes_ctx?
>
>Yes, I will use that. The only issue is that AES-NI needs the scheduled
>key to be 16-byte aligned, so we need to move key_length in struct
>crypto_aes_ctx to the end of the struct.
You have to do it if you want to bypass the crypto layer and call the
asm functions directly, and I'm not sure whether bypassing the crypto
layer is a good thing. Both asm routines (the 32-bit and the 64-bit one)
assume that keylen is at +0, followed by the enc key and the dec key.
And they don't do the ALIGN thing.

Herbert, what do you think?
>> - Is this an Intel thing, or is it going to be part of x86 and also
>> available to others (like MMX)? In that case the Intel prefix may be
>> "wrong".
>
>Right now it is an Intel thing, but in the future it may become part of
>x86. Intel named it AES-NI (AES New Instructions), but a name like
>aes_ni_aes_set_key is not good either. Does aes_ni_set_key sound better?
The latter is fine. I just want to avoid having several different
aes_set_key() implementations around.
>> - Does the CPU support more than just pure AES, e.g. block modes? In
>> case it does not, does the performance improve if you implement, let's
>> say, cbc(aes) and do the XOR with SSE in order to save a few
>> kernel_fpu_begin() calls? I'm just asking because I saw a similar
>> thing on PowerPC and the AltiVec unit. Maybe it is cheap on x86 :)
>
>Yes, AES-NI can benefit block modes as well. And the pipelined
>implementation of AES-NI can benefit cbc(aes) decryption and ctr(aes)
>even more (described in detail in the white paper). We will work on that.
Ah, okay. So depending on how expensive kernel_fpu_begin() really is, it
might be slower for just the cipher (e.g. in xts(aes), where xts calls
you for every 16 bytes).
>
>Best Regards,
>Huang Ying
>
Sebastian
On Mon, Dec 15, 2008 at 10:07:45AM +0100, Sebastian Andrzej Siewior wrote:
>
> You have to do it if you want to bypass the crypto layer and call the
> asm functions directly, and I'm not sure whether bypassing the crypto
> layer is a good thing. Both asm routines (the 32-bit and the 64-bit
> one) assume that keylen is at +0, followed by the enc key and the dec
> key. And they don't do the ALIGN thing.
>
> Herbert, what do you think?
I think it would be OK if it called the assembly routine directly
and we moved this under asm/x86-64. We should do the latter anyway
regardless of what we decide.
However, I'm skeptical about whether we should use a fallback at all
rather than making this work in softirq context.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Mon, Dec 15, 2008 at 04:21:06PM +1100, Herbert Xu wrote:
>
> > a. Do not touch SSE state in soft_irq
> > b. Disable/restore soft_irq in kernel_fpu_begin/kernel_fpu_end
> > c. Use a per-CPU data structure to save kernel FPU state during
> > soft_irq.
> >
> > Method a is used in this patch.
>
> Could you run the tcrypt speed test on this and measure the
> difference between the native AES vs. the fallback? Depending
> on the difference I think we'd want to consider b) or c).
Here's another option
d. When we're in interrupt context, schedule a task to perform
the encryption asynchronously.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Sun, Dec 14, 2008 at 07:38:42PM -0800, Herbert Xu wrote:
> On Mon, Dec 15, 2008 at 10:19:02AM +0800, Huang Ying wrote:
> >
> > The general x86 implementation is used as the fall back for new AES-NI
> > based implementation. Because AES-NI can not be used in kernel soft_irq
> > context. If crypto layer is used to access general x86 implementation,
>
> Why is that? The VIA PadLock also "touches" the SSE state but we still
> use it on softirq paths.
>
> In fact Suresh told me earlier that your AES instruction wasn't
> going to have the SSE problems that VIA had, is this not the case?
As Huang mentioned, AES instructions touch SSE registers and thus have
different requirements. I agree that we have to do some performance
analysis to come up with an optimized model.
thanks,
suresh
On Mon, Dec 15, 2008 at 11:38:01PM +1100, Herbert Xu wrote:
> On Mon, Dec 15, 2008 at 04:21:06PM +1100, Herbert Xu wrote:
> >
> > > a. Do not touch SSE state in soft_irq
> > > b. Disable/restore soft_irq in kernel_fpu_begin/kernel_fpu_end
> > > c. Use a per-CPU data structure to save kernel FPU state during
> > > soft_irq.
> Here's another option
>
> d. When we're in interrupt context, schedule a task to perform
> the encryption asynchronously.
We can also hybridise b. and d.:
e. When we're in interrupt context, if TS is clear, then we defer
the operation to a thread. Otherwise, if user-space has touched
the FPU, we save the state; if not, we simply clear TS. In either
case we perform the operation immediately and then reset TS if
user-space didn't touch the FPU.
This is based on the fact that it should be fairly rare for us
to interrupt a kernel FPU/SSE operation. The common case would
be interrupting a user process or a kernel context which is not
engaging in any kernel FPU operations.
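A rough sketch of that decision, assuming read_cr0() and X86_CR0_TS
as the TS test (the helper name is made up):

#include <linux/hardirq.h>
#include <asm/processor-flags.h>
#include <asm/system.h>

/* Sketch of option e: in interrupt context, TS clear means someone's
 * FPU/SSE state is live, so defer to a thread; TS set means we can
 * take the FPU over immediately (saving user state if needed). */
static inline int fpu_usable_now(void)
{
	if (!in_interrupt())
		return 1;			/* process context is always fine */
	return read_cr0() & X86_CR0_TS;		/* TS set: safe to proceed */
}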
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Wed, 2008-12-17 at 07:31 +0800, Herbert Xu wrote:
> On Mon, Dec 15, 2008 at 11:38:01PM +1100, Herbert Xu wrote:
> > On Mon, Dec 15, 2008 at 04:21:06PM +1100, Herbert Xu wrote:
> > >
> > > > a. Do not touch SSE state in soft_irq
> > > > b. Disable/restore soft_irq in kernel_fpu_begin/kernel_fpu_end
> > > > c. Use a per-CPU data structure to save kernel FPU state during
> > > > soft_irq.
>
> > Here's another option
> >
> > d. When we're in interrupt context, schedule a task to perform
> > the encryption asynchronously.
>
> We can also hybridise b. and d.:
>
> e. When we're in interrupt context, if TS is clear, then we defer
> the operation to a thread. Otherwise, if user-space has touched
> the FPU, we save the state; if not, we simply clear TS. In either
> case we perform the operation immediately and then reset TS if
> user-space didn't touch the FPU.
>
> This is based on the fact that it should be fairly rare for us
> to interrupt a kernel FPU/SSE operation. The common case would
> be interrupting a user process or a kernel context which is not
> engaging in any kernel FPU operations.
Yes, this is a better solution with much better performance. How about
hybridising b. and a.:

f. If TS is clear, then use the x86_64 implementation. Otherwise, if
user-space has touched the FPU, we save the state; if not, we simply
clear TS.

I think that would be simpler to implement.
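In the glue code of this patch, aes_encrypt() would then look roughly
like this (the read_cr0() test stands in for a real "is someone's FPU
state live?" check; set_key and decrypt would be analogous):

static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
	struct aes_ctx *ctx = aes_ctx(tfm);

	/* Method f, sketched: fall back to the x86_64 implementation
	 * only when we are in interrupt context and someone's FPU/SSE
	 * state is live (CR0.TS clear), instead of on every interrupt. */
	if (in_interrupt() && !(read_cr0() & X86_CR0_TS)) {
		crypto_aes_encrypt_x86(tfm, dst, src);
		return;
	}

	kernel_fpu_begin();	/* saves user FPU state or just clears TS */
	intel_aes_enc(ctx, dst, src);
	kernel_fpu_end();
}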
Best Regards,
Huang Ying
Huang Ying <[email protected]> wrote:
>
> f. If TS is clear, then use the x86_64 implementation. Otherwise, if
> user-space has touched the FPU, we save the state; if not, we simply
> clear TS.
Well, I'd rather avoid ever using the x86_64 implementation, because
unless the chip guys have really screwed up we should be looking at a
difference of at least a factor of 10.
BTW I wasn't very clear in the original email. You'd only do the
asynchronous operation for CBC/ECB. For the simple AES case I
suppose we'll just have to stick to the x86_64 fallback. This'll
really suck for disk encryption but I guess you could always add
an LRW/XTS mode to your code.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Wed, 2008-12-17 at 09:26 +0800, Herbert Xu wrote:
> Huang Ying <[email protected]> wrote:
> >
> > f. If TS is clear, then use the x86_64 implementation. Otherwise, if
> > user-space has touched the FPU, we save the state; if not, we simply
> > clear TS.
>
> Well, I'd rather avoid ever using the x86_64 implementation, because
> unless the chip guys have really screwed up we should be looking at a
> difference of at least a factor of 10.
>
> BTW I wasn't very clear in the original email. You'd only do the
> asynchronous operation for CBC/ECB. For the simple AES case I
> suppose we'll just have to stick to the x86_64 fallback. This'll
> really suck for disk encryption but I guess you could always add
> an LRW/XTS mode to your code.
It seems that asynchronous operations are only provided at the
blkcipher level, not the cipher level. So the situation may be as
follows:

- Right now the AES core block algorithm is implemented with AES-NI as
CRYPTO_ALG_TYPE_CIPHER, which can benefit all modes (CBC, LRW, etc.).
But because there seems to be no asynchronous interface for
CRYPTO_ALG_TYPE_CIPHER, the AES core block algorithm cannot use a
thread to defer the real operations.

- To take full advantage of the AES-NI pipeline implementation, at
least "cbc(aes)", "ecb(aes)" and "ctr(aes)" should be implemented as
CRYPTO_ALG_TYPE_ABLKCIPHER, so that a thread can be used to defer the
real operation upon soft_irq.
Because the combination of kernel process-context FPU usage and
soft_irq AES usage is fairly rare, I think the above combination is
acceptable. That is:

- In the AES core block algorithm implementation with AES-NI, use the
x86_64 implementation for the combination above.

- In "cbc(aes)", "ecb(aes)" and "ctr(aes)", use thread deferring for
the combination above, roughly as sketched below.
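The deferring entry point could look like this (aes_ni_cbc_encrypt()
and aes_ni_defer() are made-up names; a real version would queue the
request to a per-CPU thread and complete it through the ablkcipher
callback):

/* Hypothetical helper: queue the request for a worker thread, which
 * later completes it via req->base.complete(); returns -EINPROGRESS. */
static int aes_ni_defer(struct ablkcipher_request *req);

static int aes_ni_cbc_encrypt(struct ablkcipher_request *req)
{
	/* soft_irq context: the FPU may hold someone else's state,
	 * so hand the whole request over to process context. */
	if (in_interrupt())
		return aes_ni_defer(req);

	kernel_fpu_begin();
	/* ... walk req->src/req->dst and run the AES-NI CBC loop ... */
	kernel_fpu_end();
	return 0;
}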
Best Regards,
Huang Ying
On Wed, Dec 17, 2008 at 11:33:39AM +0800, Huang Ying wrote:
>
> - Right now the AES core block algorithm is implemented with AES-NI as
> CRYPTO_ALG_TYPE_CIPHER, which can benefit all modes (CBC, LRW, etc.).
> But because there seems to be no asynchronous interface for
> CRYPTO_ALG_TYPE_CIPHER, the AES core block algorithm cannot use a
> thread to defer the real operations.
>
> - To take full advantage of the AES-NI pipeline implementation, at
> least "cbc(aes)", "ecb(aes)" and "ctr(aes)" should be implemented as
> CRYPTO_ALG_TYPE_ABLKCIPHER, so that a thread can be used to defer the
> real operation upon soft_irq.
>
> Because the combination of kernel process-context FPU usage and
> soft_irq AES usage is fairly rare, I think the above combination is
> acceptable. That is:
>
> - In the AES core block algorithm implementation with AES-NI, use the
> x86_64 implementation for the combination above.
>
> - In "cbc(aes)", "ecb(aes)" and "ctr(aes)", use thread deferring for
> the combination above.
Yes, that's pretty much what I'd like to see.

Ideally we should implement all the commonly used modes of operation so
that the simple AES cipher itself is never used except on small chunks
of data. However, for now I think doing just cbc should be sufficient,
since that's the most common use case.
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt