LinuxLists.cc - [PATCH resend 10/15] arm64: pull in <asm/simd.h> from asm-generic

2014-05-01 15:51:34

Subject: [PATCH resend 10/15] arm64: pull in <asm/simd.h> from asm-generic

Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/include/asm/Kbuild | 1 +
1 file changed, 1 insertion(+)

diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 83f71b3004a8..42c7eecd2bb6 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -40,6 +40,7 @@ generic-y += segment.h
generic-y += sembuf.h
generic-y += serial.h
generic-y += shmbuf.h
+generic-y += simd.h
generic-y += sizes.h
generic-y += socket.h
generic-y += sockios.h
--
1.8.3.2

2014-05-01 15:51:37

by Ard Biesheuvel

[permalink] [raw]

Subject: [PATCH resend 12/15] arm64/crypto: add shared macro to test for NEED_RESCHED

This adds the asm macro definition 'b_if_no_resched' that performs a conditional
branch depending on the preempt need_resched state.

Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/include/asm/assembler.h | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index fd3e3924041b..296105fd3021 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -22,6 +22,11 @@

#include <asm/ptrace.h>

+#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#endif
+
/*
* Stack pushing/popping (register pairs only). Equivalent to store decrement
* before, load increment after.
@@ -146,3 +151,19 @@ lr .req x30 // link register
#endif
orr \rd, \lbits, \hbits, lsl #32
.endm
+
+/*
+ * Branch to 'lb' but only if we have not been tagged for preemption.
+ *
+ * Expects current->thread_info in ti, or NULL if running in interrupt
+ * context. reg is a scratch x register.
+ */
+ .macro b_if_no_resched, ti, reg, lb
+#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+ cbz \ti, \lb /* have thread_info? */
+ ldr \reg, [\ti, #TI_FLAGS] /* get flags */
+ tbz \reg, #TIF_NEED_RESCHED, \lb /* need rescheduling? */
+#else
+ b \lb
+#endif
+ .endm
--
1.8.3.2

2014-05-01 15:51:37

by Ard Biesheuvel

[permalink] [raw]

Subject: [PATCH resend 11/15] arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions

This adds ARMv8 implementations of AES in ECB, CBC, CTR and XTS modes,
both for ARMv8 with Crypto Extensions and for plain ARMv8 NEON.

The Crypto Extensions version can only run on ARMv8 implementations that
have support for these optional extensions.

The plain NEON version is a table based yet time invariant implementation.
All S-box substitutions are performed in parallel, leveraging the wide range
of ARMv8's tbl/tbx instructions, and the huge NEON register file, which can
comfortably hold the entire S-box and still have room to spare for doing the
actual computations.

The key expansion routines were borrowed from aes_generic.

Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/Kconfig | 14 ++
arch/arm64/crypto/Makefile | 14 ++
arch/arm64/crypto/aes-ce.S | 147 +++++++++++
arch/arm64/crypto/aes-glue.c | 446 ++++++++++++++++++++++++++++++++++
arch/arm64/crypto/aes-modes.S | 548 ++++++++++++++++++++++++++++++++++++++++++
arch/arm64/crypto/aes-neon.S | 382 +++++++++++++++++++++++++++++
6 files changed, 1551 insertions(+)
create mode 100644 arch/arm64/crypto/aes-ce.S
create mode 100644 arch/arm64/crypto/aes-glue.c
create mode 100644 arch/arm64/crypto/aes-modes.S
create mode 100644 arch/arm64/crypto/aes-neon.S

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 8fffd5af65ef..5562652c5316 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -36,4 +36,18 @@ config CRYPTO_AES_ARM64_CE_CCM
select CRYPTO_AES
select CRYPTO_AEAD

+config CRYPTO_AES_ARM64_CE_BLK
+ tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
+ depends on ARM64 && KERNEL_MODE_NEON
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_AES
+ select CRYPTO_ABLK_HELPER
+
+config CRYPTO_AES_ARM64_NEON_BLK
+ tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
+ depends on ARM64 && KERNEL_MODE_NEON
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_AES
+ select CRYPTO_ABLK_HELPER
+
endif
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index 311287d68078..070ff7f6fc66 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -22,3 +22,17 @@ CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto

obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o
+aes-ce-blk-y := aes-glue-ce.o aes-ce.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
+aes-neon-blk-y := aes-glue-neon.o aes-neon.o
+
+AFLAGS_aes-ce.o := -DINTERLEAVE=8 # -DINTERLEAVE_INLINE
+AFLAGS_aes-neon.o := -DINTERLEAVE=4
+
+CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS
+
+$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
+ $(call if_changed_dep,cc_o_c)
diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S
new file mode 100644
index 000000000000..08a8416561e2
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce.S
@@ -0,0 +1,147 @@
+/*
+ * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
+ * Crypto Extensions
+ *
+ * Copyright (C) 2013 Linaro Ltd <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#define AES_ENTRY(func) ENTRY(ce_ ## func)
+#define AES_ENDPROC(func) ENDPROC(ce_ ## func)
+
+ .arch armv8-a+crypto
+
+ /* preload all round keys */
+ .macro load_round_keys, rounds, rk
+ cmp \rounds, #12
+ blo 2222f /* 128 bits */
+ beq 1111f /* 192 bits */
+ ld1 {v17.16b-v18.16b}, [\rk], #32
+1111: ld1 {v19.16b-v20.16b}, [\rk], #32
+2222: ld1 {v21.16b-v24.16b}, [\rk], #64
+ ld1 {v25.16b-v28.16b}, [\rk], #64
+ ld1 {v29.16b-v31.16b}, [\rk]
+ .endm
+
+ /* prepare for encryption with key in rk[] */
+ .macro enc_prepare, rounds, rk, ignore
+ load_round_keys \rounds, \rk
+ .endm
+
+ /* prepare for encryption (again) but with new key in rk[] */
+ .macro enc_switch_key, rounds, rk, ignore
+ load_round_keys \rounds, \rk
+ .endm
+
+ /* prepare for decryption with key in rk[] */
+ .macro dec_prepare, rounds, rk, ignore
+ load_round_keys \rounds, \rk
+ .endm
+
+ .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4, i5, i6, i7
+ aes\de \i0\().16b, \k\().16b
+ aes\mc \i0\().16b, \i0\().16b
+ .ifnb \i1
+ aes\de \i1\().16b, \k\().16b
+ aes\mc \i1\().16b, \i1\().16b
+ .ifnb \i3
+ aes\de \i2\().16b, \k\().16b
+ aes\mc \i2\().16b, \i2\().16b
+ aes\de \i3\().16b, \k\().16b
+ aes\mc \i3\().16b, \i3\().16b
+ .ifnb \i7
+ .irp reg, \i4\().16b, \i5\().16b, \i6\().16b, \i7\().16b
+ aes\de \reg, \k\().16b
+ aes\mc \reg, \reg
+ .endr
+ .endif
+ .endif
+ .endif
+ .endm
+
+ /* up to 8 interleaved encryption rounds with the same round key */
+ .macro round_Nx, enc, k, i0, i1, i2, i3, i4, i5, i6, i7
+ .ifc \enc, e
+ do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+ .else
+ do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+ .endif
+ .endm
+
+ /* up to 8 interleaved final rounds */
+ .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4, i5, i6, i7
+ aes\de \i0\().16b, \k\().16b
+ eor \i0\().16b, \i0\().16b, \k2\().16b
+ .ifnb \i1
+ aes\de \i1\().16b, \k\().16b
+ eor \i1\().16b, \i1\().16b, \k2\().16b
+ .ifnb \i3
+ aes\de \i2\().16b, \k\().16b
+ eor \i2\().16b, \i2\().16b, \k2\().16b
+ aes\de \i3\().16b, \k\().16b
+ eor \i3\().16b, \i3\().16b, \k2\().16b
+ .ifnb \i7
+ .irp reg, \i4\().16b,\i5\().16b,\i6\().16b,\i7\().16b
+ aes\de \reg, \k\().16b
+ eor \reg, \reg, \k2\().16b
+ .endr
+ .endif
+ .endif
+ .endif
+ .endm
+
+ /* up to 8 interleaved blocks */
+ .macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4, i5, i6, i7
+ cmp \rounds, #12
+ blo 2222f /* 128 bits */
+ beq 1111f /* 192 bits */
+ round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+ round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+ round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+ round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+ .endr
+ fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+ .endm
+
+ .macro encrypt_block, in, rounds, t0, t1, t2
+ do_block_Nx e, \rounds, \in
+ .endm
+
+ .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
+ do_block_Nx e, \rounds, \i0, \i1
+ .endm
+
+ .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
+ do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
+ .endm
+
+ .macro encrypt_block8x, i0, i1, i2, i3, i4, i5, i6, i7, \
+ rounds, t0, t1, t2
+ do_block_Nx e, \rounds, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+ .endm
+
+ .macro decrypt_block, in, rounds, t0, t1, t2
+ do_block_Nx d, \rounds, \in
+ .endm
+
+ .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
+ do_block_Nx d, \rounds, \i0, \i1
+ .endm
+
+ .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
+ do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
+ .endm
+
+ .macro decrypt_block8x, i0, i1, i2, i3, i4, i5, i6, i7, \
+ rounds, t0, t1, t2
+ do_block_Nx d, \rounds, \i0, \i1, \i2, \i3, \i4, \i5, \i6, \i7
+ .endm
+
+#include "aes-modes.S"
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
new file mode 100644
index 000000000000..60f2f4c12256
--- /dev/null
+++ b/arch/arm64/crypto/aes-glue.c
@@ -0,0 +1,446 @@
+/*
+ * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/hwcap.h>
+#include <crypto/aes.h>
+#include <crypto/ablk_helper.h>
+#include <crypto/algapi.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+
+#ifdef USE_V8_CRYPTO_EXTENSIONS
+#define MODE "ce"
+#define PRIO 300
+#define aes_ecb_encrypt ce_aes_ecb_encrypt
+#define aes_ecb_decrypt ce_aes_ecb_decrypt
+#define aes_cbc_encrypt ce_aes_cbc_encrypt
+#define aes_cbc_decrypt ce_aes_cbc_decrypt
+#define aes_ctr_encrypt ce_aes_ctr_encrypt
+#define aes_xts_encrypt ce_aes_xts_encrypt
+#define aes_xts_decrypt ce_aes_xts_decrypt
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
+#else
+#define MODE "neon"
+#define PRIO 200
+#define aes_ecb_encrypt neon_aes_ecb_encrypt
+#define aes_ecb_decrypt neon_aes_ecb_decrypt
+#define aes_cbc_encrypt neon_aes_cbc_encrypt
+#define aes_cbc_decrypt neon_aes_cbc_decrypt
+#define aes_ctr_encrypt neon_aes_ctr_encrypt
+#define aes_xts_encrypt neon_aes_xts_encrypt
+#define aes_xts_decrypt neon_aes_xts_decrypt
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
+MODULE_ALIAS("ecb(aes)");
+MODULE_ALIAS("cbc(aes)");
+MODULE_ALIAS("ctr(aes)");
+MODULE_ALIAS("xts(aes)");
+#endif
+
+MODULE_AUTHOR("Ard Biesheuvel <[email protected]>");
+MODULE_LICENSE("GPL v2");
+
+/* defined in aes-modes.S */
+asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, int first);
+asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, int first);
+
+asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[], int first);
+asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 iv[], int first);
+
+asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+ int rounds, int blocks, u8 ctr[], int first);
+
+asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
+ int rounds, int blocks, u8 const rk2[], u8 iv[],
+ int first);
+asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
+ int rounds, int blocks, u8 const rk2[], u8 iv[],
+ int first);
+
+struct crypto_aes_xts_ctx {
+ struct crypto_aes_ctx key1;
+ struct crypto_aes_ctx __aligned(8) key2;
+};
+
+static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ int ret;
+
+ ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2);
+ if (!ret)
+ ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2],
+ key_len / 2);
+ if (!ret)
+ return 0;
+
+ tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ int err, first, rounds = 6 + ctx->key_length / 4;
+ struct blkcipher_walk walk;
+ unsigned int blocks;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ kernel_neon_begin();
+ for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ (u8 *)ctx->key_enc, rounds, blocks, first);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ kernel_neon_end();
+ return err;
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ int err, first, rounds = 6 + ctx->key_length / 4;
+ struct blkcipher_walk walk;
+ unsigned int blocks;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ kernel_neon_begin();
+ for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ (u8 *)ctx->key_dec, rounds, blocks, first);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ kernel_neon_end();
+ return err;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ int err, first, rounds = 6 + ctx->key_length / 4;
+ struct blkcipher_walk walk;
+ unsigned int blocks;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ kernel_neon_begin();
+ for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
+ first);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ kernel_neon_end();
+ return err;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ int err, first, rounds = 6 + ctx->key_length / 4;
+ struct blkcipher_walk walk;
+ unsigned int blocks;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ kernel_neon_begin();
+ for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ (u8 *)ctx->key_dec, rounds, blocks, walk.iv,
+ first);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ kernel_neon_end();
+ return err;
+}
+
+static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ int err, first, rounds = 6 + ctx->key_length / 4;
+ struct blkcipher_walk walk;
+ int blocks;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+
+ first = 1;
+ kernel_neon_begin();
+ while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+ aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
+ first);
+ first = 0;
+ nbytes -= blocks * AES_BLOCK_SIZE;
+ if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE)
+ break;
+ err = blkcipher_walk_done(desc, &walk,
+ walk.nbytes % AES_BLOCK_SIZE);
+ }
+ if (nbytes) {
+ u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+ u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+ u8 __aligned(8) tail[AES_BLOCK_SIZE];
+
+ /*
+ * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
+ * to tell aes_ctr_encrypt() to only read half a block.
+ */
+ blocks = (nbytes <= 8) ? -1 : 1;
+
+ aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
+ blocks, walk.iv, first);
+ memcpy(tdst, tail, nbytes);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ int err, first, rounds = 6 + ctx->key1.key_length / 4;
+ struct blkcipher_walk walk;
+ unsigned int blocks;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ kernel_neon_begin();
+ for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ (u8 *)ctx->key1.key_enc, rounds, blocks,
+ (u8 *)ctx->key2.key_enc, walk.iv, first);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ int err, first, rounds = 6 + ctx->key1.key_length / 4;
+ struct blkcipher_walk walk;
+ unsigned int blocks;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ kernel_neon_begin();
+ for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+ aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ (u8 *)ctx->key1.key_dec, rounds, blocks,
+ (u8 *)ctx->key2.key_enc, walk.iv, first);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+ kernel_neon_end();
+
+ return err;
+}
+
+static struct crypto_alg aes_algs[] = { {
+ .cra_name = "__ecb-aes-" MODE,
+ .cra_driver_name = "__driver-ecb-aes-" MODE,
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = crypto_aes_set_key,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ },
+}, {
+ .cra_name = "__cbc-aes-" MODE,
+ .cra_driver_name = "__driver-cbc-aes-" MODE,
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = crypto_aes_set_key,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+}, {
+ .cra_name = "__ctr-aes-" MODE,
+ .cra_driver_name = "__driver-ctr-aes-" MODE,
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = crypto_aes_set_key,
+ .encrypt = ctr_encrypt,
+ .decrypt = ctr_encrypt,
+ },
+}, {
+ .cra_name = "__xts-aes-" MODE,
+ .cra_driver_name = "__driver-xts-aes-" MODE,
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_blkcipher = {
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = xts_set_key,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+}, {
+ .cra_name = "ecb(aes)",
+ .cra_driver_name = "ecb-aes-" MODE,
+ .cra_priority = PRIO,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+}, {
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-aes-" MODE,
+ .cra_priority = PRIO,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+}, {
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-" MODE,
+ .cra_priority = PRIO,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+}, {
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-" MODE,
+ .cra_priority = PRIO,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_helper_ctx),
+ .cra_alignmask = 7,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_init = ablk_init,
+ .cra_exit = ablk_exit,
+ .cra_ablkcipher = {
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ }
+} };
+
+static int __init aes_init(void)
+{
+ return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static void __exit aes_exit(void)
+{
+ crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+#ifdef USE_V8_CRYPTO_EXTENSIONS
+module_cpu_feature_match(AES, aes_init);
+#else
+module_init(aes_init);
+#endif
+module_exit(aes_exit);
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
new file mode 100644
index 000000000000..c990e1e379a2
--- /dev/null
+++ b/arch/arm64/crypto/aes-modes.S
@@ -0,0 +1,548 @@
+/*
+ * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* included by aes-ce.S and aes-neon.S */
+
+ .text
+ .align 4
+
+/*
+ * There are several ways to instantiate this code:
+ * - no interleave, all inline
+ * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
+ * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
+ * - 8-way interleave, 8x calls out of line (-DINTERLEAVE=8)
+ * - 8-way interleave, all inline (-DINTERLEAVE=8 -DINTERLEAVE_INLINE)
+ *
+ * Macros imported by this code:
+ * - enc_prepare - setup NEON registers for encryption
+ * - dec_prepare - setup NEON registers for decryption
+ * - enc_switch_key - change to new key after having prepared for encryption
+ * - encrypt_block - encrypt a single block
+ * - decrypt block - decrypt a single block
+ * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
+ * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
+ * - encrypt_block8x - encrypt 8 blocks in parallel (if INTERLEAVE == 8)
+ * - decrypt_block8x - decrypt 8 blocks in parallel (if INTERLEAVE == 8)
+ */
+
+#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
+#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
+#define FRAME_POP ldp x29, x30, [sp],#16
+
+#if INTERLEAVE == 4
+
+aes_encrypt_block4x:
+ encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+ ret
+ENDPROC(aes_encrypt_block4x)
+
+aes_decrypt_block4x:
+ decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+ ret
+ENDPROC(aes_decrypt_block4x)
+
+#elif INTERLEAVE == 8
+
+aes_encrypt_block8x:
+ encrypt_block8x v0, v1, v2, v3, v8, v9, v10, v11, w3, x2, x6, w7
+ ret
+ENDPROC(aes_encrypt_block8x)
+
+aes_decrypt_block8x:
+ decrypt_block8x v0, v1, v2, v3, v8, v9, v10, v11, w3, x2, x6, w7
+ ret
+ENDPROC(aes_decrypt_block8x)
+
+#endif
+ .macro do_encrypt_block4x
+ bl aes_encrypt_block4x
+ .endm
+
+ .macro do_decrypt_block4x
+ bl aes_decrypt_block4x
+ .endm
+
+ .macro do_encrypt_block8x
+ bl aes_encrypt_block8x
+ .endm
+
+ .macro do_decrypt_block8x
+ bl aes_decrypt_block8x
+ .endm
+#else
+#define FRAME_PUSH
+#define FRAME_POP
+
+ .macro do_encrypt_block4x
+ encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+ .endm
+
+ .macro do_decrypt_block4x
+ decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+ .endm
+
+ .macro do_encrypt_block8x
+ encrypt_block8x v0, v1, v2, v3, v8, v9, v10, v11, w3, x2, x6, w7
+ .endm
+
+ .macro do_decrypt_block8x
+ decrypt_block8x v0, v1, v2, v3, v8, v9, v10, v11, w3, x2, x6, w7
+ .endm
+#endif
+
+ /*
+ * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, int first)
+ * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, int first)
+ */
+
+AES_ENTRY(aes_ecb_encrypt)
+ FRAME_PUSH
+ cbz w5, .LecbencloopNx
+
+ enc_prepare w3, x2, x5
+
+.LecbencloopNx:
+#if INTERLEAVE >= 4
+ subs w4, w4, #INTERLEAVE
+ bmi .Lecbenc1x
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
+#if INTERLEAVE == 8
+ ld1 {v8.16b-v11.16b}, [x1], #64 /* get 4 pt blocks */
+ do_encrypt_block8x
+ st1 {v0.16b-v3.16b}, [x0], #64
+ st1 {v8.16b-v11.16b}, [x0], #64
+#else
+ do_encrypt_block4x
+ st1 {v0.16b-v3.16b}, [x0], #64
+#endif
+ b .LecbencloopNx
+.Lecbenc1x:
+ adds w4, w4, #INTERLEAVE
+ beq .Lecbencout
+#endif
+.Lecbencloop:
+ ld1 {v0.16b}, [x1], #16 /* get next pt block */
+ encrypt_block v0, w3, x2, x5, w6
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
+ bne .Lecbencloop
+.Lecbencout:
+ FRAME_POP
+ ret
+AES_ENDPROC(aes_ecb_encrypt)
+
+
+AES_ENTRY(aes_ecb_decrypt)
+ FRAME_PUSH
+ cbz w5, .LecbdecloopNx
+
+ dec_prepare w3, x2, x5
+
+.LecbdecloopNx:
+#if INTERLEAVE >= 4
+ subs w4, w4, #INTERLEAVE
+ bmi .Lecbdec1x
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
+#if INTERLEAVE == 8
+ ld1 {v8.16b-v11.16b}, [x1], #64 /* get 4 ct blocks */
+ do_decrypt_block8x
+ st1 {v0.16b-v3.16b}, [x0], #64
+ st1 {v8.16b-v11.16b}, [x0], #64
+#else
+ do_decrypt_block4x
+ st1 {v0.16b-v3.16b}, [x0], #64
+#endif
+ b .LecbdecloopNx
+.Lecbdec1x:
+ adds w4, w4, #INTERLEAVE
+ beq .Lecbdecout
+#endif
+.Lecbdecloop:
+ ld1 {v0.16b}, [x1], #16 /* get next ct block */
+ decrypt_block v0, w3, x2, x5, w6
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
+ bne .Lecbdecloop
+.Lecbdecout:
+ FRAME_POP
+ ret
+AES_ENDPROC(aes_ecb_decrypt)
+
+
+ /*
+ * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, u8 iv[], int first)
+ * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, u8 iv[], int first)
+ */
+
+AES_ENTRY(aes_cbc_encrypt)
+ cbz w6, .Lcbcencloop
+
+ ld1 {v0.16b}, [x5] /* get iv */
+ enc_prepare w3, x2, x5
+
+.Lcbcencloop:
+ ld1 {v1.16b}, [x1], #16 /* get next pt block */
+ eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
+ encrypt_block v0, w3, x2, x5, w6
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
+ bne .Lcbcencloop
+ ret
+AES_ENDPROC(aes_cbc_encrypt)
+
+
+AES_ENTRY(aes_cbc_decrypt)
+ FRAME_PUSH
+ cbz w6, .LcbcdecloopNx
+
+ ld1 {v7.16b}, [x5] /* get iv */
+ dec_prepare w3, x2, x5
+
+.LcbcdecloopNx:
+#if INTERLEAVE >= 4
+ subs w4, w4, #INTERLEAVE
+ bmi .Lcbcdec1x
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
+ mov v4.16b, v0.16b
+ mov v5.16b, v1.16b
+ mov v6.16b, v2.16b
+#if INTERLEAVE == 8
+ ld1 {v8.16b-v11.16b}, [x1], #64 /* get 4 ct blocks */
+ mov v12.16b, v3.16b
+ mov v13.16b, v8.16b
+ mov v14.16b, v9.16b
+ mov v15.16b, v10.16b
+ do_decrypt_block8x
+#else
+ do_decrypt_block4x
+#endif
+ sub x1, x1, #16
+ eor v0.16b, v0.16b, v7.16b
+ eor v1.16b, v1.16b, v4.16b
+ ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
+ eor v2.16b, v2.16b, v5.16b
+ eor v3.16b, v3.16b, v6.16b
+ st1 {v0.16b-v3.16b}, [x0], #64
+#if INTERLEAVE == 8
+ eor v8.16b, v8.16b, v12.16b
+ eor v9.16b, v9.16b, v13.16b
+ eor v10.16b, v10.16b, v14.16b
+ eor v11.16b, v11.16b, v15.16b
+ st1 {v8.16b-v11.16b}, [x0], #64
+#endif
+ b .LcbcdecloopNx
+.Lcbcdec1x:
+ adds w4, w4, #INTERLEAVE
+ beq .Lcbcdecout
+#endif
+.Lcbcdecloop:
+ ld1 {v1.16b}, [x1], #16 /* get next ct block */
+ mov v0.16b, v1.16b /* ...and copy to v0 */
+ decrypt_block v0, w3, x2, x5, w6
+ eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
+ mov v7.16b, v1.16b /* ct is next iv */
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
+ bne .Lcbcdecloop
+.Lcbcdecout:
+ FRAME_POP
+ ret
+AES_ENDPROC(aes_cbc_decrypt)
+
+
+ /*
+ * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * int blocks, u8 ctr[], int first)
+ */
+
+AES_ENTRY(aes_ctr_encrypt)
+ FRAME_PUSH
+ cbnz w6, .Lctrfirst /* 1st time around? */
+ umov x5, v4.d[1] /* keep swabbed ctr in reg */
+ rev x5, x5
+#if INTERLEAVE >= 4
+ cmn w5, w4 /* 32 bit overflow? */
+ bcs .Lctrinc
+ add x5, x5, #1 /* increment BE ctr */
+ b .LctrincNx
+#else
+ b .Lctrinc
+#endif
+.Lctrfirst:
+ enc_prepare w3, x2, x6
+ ld1 {v4.16b}, [x5]
+ umov x5, v4.d[1] /* keep swabbed ctr in reg */
+ rev x5, x5
+#if INTERLEAVE >= 4
+ cmn w5, w4 /* 32 bit overflow? */
+ bcs .Lctrloop
+.LctrloopNx:
+ subs w4, w4, #INTERLEAVE
+ bmi .Lctr1x
+ ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
+ dup v7.4s, w5
+ mov v0.16b, v4.16b
+ add v7.4s, v7.4s, v8.4s
+ mov v1.16b, v4.16b
+ rev32 v8.16b, v7.16b
+ mov v2.16b, v4.16b
+ mov v3.16b, v4.16b
+ mov v1.4s[3], v8.s[0]
+ mov v2.4s[3], v8.s[1]
+ mov v3.4s[3], v8.s[2]
+#if INTERLEAVE == 8
+ movi v8.4s, #4 /* addends 4,4,4,4 */
+ add x6, x1, #64
+ add v7.4s, v7.4s, v8.4s
+ mov v8.16b, v4.16b
+ mov v9.16b, v4.16b
+ ld1 {v12.16b-v15.16b}, [x6] /* get 4 input blocks */
+ rev32 v7.16b, v7.16b
+ mov v10.16b, v4.16b
+ mov v11.16b, v4.16b
+ mov v8.4s[3], v7.s[3]
+ mov v9.4s[3], v7.s[0]
+ mov v10.4s[3], v7.s[1]
+ mov v11.4s[3], v7.s[2]
+ ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
+ do_encrypt_block8x
+#else
+ ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
+ do_encrypt_block4x
+#endif
+ eor v0.16b, v5.16b, v0.16b
+ ld1 {v5.16b}, [x1], #16 /* get 1 input block */
+ eor v1.16b, v6.16b, v1.16b
+ eor v2.16b, v7.16b, v2.16b
+ eor v3.16b, v5.16b, v3.16b
+ st1 {v0.16b-v3.16b}, [x0], #64
+#if INTERLEAVE == 8
+ add x1, x1, #64
+ eor v12.16b, v8.16b, v12.16b
+ eor v13.16b, v9.16b, v13.16b
+ eor v14.16b, v10.16b, v14.16b
+ eor v15.16b, v11.16b, v15.16b
+ st1 {v12.16b-v15.16b}, [x0], #64
+#endif
+ add x5, x5, #INTERLEAVE
+.LctrincNx:
+ rev x7, x5
+ ins v4.d[1], x7
+ b .LctrloopNx
+.Lctr1x:
+ adds w4, w4, #INTERLEAVE
+ beq .Lctrout
+#endif
+.Lctrloop:
+ mov v0.16b, v4.16b
+ encrypt_block v0, w3, x2, x6, w7
+ subs w4, w4, #1
+ bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
+ ld1 {v3.16b}, [x1], #16
+ eor v3.16b, v0.16b, v3.16b
+ st1 {v3.16b}, [x0], #16
+ beq .Lctrout
+.Lctrinc:
+ adds x5, x5, #1 /* increment BE ctr */
+ rev x7, x5
+ ins v4.d[1], x7
+ bcc .Lctrloop /* no overflow? */
+ umov x7, v4.d[0] /* load upper word of ctr */
+ rev x7, x7 /* ... to handle the carry */
+ add x7, x7, #1
+ rev x7, x7
+ ins v4.d[0], x7
+ b .Lctrloop
+.Lctrhalfblock:
+ ld1 {v3.8b}, [x1]
+ eor v3.8b, v0.8b, v3.8b
+ st1 {v3.8b}, [x0]
+.Lctrout:
+ FRAME_POP
+ ret
+AES_ENDPROC(aes_ctr_encrypt)
+ .ltorg
+
+
+ /*
+ * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+ * int blocks, u8 const rk2[], u8 iv[], int first)
+ * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+ * int blocks, u8 const rk2[], u8 iv[], int first)
+ */
+
+ .macro next_tweak, out, in, const, tmp
+ sshr \tmp\().2d, \in\().2d, #63
+ and \tmp\().16b, \tmp\().16b, \const\().16b
+ add \out\().2d, \in\().2d, \in\().2d
+ ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+ eor \out\().16b, \out\().16b, \tmp\().16b
+ .endm
+
+.Lxts_mul_x:
+ .word 1, 0, 0x87, 0
+
+AES_ENTRY(aes_xts_encrypt)
+ FRAME_PUSH
+ cbz w7, .LxtsencloopNx
+
+ ld1 {v4.16b}, [x6]
+ enc_prepare w3, x5, x6
+ encrypt_block v4, w3, x5, x6, w7 /* first tweak */
+ enc_switch_key w3, x2, x6
+ ldr q7, .Lxts_mul_x
+ b .LxtsencNx
+
+.LxtsencloopNx:
+ ldr q7, .Lxts_mul_x
+ next_tweak v4, v4, v7, v8
+.LxtsencNx:
+#if INTERLEAVE >= 4
+ subs w4, w4, #INTERLEAVE
+ bmi .Lxtsenc1x
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
+ next_tweak v5, v4, v7, v8
+ eor v0.16b, v0.16b, v4.16b
+ next_tweak v6, v5, v7, v8
+ eor v1.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v6.16b
+#if INTERLEAVE == 8
+ next_tweak v12, v6, v7, v16
+ eor v3.16b, v3.16b, v12.16b
+ ld1 {v8.16b-v11.16b}, [x1], #64 /* get 4 ct blocks */
+ next_tweak v13, v12, v7, v16
+ next_tweak v14, v13, v7, v16
+ eor v8.16b, v8.16b, v13.16b
+ next_tweak v15, v14, v7, v16
+ eor v9.16b, v9.16b, v14.16b
+ next_tweak v7, v15, v7, v16
+ eor v10.16b, v10.16b, v15.16b
+ eor v11.16b, v11.16b, v7.16b
+ do_encrypt_block8x
+ eor v3.16b, v3.16b, v12.16b
+#else
+ next_tweak v7, v6, v7, v8
+ eor v3.16b, v3.16b, v7.16b
+ do_encrypt_block4x
+ eor v3.16b, v3.16b, v7.16b
+#endif
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v6.16b
+ st1 {v0.16b-v3.16b}, [x0], #64
+#if INTERLEAVE == 8
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+ eor v10.16b, v10.16b, v15.16b
+ eor v11.16b, v11.16b, v7.16b
+ st1 {v8.16b-v11.16b}, [x0], #64
+#endif
+ mov v4.16b, v7.16b
+ b .LxtsencloopNx
+.Lxtsenc1x:
+ adds w4, w4, #INTERLEAVE
+ beq .Lxtsencout
+#endif
+.Lxtsencloop:
+ ld1 {v1.16b}, [x1], #16
+ eor v0.16b, v1.16b, v4.16b
+ encrypt_block v0, w3, x2, x6, w7
+ eor v0.16b, v0.16b, v4.16b
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
+ beq .Lxtsencout
+ next_tweak v4, v4, v7, v8
+ b .Lxtsencloop
+.Lxtsencout:
+ FRAME_POP
+ ret
+AES_ENDPROC(aes_xts_encrypt)
+
+
+AES_ENTRY(aes_xts_decrypt)
+ FRAME_PUSH
+ cbz w7, .LxtsdecloopNx
+
+ ld1 {v4.16b}, [x6]
+ enc_prepare w3, x5, x6
+ encrypt_block v4, w3, x5, x6, w7 /* first tweak */
+ dec_prepare w3, x2, x6
+ ldr q7, .Lxts_mul_x
+ b .LxtsdecNx
+
+.LxtsdecloopNx:
+ ldr q7, .Lxts_mul_x
+ next_tweak v4, v4, v7, v8
+.LxtsdecNx:
+#if INTERLEAVE >= 4
+ subs w4, w4, #INTERLEAVE
+ bmi .Lxtsdec1x
+ ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
+ next_tweak v5, v4, v7, v8
+ eor v0.16b, v0.16b, v4.16b
+ next_tweak v6, v5, v7, v8
+ eor v1.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v6.16b
+#if INTERLEAVE == 8
+ next_tweak v12, v6, v7, v16
+ eor v3.16b, v3.16b, v12.16b
+ ld1 {v8.16b-v11.16b}, [x1], #64 /* get 4 ct blocks */
+ next_tweak v13, v12, v7, v16
+ next_tweak v14, v13, v7, v16
+ eor v8.16b, v8.16b, v13.16b
+ next_tweak v15, v14, v7, v16
+ eor v9.16b, v9.16b, v14.16b
+ next_tweak v7, v15, v7, v16
+ eor v10.16b, v10.16b, v15.16b
+ eor v11.16b, v11.16b, v7.16b
+ do_decrypt_block8x
+ eor v3.16b, v3.16b, v12.16b
+#else
+ next_tweak v7, v6, v7, v8
+ eor v3.16b, v3.16b, v7.16b
+ do_decrypt_block4x
+ eor v3.16b, v3.16b, v7.16b
+#endif
+ eor v0.16b, v0.16b, v4.16b
+ eor v1.16b, v1.16b, v5.16b
+ eor v2.16b, v2.16b, v6.16b
+ st1 {v0.16b-v3.16b}, [x0], #64
+#if INTERLEAVE == 8
+ eor v8.16b, v8.16b, v13.16b
+ eor v9.16b, v9.16b, v14.16b
+ eor v10.16b, v10.16b, v15.16b
+ eor v11.16b, v11.16b, v7.16b
+ st1 {v8.16b-v11.16b}, [x0], #64
+#endif
+ mov v4.16b, v7.16b
+ b .LxtsdecloopNx
+.Lxtsdec1x:
+ adds w4, w4, #INTERLEAVE
+ beq .Lxtsdecout
+#endif
+.Lxtsdecloop:
+ ld1 {v1.16b}, [x1], #16
+ eor v0.16b, v1.16b, v4.16b
+ decrypt_block v0, w3, x2, x6, w7
+ eor v0.16b, v0.16b, v4.16b
+ st1 {v0.16b}, [x0], #16
+ subs w4, w4, #1
+ beq .Lxtsdecout
+ next_tweak v4, v4, v7, v8
+ b .Lxtsdecloop
+.Lxtsdecout:
+ FRAME_POP
+ ret
+AES_ENDPROC(aes_xts_decrypt)
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
new file mode 100644
index 000000000000..a2aa6dab1f9e
--- /dev/null
+++ b/arch/arm64/crypto/aes-neon.S
@@ -0,0 +1,382 @@
+/*
+ * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
+ *
+ * Copyright (C) 2013 Linaro Ltd <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#define AES_ENTRY(func) ENTRY(neon_ ## func)
+#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
+
+ /* multiply by polynomial 'x' in GF(2^8) */
+ .macro mul_by_x, out, in, temp, const
+ sshr \temp, \in, #7
+ add \out, \in, \in
+ and \temp, \temp, \const
+ eor \out, \out, \temp
+ .endm
+
+ /* preload the entire Sbox */
+ .macro prepare, sbox, shiftrows, temp
+ adr \temp, \sbox
+ movi v12.16b, #0x40
+ ldr q13, \shiftrows
+ movi v14.16b, #0x1b
+ ld1 {v16.16b-v19.16b}, [\temp], #64
+ ld1 {v20.16b-v23.16b}, [\temp], #64
+ ld1 {v24.16b-v27.16b}, [\temp], #64
+ ld1 {v28.16b-v31.16b}, [\temp]
+ .endm
+
+ /* do preload for encryption */
+ .macro enc_prepare, ignore0, ignore1, temp
+ prepare .LForward_Sbox, .LForward_ShiftRows, \temp
+ .endm
+
+ .macro enc_switch_key, ignore0, ignore1, temp
+ /* do nothing */
+ .endm
+
+ /* do preload for decryption */
+ .macro dec_prepare, ignore0, ignore1, temp
+ prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp
+ .endm
+
+ /* apply SubBytes transformation using the the preloaded Sbox */
+ .macro sub_bytes, in
+ sub v9.16b, \in\().16b, v12.16b
+ tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b
+ sub v10.16b, v9.16b, v12.16b
+ tbx \in\().16b, {v20.16b-v23.16b}, v9.16b
+ sub v11.16b, v10.16b, v12.16b
+ tbx \in\().16b, {v24.16b-v27.16b}, v10.16b
+ tbx \in\().16b, {v28.16b-v31.16b}, v11.16b
+ .endm
+
+ /* apply MixColumns transformation */
+ .macro mix_columns, in
+ mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b
+ rev32 v8.8h, \in\().8h
+ eor \in\().16b, v10.16b, \in\().16b
+ shl v9.4s, v8.4s, #24
+ sri v9.4s, v8.4s, #8
+ shl v11.4s, \in\().4s, #24
+ sri v11.4s, \in\().4s, #8
+ eor v9.16b, v9.16b, v8.16b
+ eor v10.16b, v10.16b, v9.16b
+ eor \in\().16b, v10.16b, v11.16b
+ .endm
+
+ /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+ .macro inv_mix_columns, in
+ mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b
+ mul_by_x v11.16b, v11.16b, v10.16b, v14.16b
+ eor \in\().16b, \in\().16b, v11.16b
+ rev32 v11.8h, v11.8h
+ eor \in\().16b, \in\().16b, v11.16b
+ mix_columns \in
+ .endm
+
+ .macro do_block, enc, in, rounds, rk, rkp, i
+ ld1 {v15.16b}, [\rk]
+ add \rkp, \rk, #16
+ mov \i, \rounds
+1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
+ tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */
+ sub_bytes \in
+ ld1 {v15.16b}, [\rkp], #16
+ subs \i, \i, #1
+ beq 2222f
+ .if \enc == 1
+ mix_columns \in
+ .else
+ inv_mix_columns \in
+ .endif
+ b 1111b
+2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */
+ .endm
+
+ .macro encrypt_block, in, rounds, rk, rkp, i
+ do_block 1, \in, \rounds, \rk, \rkp, \i
+ .endm
+
+ .macro decrypt_block, in, rounds, rk, rkp, i
+ do_block 0, \in, \rounds, \rk, \rkp, \i
+ .endm
+
+ /*
+ * Interleaved versions: functionally equivalent to the
+ * ones above, but applied to 2 or 4 AES states in parallel.
+ */
+
+ .macro sub_bytes_2x, in0, in1
+ sub v8.16b, \in0\().16b, v12.16b
+ sub v9.16b, \in1\().16b, v12.16b
+ tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+ tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
+ sub v10.16b, v8.16b, v12.16b
+ sub v11.16b, v9.16b, v12.16b
+ tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
+ tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
+ sub v8.16b, v10.16b, v12.16b
+ sub v9.16b, v11.16b, v12.16b
+ tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
+ tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
+ tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
+ tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
+ .endm
+
+ .macro sub_bytes_4x, in0, in1, in2, in3
+ sub v8.16b, \in0\().16b, v12.16b
+ tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+ sub v9.16b, \in1\().16b, v12.16b
+ tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
+ sub v10.16b, \in2\().16b, v12.16b
+ tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
+ sub v11.16b, \in3\().16b, v12.16b
+ tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
+ tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
+ tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
+ sub v8.16b, v8.16b, v12.16b
+ tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b
+ sub v9.16b, v9.16b, v12.16b
+ tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b
+ sub v10.16b, v10.16b, v12.16b
+ tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b
+ sub v11.16b, v11.16b, v12.16b
+ tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b
+ sub v8.16b, v8.16b, v12.16b
+ tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b
+ sub v9.16b, v9.16b, v12.16b
+ tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b
+ sub v10.16b, v10.16b, v12.16b
+ tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
+ sub v11.16b, v11.16b, v12.16b
+ tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
+ tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b
+ tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b
+ .endm
+
+ .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
+ sshr \tmp0\().16b, \in0\().16b, #7
+ add \out0\().16b, \in0\().16b, \in0\().16b
+ sshr \tmp1\().16b, \in1\().16b, #7
+ and \tmp0\().16b, \tmp0\().16b, \const\().16b
+ add \out1\().16b, \in1\().16b, \in1\().16b
+ and \tmp1\().16b, \tmp1\().16b, \const\().16b
+ eor \out0\().16b, \out0\().16b, \tmp0\().16b
+ eor \out1\().16b, \out1\().16b, \tmp1\().16b
+ .endm
+
+ .macro mix_columns_2x, in0, in1
+ mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
+ rev32 v10.8h, \in0\().8h
+ rev32 v11.8h, \in1\().8h
+ eor \in0\().16b, v8.16b, \in0\().16b
+ eor \in1\().16b, v9.16b, \in1\().16b
+ shl v12.4s, v10.4s, #24
+ shl v13.4s, v11.4s, #24
+ eor v8.16b, v8.16b, v10.16b
+ sri v12.4s, v10.4s, #8
+ shl v10.4s, \in0\().4s, #24
+ eor v9.16b, v9.16b, v11.16b
+ sri v13.4s, v11.4s, #8
+ shl v11.4s, \in1\().4s, #24
+ sri v10.4s, \in0\().4s, #8
+ eor \in0\().16b, v8.16b, v12.16b
+ sri v11.4s, \in1\().4s, #8
+ eor \in1\().16b, v9.16b, v13.16b
+ eor \in0\().16b, v10.16b, \in0\().16b
+ eor \in1\().16b, v11.16b, \in1\().16b
+ .endm
+
+ .macro inv_mix_cols_2x, in0, in1
+ mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
+ mul_by_x_2x v8, v9, v8, v9, v10, v11, v14
+ eor \in0\().16b, \in0\().16b, v8.16b
+ eor \in1\().16b, \in1\().16b, v9.16b
+ rev32 v8.8h, v8.8h
+ rev32 v9.8h, v9.8h
+ eor \in0\().16b, \in0\().16b, v8.16b
+ eor \in1\().16b, \in1\().16b, v9.16b
+ mix_columns_2x \in0, \in1
+ .endm
+
+ .macro inv_mix_cols_4x, in0, in1, in2, in3
+ mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14
+ mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14
+ mul_by_x_2x v8, v9, v8, v9, v12, v13, v14
+ mul_by_x_2x v10, v11, v10, v11, v12, v13, v14
+ eor \in0\().16b, \in0\().16b, v8.16b
+ eor \in1\().16b, \in1\().16b, v9.16b
+ eor \in2\().16b, \in2\().16b, v10.16b
+ eor \in3\().16b, \in3\().16b, v11.16b
+ rev32 v8.8h, v8.8h
+ rev32 v9.8h, v9.8h
+ rev32 v10.8h, v10.8h
+ rev32 v11.8h, v11.8h
+ eor \in0\().16b, \in0\().16b, v8.16b
+ eor \in1\().16b, \in1\().16b, v9.16b
+ eor \in2\().16b, \in2\().16b, v10.16b
+ eor \in3\().16b, \in3\().16b, v11.16b
+ mix_columns_2x \in0, \in1
+ mix_columns_2x \in2, \in3
+ .endm
+
+ .macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i
+ ld1 {v15.16b}, [\rk]
+ add \rkp, \rk, #16
+ mov \i, \rounds
+1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
+ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
+ sub_bytes_2x \in0, \in1
+ tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
+ tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
+ ld1 {v15.16b}, [\rkp], #16
+ subs \i, \i, #1
+ beq 2222f
+ .if \enc == 1
+ mix_columns_2x \in0, \in1
+ ldr q13, .LForward_ShiftRows
+ .else
+ inv_mix_cols_2x \in0, \in1
+ ldr q13, .LReverse_ShiftRows
+ .endif
+ movi v12.16b, #0x40
+ b 1111b
+2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
+ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
+ .endm
+
+ .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
+ ld1 {v15.16b}, [\rk]
+ add \rkp, \rk, #16
+ mov \i, \rounds
+1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
+ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
+ eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
+ eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
+ sub_bytes_4x \in0, \in1, \in2, \in3
+ tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
+ tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
+ tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */
+ tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */
+ ld1 {v15.16b}, [\rkp], #16
+ subs \i, \i, #1
+ beq 2222f
+ .if \enc == 1
+ mix_columns_2x \in0, \in1
+ mix_columns_2x \in2, \in3
+ ldr q13, .LForward_ShiftRows
+ .else
+ inv_mix_cols_4x \in0, \in1, \in2, \in3
+ ldr q13, .LReverse_ShiftRows
+ .endif
+ movi v12.16b, #0x40
+ b 1111b
+2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
+ eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
+ eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */
+ eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
+ .endm
+
+ .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i
+ do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i
+ .endm
+
+ .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i
+ do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i
+ .endm
+
+ .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
+ do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
+ .endm
+
+ .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
+ do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
+ .endm
+
+#include "aes-modes.S"
+
+ .text
+ .align 4
+.LForward_ShiftRows:
+ .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3
+ .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb
+
+.LReverse_ShiftRows:
+ .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb
+ .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3
+
+.LForward_Sbox:
+ .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+ .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+ .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+ .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+ .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+ .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+ .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+ .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+ .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+ .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+ .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+ .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+ .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+ .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+ .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+ .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+ .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+ .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+ .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+ .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+ .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+ .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+ .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+ .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+ .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+ .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+ .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+ .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+ .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+ .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+ .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+ .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.LReverse_Sbox:
+ .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+ .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+ .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+ .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+ .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+ .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+ .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+ .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+ .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+ .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+ .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+ .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+ .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+ .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+ .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+ .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+ .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
--
1.8.3.2

2014-05-01 15:51:39

by Ard Biesheuvel

[permalink] [raw]

Subject: [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1

The Crypto Extensions based SHA1 implementation uses the NEON register file,
and hence runs with preemption disabled. This patch adds a TIF_NEED_RESCHED
check to its inner loop so we at least give up the CPU voluntarily when we
are running in process context and have been tagged for preemption by the
scheduler.

Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/sha1-ce-core.S | 19 ++++++++--------
arch/arm64/crypto/sha1-ce-glue.c | 49 +++++++++++++++++++++++++++++++---------
2 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
index bd4af29f2722..831e332f9331 100644
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -66,8 +66,8 @@
.word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6

/*
- * void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
- * u8 *head, long bytes)
+ * int sha1_ce_transform(int blocks, u8 const *src, u32 *state,
+ * u8 *head, long bytes, struct thread_info *ti)
*/
ENTRY(sha1_ce_transform)
/* load round constants */
@@ -127,7 +127,13 @@ CPU_LE( rev32 v11.16b, v11.16b )
add dgbv.2s, dgbv.2s, dg1v.2s
add dgav.4s, dgav.4s, dg0v.4s

- cbnz w0, 0b
+ cbz w0, 4f
+ b_if_no_resched x5, x8, 0b
+
+ /* store new state */
+3: str dga, [x2]
+ str dgb, [x2, #16]
+ ret

/*
* Final block: add padding and total bit count.
@@ -135,7 +141,7 @@ CPU_LE( rev32 v11.16b, v11.16b )
* size was not a round multiple of the block size, and the padding is
* handled by the C code.
*/
- cbz x4, 3f
+4: cbz x4, 3b
movi v9.2d, #0
mov x8, #0x80000000
movi v10.2d, #0
@@ -145,9 +151,4 @@ CPU_LE( rev32 v11.16b, v11.16b )
mov v11.d[0], xzr
mov v11.d[1], x7
b 2b
-
- /* store new state */
-3: str dga, [x2]
- str dgb, [x2, #16]
- ret
ENDPROC(sha1_ce_transform)
diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
index 6fe83f37a750..69850a163668 100644
--- a/arch/arm64/crypto/sha1-ce-glue.c
+++ b/arch/arm64/crypto/sha1-ce-glue.c
@@ -20,8 +20,8 @@ MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <[email protected]>");
MODULE_LICENSE("GPL v2");

-asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
- u8 *head, long bytes);
+asmlinkage int sha1_ce_transform(int blocks, u8 const *src, u32 *state,
+ u8 *head, long bytes, struct thread_info *ti);

static int sha1_init(struct shash_desc *desc)
{
@@ -42,6 +42,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
sctx->count += len;

if ((partial + len) >= SHA1_BLOCK_SIZE) {
+ struct thread_info *ti = NULL;
int blocks;

if (partial) {
@@ -52,16 +53,30 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
len -= p;
}

+ /*
+ * Pass current's thread info pointer to sha1_ce_transform()
+ * below if we want it to play nice under preemption.
+ */
+ if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+ IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+ ti = current_thread_info();
+
blocks = len / SHA1_BLOCK_SIZE;
len %= SHA1_BLOCK_SIZE;

- kernel_neon_begin_partial(16);
- sha1_ce_transform(blocks, data, sctx->state,
- partial ? sctx->buffer : NULL, 0);
- kernel_neon_end();
+ do {
+ int rem;
+
+ kernel_neon_begin_partial(16);
+ rem = sha1_ce_transform(blocks, data, sctx->state,
+ partial ? sctx->buffer : NULL,
+ 0, ti);
+ kernel_neon_end();

- data += blocks * SHA1_BLOCK_SIZE;
- partial = 0;
+ data += (blocks - rem) * SHA1_BLOCK_SIZE;
+ blocks = rem;
+ partial = 0;
+ } while (unlikely(ti && blocks > 0));
}
if (len)
memcpy(sctx->buffer + partial, data, len);
@@ -94,6 +109,7 @@ static int sha1_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
+ struct thread_info *ti = NULL;
__be32 *dst = (__be32 *)out;
int blocks;
int i;
@@ -111,9 +127,20 @@ static int sha1_finup(struct shash_desc *desc, const u8 *data,
*/
blocks = len / SHA1_BLOCK_SIZE;

- kernel_neon_begin_partial(16);
- sha1_ce_transform(blocks, data, sctx->state, NULL, len);
- kernel_neon_end();
+ if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+ IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+ ti = current_thread_info();
+
+ do {
+ int rem;
+
+ kernel_neon_begin_partial(16);
+ rem = sha1_ce_transform(blocks, data, sctx->state,
+ NULL, len, ti);
+ kernel_neon_end();
+ data += (blocks - rem) * SHA1_BLOCK_SIZE;
+ blocks = rem;
+ } while (unlikely(ti && blocks > 0));

for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
put_unaligned_be32(sctx->state[i], dst++);
--
1.8.3.2

2014-05-01 15:51:40

by Ard Biesheuvel

[permalink] [raw]

Subject: [PATCH resend 14/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA2

The Crypto Extensions based SHA2 implementation uses the NEON register file,
and hence runs with preemption disabled. This patch adds a TIF_NEED_RESCHED
check to its inner loop so we at least give up the CPU voluntarily when we
are running in process context and have been tagged for preemption by the
scheduler.

Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/sha2-ce-core.S | 19 ++++++++-------
arch/arm64/crypto/sha2-ce-glue.c | 51 ++++++++++++++++++++++++++++++----------
2 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 53e750614169..46b669d91c29 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -73,8 +73,8 @@
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

/*
- * void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
- * u8 *head, long bytes)
+ * int sha2_ce_transform(int blocks, u8 const *src, u32 *state,
+ * u8 *head, long bytes, struct thread_info *ti)
*/
ENTRY(sha2_ce_transform)
/* load round constants */
@@ -131,7 +131,14 @@ CPU_LE( rev32 v19.16b, v19.16b )
add dgbv.4s, dgbv.4s, dg1v.4s

/* handled all input blocks? */
- cbnz w0, 0b
+ cbz w0, 4f
+
+ /* should we exit early? */
+ b_if_no_resched x5, x8, 0b
+
+ /* store new state */
+3: stp dga, dgb, [x2]
+ ret

/*
* Final block: add padding and total bit count.
@@ -139,7 +146,7 @@ CPU_LE( rev32 v19.16b, v19.16b )
* size was not a round multiple of the block size, and the padding is
* handled by the C code.
*/
- cbz x4, 3f
+4: cbz x4, 3b
movi v17.2d, #0
mov x8, #0x80000000
movi v18.2d, #0
@@ -149,8 +156,4 @@ CPU_LE( rev32 v19.16b, v19.16b )
mov v19.d[0], xzr
mov v19.d[1], x7
b 2b
-
- /* store new state */
-3: stp dga, dgb, [x2]
- ret
ENDPROC(sha2_ce_transform)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index 81617262b3df..6566ad3fdf82 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -1,4 +1,4 @@
-/*
+h/*
* sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
*
* Copyright (C) 2014 Linaro Ltd <[email protected]>
@@ -20,8 +20,8 @@ MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <[email protected]>");
MODULE_LICENSE("GPL v2");

-asmlinkage void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
- u8 *head, long bytes);
+asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state,
+ u8 *head, long bytes, struct thread_info *ti);

static int sha224_init(struct shash_desc *desc)
{
@@ -58,6 +58,7 @@ static int sha2_update(struct shash_desc *desc, const u8 *data,
sctx->count += len;

if ((partial + len) >= SHA256_BLOCK_SIZE) {
+ struct thread_info *ti = NULL;
int blocks;

if (partial) {
@@ -68,16 +69,30 @@ static int sha2_update(struct shash_desc *desc, const u8 *data,
len -= p;
}

+ /*
+ * Pass current's thread info pointer to sha2_ce_transform()
+ * below if we want it to play nice under preemption.
+ */
+ if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+ IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+ ti = current_thread_info();
+
blocks = len / SHA256_BLOCK_SIZE;
len %= SHA256_BLOCK_SIZE;

- kernel_neon_begin_partial(28);
- sha2_ce_transform(blocks, data, sctx->state,
- partial ? sctx->buf : NULL, 0);
- kernel_neon_end();
+ do {
+ int rem;
+
+ kernel_neon_begin_partial(28);
+ rem = sha2_ce_transform(blocks, data, sctx->state,
+ partial ? sctx->buf : NULL,
+ 0, ti);
+ kernel_neon_end();

- data += blocks * SHA256_BLOCK_SIZE;
- partial = 0;
+ data += (blocks - rem) * SHA256_BLOCK_SIZE;
+ blocks = rem;
+ partial = 0;
+ } while (unlikely(ti && blocks > 0));
}
if (len)
memcpy(sctx->buf + partial, data, len);
@@ -131,6 +146,7 @@ static void sha2_finup(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
+ struct thread_info *ti = NULL;
int blocks;

if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) {
@@ -147,9 +163,20 @@ static void sha2_finup(struct shash_desc *desc, const u8 *data,
*/
blocks = len / SHA256_BLOCK_SIZE;

- kernel_neon_begin_partial(28);
- sha2_ce_transform(blocks, data, sctx->state, NULL, len);
- kernel_neon_end();
+ if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+ IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+ ti = current_thread_info();
+
+ do {
+ int rem;
+
+ kernel_neon_begin_partial(28);
+ rem = sha2_ce_transform(blocks, data, sctx->state,
+ NULL, len, ti);
+ kernel_neon_end();
+ data += (blocks - rem) * SHA256_BLOCK_SIZE;
+ blocks = rem;
+ } while (unlikely(ti && blocks > 0));
}

static int sha224_finup(struct shash_desc *desc, const u8 *data,
--
1.8.3.2

2014-05-01 15:51:41

by Ard Biesheuvel

[permalink] [raw]

Subject: [PATCH resend 15/15] arm64/crypto: add voluntary preemption to Crypto Extensions GHASH

The Crypto Extensions based GHASH implementation uses the NEON register file,
and hence runs with preemption disabled. This patch adds a TIF_NEED_RESCHED
check to its inner loop so we at least give up the CPU voluntarily when we
are running in process context and have been tagged for preemption by the
scheduler.

Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/ghash-ce-core.S | 10 ++++++----
arch/arm64/crypto/ghash-ce-glue.c | 33 +++++++++++++++++++++++++--------
2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index b9e6eaf41c9b..523432f24ed2 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -31,8 +31,9 @@
.arch armv8-a+crypto

/*
- * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- * struct ghash_key const *k, const char *head)
+ * int pmull_ghash_update(int blocks, u64 dg[], const char *src,
+ * struct ghash_key const *k, const char *head,
+ * struct thread_info *ti)
*/
ENTRY(pmull_ghash_update)
ld1 {DATA.16b}, [x1]
@@ -88,8 +89,9 @@ CPU_LE( rev64 IN1.16b, IN1.16b )
eor T1.16b, T1.16b, T2.16b
eor DATA.16b, DATA.16b, T1.16b

- cbnz w0, 0b
+ cbz w0, 2f
+ b_if_no_resched x5, x7, 0b

- st1 {DATA.16b}, [x1]
+2: st1 {DATA.16b}, [x1]
ret
ENDPROC(pmull_ghash_update)
diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index b92baf3f68c7..4df64832617d 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -33,8 +33,9 @@ struct ghash_desc_ctx {
u32 count;
};

-asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
- struct ghash_key const *k, const char *head);
+asmlinkage int pmull_ghash_update(int blocks, u64 dg[], const char *src,
+ struct ghash_key const *k, const char *head,
+ struct thread_info *ti);

static int ghash_init(struct shash_desc *desc)
{
@@ -54,6 +55,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,

if ((partial + len) >= GHASH_BLOCK_SIZE) {
struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+ struct thread_info *ti = NULL;
int blocks;

if (partial) {
@@ -64,14 +66,29 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
len -= p;
}

+ /*
+ * Pass current's thread info pointer to pmull_ghash_update()
+ * below if we want it to play nice under preemption.
+ */
+ if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
+ IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
+ ti = current_thread_info();
+
blocks = len / GHASH_BLOCK_SIZE;
len %= GHASH_BLOCK_SIZE;

- kernel_neon_begin_partial(6);
- pmull_ghash_update(blocks, ctx->digest, src, key,
- partial ? ctx->buf : NULL);
- kernel_neon_end();
- src += blocks * GHASH_BLOCK_SIZE;
+ do {
+ int rem;
+
+ kernel_neon_begin_partial(6);
+ rem = pmull_ghash_update(blocks, ctx->digest, src, key,
+ partial ? ctx->buf : NULL, ti);
+ kernel_neon_end();
+
+ src += (blocks - rem) * GHASH_BLOCK_SIZE;
+ blocks = rem;
+ partial = 0;
+ } while (unlikely(ti && blocks > 0));
}
if (len)
memcpy(ctx->buf + partial, src, len);
@@ -89,7 +106,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);

kernel_neon_begin_partial(6);
- pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
+ pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL, NULL);
kernel_neon_end();
}
put_unaligned_be64(ctx->digest[1], dst);
--
1.8.3.2

2014-05-13 18:58:57

by Jussi Kivilinna

[permalink] [raw]

Subject: Re: [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1

On 01.05.2014 18:51, Ard Biesheuvel wrote:
> The Crypto Extensions based SHA1 implementation uses the NEON register file,
> and hence runs with preemption disabled. This patch adds a TIF_NEED_RESCHED
> check to its inner loop so we at least give up the CPU voluntarily when we
> are running in process context and have been tagged for preemption by the
> scheduler.
>
> Signed-off-by: Ard Biesheuvel <[email protected]>
> ---
<snip>
> @@ -42,6 +42,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
> sctx->count += len;
>
> if ((partial + len) >= SHA1_BLOCK_SIZE) {
> + struct thread_info *ti = NULL;
> int blocks;
>
> if (partial) {
> @@ -52,16 +53,30 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
> len -= p;
> }
>
> + /*
> + * Pass current's thread info pointer to sha1_ce_transform()
> + * below if we want it to play nice under preemption.
> + */
> + if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
> + IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
> + ti = current_thread_info();
> +
> blocks = len / SHA1_BLOCK_SIZE;
> len %= SHA1_BLOCK_SIZE;
>
> - kernel_neon_begin_partial(16);
> - sha1_ce_transform(blocks, data, sctx->state,
> - partial ? sctx->buffer : NULL, 0);
> - kernel_neon_end();
> + do {
> + int rem;
> +
> + kernel_neon_begin_partial(16);
> + rem = sha1_ce_transform(blocks, data, sctx->state,
> + partial ? sctx->buffer : NULL,
> + 0, ti);
> + kernel_neon_end();
>
> - data += blocks * SHA1_BLOCK_SIZE;
> - partial = 0;
> + data += (blocks - rem) * SHA1_BLOCK_SIZE;
> + blocks = rem;
> + partial = 0;
> + } while (unlikely(ti && blocks > 0));
> }
> if (len)
> memcpy(sctx->buffer + partial, data, len);
> @@ -94,6 +109,7 @@ static int sha1_finup(struct shash_desc *desc, const u8 *data,
> unsigned int len, u8 *out)
> {
> struct sha1_state *sctx = shash_desc_ctx(desc);
> + struct thread_info *ti = NULL;
> __be32 *dst = (__be32 *)out;
> int blocks;
> int i;
> @@ -111,9 +127,20 @@ static int sha1_finup(struct shash_desc *desc, const u8 *data,
> */
> blocks = len / SHA1_BLOCK_SIZE;
>
> - kernel_neon_begin_partial(16);
> - sha1_ce_transform(blocks, data, sctx->state, NULL, len);
> - kernel_neon_end();
> + if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
> + IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
> + ti = current_thread_info();
> +
> + do {
> + int rem;
> +
> + kernel_neon_begin_partial(16);
> + rem = sha1_ce_transform(blocks, data, sctx->state,
> + NULL, len, ti);
> + kernel_neon_end();
> + data += (blocks - rem) * SHA1_BLOCK_SIZE;
> + blocks = rem;
> + } while (unlikely(ti && blocks > 0));
>

These seem to be similar, how about renaming assembly function to __sha1_ce_transform
and moving this loop to new sha1_ce_transform.

Otherwise, patches looks good.

-Jussi

2014-05-14 01:36:47

by Herbert Xu

[permalink] [raw]

Subject: Re: [PATCH resend 13/15] arm64/crypto: add voluntary preemption to Crypto Extensions SHA1

Ard Biesheuvel <[email protected]> wrote:
>
> + /*
> + * Pass current's thread info pointer to sha1_ce_transform()
> + * below if we want it to play nice under preemption.
> + */
> + if ((IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY) ||
> + IS_ENABLED(CONFIG_PREEMPT)) && !in_interrupt())
> + ti = current_thread_info();

Instead of in_interrupt you should test CRYPTO_TFM_REQ_MAY_SLEEP.
It's not a matter of correctness but it may avoid unnecessary
breaks in the loop.

Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt