LinuxLists.cc - [PATCH] arm: crypto: Add NEON optimized SHA-256

2015-03-16 15:48:40

by Sami Tolvanen

[permalink] [raw]

Subject: [PATCH] arm: crypto: Add NEON optimized SHA-256

Add Andy Polyakov's NEON optimized SHA-256 implementation.

On Nexus 6, this implementation is ~2x faster than sha256-generic.

Signed-off-by: Sami Tolvanen <[email protected]>

---
arch/arm/crypto/Makefile | 2
arch/arm/crypto/sha256-armv7-neon.S | 819 ++++++++++++++++++++++++++++++++++++
arch/arm/crypto/sha256_neon_glue.c | 201 ++++++++
crypto/Kconfig | 12
4 files changed, 1034 insertions(+)

diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index b48fa34..316dba2 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -6,12 +6,14 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM_NEON) += sha256-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o

aes-arm-y := aes-armv4.o aes_glue.o
aes-arm-bs-y := aesbs-core.o aesbs-glue.o
sha1-arm-y := sha1-armv4-large.o sha1_glue.o
sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-y := sha256-armv7-neon.o sha256_neon_glue.o
sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o

quiet_cmd_perl = PERL $@
diff --git a/arch/arm/crypto/sha256-armv7-neon.S b/arch/arm/crypto/sha256-armv7-neon.S
new file mode 100644
index 0000000..5ce04c2
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv7-neon.S
@@ -0,0 +1,819 @@
+@ sha256-armv7-neon.S - ARM/NEON assembly implementation of SHA-256 transform
+@
+@ ====================================================================
+@ Written by Andy Polyakov <[email protected]> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@ ====================================================================
+
+#include <linux/linkage.h>
+
+.text
+.code 32
+.fpu neon
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+.word 0
+.align 5
+
+.align 5
+ENTRY(sha256_transform_neon)
+ /* Input:
+ * %r0: SHA256_CONTEXT
+ * %r1: data
+ * %r2: nblks
+ */
+ sub r3,pc,#8 @ sha256_transform_neon
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ stmdb sp!,{r4-r12,lr}
+
+ mov r12,sp
+ sub sp,sp,#16*4+16 @ alloca
+ sub r14,r3,#256+32 @ K256
+ bic sp,sp,#15 @ align for 128-bit stores
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+ vrev32.8 q0,q0 @ yes, even on
+ str r0,[sp,#64]
+ vrev32.8 q1,q1 @ big-endian
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4-r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b .L_00_48
+
+.align 4
+.L_00_48:
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne .L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8-r11}
+
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ eorne r3,r5,r6
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+ENDPROC(sha256_transform_neon)
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 0000000..698a498
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,201 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ * Copyright ? 2014 Jussi Kivilinna <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+
+asmlinkage void sha256_transform_neon(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+static int sha256_neon_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_transform_neon(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_transform_neon(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = crypto_sha256_update(desc, data, len);
+ } else {
+ kernel_neon_begin();
+ res = __sha256_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ if (!may_use_simd()) {
+ crypto_sha256_update(desc, padding, padlen);
+ crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha256_neon_update() */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_neon_update(desc, padding, padlen, index);
+ }
+ __sha256_neon_update(desc, (const u8 *)&bits,
+ sizeof(bits), 56);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha256_neon_export(struct shash_desc *desc, void *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha256_neon_import(struct shash_desc *desc, const void *in)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_neon_init,
+ .update = sha256_neon_update,
+ .final = sha256_neon_final,
+ .export = sha256_neon_export,
+ .import = sha256_neon_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-neon",
+ .cra_priority = 350,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int __init sha256_neon_mod_init(void)
+{
+ if (!cpu_has_neon())
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit sha256_neon_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(sha256_neon_mod_init);
+module_exit(sha256_neon_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, NEON accelerated");
+
+MODULE_ALIAS("sha256");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 50f4da4..0505523 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -610,6 +610,18 @@ config CRYPTO_SHA256
This code also includes SHA-224, a 224 bit hash with 112 bits
of security against collision attacks.

+config CRYPTO_SHA256_ARM_NEON
+ tristate "SHA256 digest algorithm (ARM NEON)"
+ depends on ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
+ select CRYPTO_SHA256
+ select CRYPTO_HASH
+ help
+ SHA-256 secure hash standard (DFIPS 180-2) implemented
+ using ARM NEON instructions, when available.
+
+ This version of SHA implements a 256 bit hash with 128 bits of
+ security against collision attacks.
+
config CRYPTO_SHA256_SPARC64
tristate "SHA224 and SHA256 digest algorithm (SPARC64)"
depends on SPARC64

2015-03-16 16:08:08

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCH] arm: crypto: Add NEON optimized SHA-256

Hello Sami,

On 16 March 2015 at 16:48, Sami Tolvanen <[email protected]> wrote:
> Add Andy Polyakov's NEON optimized SHA-256 implementation.
>
> On Nexus 6, this implementation is ~2x faster than sha256-generic.
>
> Signed-off-by: Sami Tolvanen <[email protected]>
>

Have you tested this code with the tcrypt.ko module?

Some more comments below

> ---
> arch/arm/crypto/Makefile | 2
> arch/arm/crypto/sha256-armv7-neon.S | 819 ++++++++++++++++++++++++++++++++++++
> arch/arm/crypto/sha256_neon_glue.c | 201 ++++++++
> crypto/Kconfig | 12
> 4 files changed, 1034 insertions(+)
>
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index b48fa34..316dba2 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -6,12 +6,14 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
> obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
> obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
> obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
> +obj-$(CONFIG_CRYPTO_SHA256_ARM_NEON) += sha256-arm-neon.o
> obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
>
> aes-arm-y := aes-armv4.o aes_glue.o
> aes-arm-bs-y := aesbs-core.o aesbs-glue.o
> sha1-arm-y := sha1-armv4-large.o sha1_glue.o
> sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
> +sha256-arm-neon-y := sha256-armv7-neon.o sha256_neon_glue.o
> sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
>
> quiet_cmd_perl = PERL $@
> diff --git a/arch/arm/crypto/sha256-armv7-neon.S b/arch/arm/crypto/sha256-armv7-neon.S
> new file mode 100644
> index 0000000..5ce04c2
> --- /dev/null
> +++ b/arch/arm/crypto/sha256-armv7-neon.S
> @@ -0,0 +1,819 @@
> +@ sha256-armv7-neon.S - ARM/NEON assembly implementation of SHA-256 transform
> +@
> +@ ====================================================================
> +@ Written by Andy Polyakov <[email protected]> for the OpenSSL
> +@ project. The module is, however, dual licensed under OpenSSL and
> +@ CRYPTOGAMS licenses depending on where you obtain it. For further
> +@ details see http://www.openssl.org/~appro/cryptogams/.
> +@ ====================================================================
> +

Did you talk to Andy about the license? I don't think this is
permissible for the kernel as-is.

> +#include <linux/linkage.h>
> +
> +.text
> +.code 32
> +.fpu neon
> +
> +.type K256,%object
> +.align 5
> +K256:
> +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
> +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
> +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
> +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
> +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
> +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
> +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
> +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
> +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
> +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
> +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
> +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
> +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
> +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
> +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
> +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
> +.size K256,.-K256
> +.word 0 @ terminator
> +.word 0
> +.align 5
> +
> +.align 5
> +ENTRY(sha256_transform_neon)
> + /* Input:
> + * %r0: SHA256_CONTEXT
> + * %r1: data
> + * %r2: nblks
> + */
> + sub r3,pc,#8 @ sha256_transform_neon

This is broken on thumb-2, use adr instead

> + add r2,r1,r2,lsl#6 @ len to point at the end of inp
> +
> + stmdb sp!,{r4-r12,lr}
> +
> + mov r12,sp
> + sub sp,sp,#16*4+16 @ alloca
> + sub r14,r3,#256+32 @ K256
> + bic sp,sp,#15 @ align for 128-bit stores
> +
> + vld1.8 {q0},[r1]!
> + vld1.8 {q1},[r1]!
> + vld1.8 {q2},[r1]!
> + vld1.8 {q3},[r1]!
> + vld1.32 {q8},[r14,:128]!
> + vld1.32 {q9},[r14,:128]!
> + vld1.32 {q10},[r14,:128]!
> + vld1.32 {q11},[r14,:128]!
> + vrev32.8 q0,q0 @ yes, even on
> + str r0,[sp,#64]
> + vrev32.8 q1,q1 @ big-endian
> + str r1,[sp,#68]
> + mov r1,sp
> + vrev32.8 q2,q2
> + str r2,[sp,#72]
> + vrev32.8 q3,q3
> + str r12,[sp,#76] @ save original sp
> + vadd.i32 q8,q8,q0
> + vadd.i32 q9,q9,q1
> + vst1.32 {q8},[r1,:128]!
> + vadd.i32 q10,q10,q2
> + vst1.32 {q9},[r1,:128]!
> + vadd.i32 q11,q11,q3
> + vst1.32 {q10},[r1,:128]!
> + vst1.32 {q11},[r1,:128]!
> +
> + ldmia r0,{r4-r11}
> + sub r1,r1,#64
> + ldr r2,[sp,#0]
> + eor r12,r12,r12
> + eor r3,r5,r6
> + b .L_00_48
> +
> +.align 4
> +.L_00_48:
> + vext.8 q8,q0,q1,#4
> + add r11,r11,r2
> + eor r2,r9,r10
> + eor r0,r8,r8,ror#5
> + vext.8 q9,q2,q3,#4
> + add r4,r4,r12
> + and r2,r2,r8
> + eor r12,r0,r8,ror#19
> + vshr.u32 q10,q8,#7
> + eor r0,r4,r4,ror#11
> + eor r2,r2,r10
> + vadd.i32 q0,q0,q9
> + add r11,r11,r12,ror#6
> + eor r12,r4,r5
> + vshr.u32 q9,q8,#3
> + eor r0,r0,r4,ror#20
> + add r11,r11,r2
> + vsli.32 q10,q8,#25
> + ldr r2,[sp,#4]
> + and r3,r3,r12
> + vshr.u32 q11,q8,#18
> + add r7,r7,r11
> + add r11,r11,r0,ror#2
> + eor r3,r3,r5
> + veor q9,q9,q10
> + add r10,r10,r2
> + vsli.32 q11,q8,#14
> + eor r2,r8,r9
> + eor r0,r7,r7,ror#5
> + vshr.u32 d24,d7,#17
> + add r11,r11,r3
> + and r2,r2,r7
> + veor q9,q9,q11
> + eor r3,r0,r7,ror#19
> + eor r0,r11,r11,ror#11
> + vsli.32 d24,d7,#15
> + eor r2,r2,r9
> + add r10,r10,r3,ror#6
> + vshr.u32 d25,d7,#10
> + eor r3,r11,r4
> + eor r0,r0,r11,ror#20
> + vadd.i32 q0,q0,q9
> + add r10,r10,r2
> + ldr r2,[sp,#8]
> + veor d25,d25,d24
> + and r12,r12,r3
> + add r6,r6,r10
> + vshr.u32 d24,d7,#19
> + add r10,r10,r0,ror#2
> + eor r12,r12,r4
> + vsli.32 d24,d7,#13
> + add r9,r9,r2
> + eor r2,r7,r8
> + veor d25,d25,d24
> + eor r0,r6,r6,ror#5
> + add r10,r10,r12
> + vadd.i32 d0,d0,d25
> + and r2,r2,r6
> + eor r12,r0,r6,ror#19
> + vshr.u32 d24,d0,#17
> + eor r0,r10,r10,ror#11
> + eor r2,r2,r8
> + vsli.32 d24,d0,#15
> + add r9,r9,r12,ror#6
> + eor r12,r10,r11
> + vshr.u32 d25,d0,#10
> + eor r0,r0,r10,ror#20
> + add r9,r9,r2
> + veor d25,d25,d24
> + ldr r2,[sp,#12]
> + and r3,r3,r12
> + vshr.u32 d24,d0,#19
> + add r5,r5,r9
> + add r9,r9,r0,ror#2
> + eor r3,r3,r11
> + vld1.32 {q8},[r14,:128]!
> + add r8,r8,r2
> + vsli.32 d24,d0,#13
> + eor r2,r6,r7
> + eor r0,r5,r5,ror#5
> + veor d25,d25,d24
> + add r9,r9,r3
> + and r2,r2,r5
> + vadd.i32 d1,d1,d25
> + eor r3,r0,r5,ror#19
> + eor r0,r9,r9,ror#11
> + vadd.i32 q8,q8,q0
> + eor r2,r2,r7
> + add r8,r8,r3,ror#6
> + eor r3,r9,r10
> + eor r0,r0,r9,ror#20
> + add r8,r8,r2
> + ldr r2,[sp,#16]
> + and r12,r12,r3
> + add r4,r4,r8
> + vst1.32 {q8},[r1,:128]!
> + add r8,r8,r0,ror#2
> + eor r12,r12,r10
> + vext.8 q8,q1,q2,#4
> + add r7,r7,r2
> + eor r2,r5,r6
> + eor r0,r4,r4,ror#5
> + vext.8 q9,q3,q0,#4
> + add r8,r8,r12
> + and r2,r2,r4
> + eor r12,r0,r4,ror#19
> + vshr.u32 q10,q8,#7
> + eor r0,r8,r8,ror#11
> + eor r2,r2,r6
> + vadd.i32 q1,q1,q9
> + add r7,r7,r12,ror#6
> + eor r12,r8,r9
> + vshr.u32 q9,q8,#3
> + eor r0,r0,r8,ror#20
> + add r7,r7,r2
> + vsli.32 q10,q8,#25
> + ldr r2,[sp,#20]
> + and r3,r3,r12
> + vshr.u32 q11,q8,#18
> + add r11,r11,r7
> + add r7,r7,r0,ror#2
> + eor r3,r3,r9
> + veor q9,q9,q10
> + add r6,r6,r2
> + vsli.32 q11,q8,#14
> + eor r2,r4,r5
> + eor r0,r11,r11,ror#5
> + vshr.u32 d24,d1,#17
> + add r7,r7,r3
> + and r2,r2,r11
> + veor q9,q9,q11
> + eor r3,r0,r11,ror#19
> + eor r0,r7,r7,ror#11
> + vsli.32 d24,d1,#15
> + eor r2,r2,r5
> + add r6,r6,r3,ror#6
> + vshr.u32 d25,d1,#10
> + eor r3,r7,r8
> + eor r0,r0,r7,ror#20
> + vadd.i32 q1,q1,q9
> + add r6,r6,r2
> + ldr r2,[sp,#24]
> + veor d25,d25,d24
> + and r12,r12,r3
> + add r10,r10,r6
> + vshr.u32 d24,d1,#19
> + add r6,r6,r0,ror#2
> + eor r12,r12,r8
> + vsli.32 d24,d1,#13
> + add r5,r5,r2
> + eor r2,r11,r4
> + veor d25,d25,d24
> + eor r0,r10,r10,ror#5
> + add r6,r6,r12
> + vadd.i32 d2,d2,d25
> + and r2,r2,r10
> + eor r12,r0,r10,ror#19
> + vshr.u32 d24,d2,#17
> + eor r0,r6,r6,ror#11
> + eor r2,r2,r4
> + vsli.32 d24,d2,#15
> + add r5,r5,r12,ror#6
> + eor r12,r6,r7
> + vshr.u32 d25,d2,#10
> + eor r0,r0,r6,ror#20
> + add r5,r5,r2
> + veor d25,d25,d24
> + ldr r2,[sp,#28]
> + and r3,r3,r12
> + vshr.u32 d24,d2,#19
> + add r9,r9,r5
> + add r5,r5,r0,ror#2
> + eor r3,r3,r7
> + vld1.32 {q8},[r14,:128]!
> + add r4,r4,r2
> + vsli.32 d24,d2,#13
> + eor r2,r10,r11
> + eor r0,r9,r9,ror#5
> + veor d25,d25,d24
> + add r5,r5,r3
> + and r2,r2,r9
> + vadd.i32 d3,d3,d25
> + eor r3,r0,r9,ror#19
> + eor r0,r5,r5,ror#11
> + vadd.i32 q8,q8,q1
> + eor r2,r2,r11
> + add r4,r4,r3,ror#6
> + eor r3,r5,r6
> + eor r0,r0,r5,ror#20
> + add r4,r4,r2
> + ldr r2,[sp,#32]
> + and r12,r12,r3
> + add r8,r8,r4
> + vst1.32 {q8},[r1,:128]!
> + add r4,r4,r0,ror#2
> + eor r12,r12,r6
> + vext.8 q8,q2,q3,#4
> + add r11,r11,r2
> + eor r2,r9,r10
> + eor r0,r8,r8,ror#5
> + vext.8 q9,q0,q1,#4
> + add r4,r4,r12
> + and r2,r2,r8
> + eor r12,r0,r8,ror#19
> + vshr.u32 q10,q8,#7
> + eor r0,r4,r4,ror#11
> + eor r2,r2,r10
> + vadd.i32 q2,q2,q9
> + add r11,r11,r12,ror#6
> + eor r12,r4,r5
> + vshr.u32 q9,q8,#3
> + eor r0,r0,r4,ror#20
> + add r11,r11,r2
> + vsli.32 q10,q8,#25
> + ldr r2,[sp,#36]
> + and r3,r3,r12
> + vshr.u32 q11,q8,#18
> + add r7,r7,r11
> + add r11,r11,r0,ror#2
> + eor r3,r3,r5
> + veor q9,q9,q10
> + add r10,r10,r2
> + vsli.32 q11,q8,#14
> + eor r2,r8,r9
> + eor r0,r7,r7,ror#5
> + vshr.u32 d24,d3,#17
> + add r11,r11,r3
> + and r2,r2,r7
> + veor q9,q9,q11
> + eor r3,r0,r7,ror#19
> + eor r0,r11,r11,ror#11
> + vsli.32 d24,d3,#15
> + eor r2,r2,r9
> + add r10,r10,r3,ror#6
> + vshr.u32 d25,d3,#10
> + eor r3,r11,r4
> + eor r0,r0,r11,ror#20
> + vadd.i32 q2,q2,q9
> + add r10,r10,r2
> + ldr r2,[sp,#40]
> + veor d25,d25,d24
> + and r12,r12,r3
> + add r6,r6,r10
> + vshr.u32 d24,d3,#19
> + add r10,r10,r0,ror#2
> + eor r12,r12,r4
> + vsli.32 d24,d3,#13
> + add r9,r9,r2
> + eor r2,r7,r8
> + veor d25,d25,d24
> + eor r0,r6,r6,ror#5
> + add r10,r10,r12
> + vadd.i32 d4,d4,d25
> + and r2,r2,r6
> + eor r12,r0,r6,ror#19
> + vshr.u32 d24,d4,#17
> + eor r0,r10,r10,ror#11
> + eor r2,r2,r8
> + vsli.32 d24,d4,#15
> + add r9,r9,r12,ror#6
> + eor r12,r10,r11
> + vshr.u32 d25,d4,#10
> + eor r0,r0,r10,ror#20
> + add r9,r9,r2
> + veor d25,d25,d24
> + ldr r2,[sp,#44]
> + and r3,r3,r12
> + vshr.u32 d24,d4,#19
> + add r5,r5,r9
> + add r9,r9,r0,ror#2
> + eor r3,r3,r11
> + vld1.32 {q8},[r14,:128]!
> + add r8,r8,r2
> + vsli.32 d24,d4,#13
> + eor r2,r6,r7
> + eor r0,r5,r5,ror#5
> + veor d25,d25,d24
> + add r9,r9,r3
> + and r2,r2,r5
> + vadd.i32 d5,d5,d25
> + eor r3,r0,r5,ror#19
> + eor r0,r9,r9,ror#11
> + vadd.i32 q8,q8,q2
> + eor r2,r2,r7
> + add r8,r8,r3,ror#6
> + eor r3,r9,r10
> + eor r0,r0,r9,ror#20
> + add r8,r8,r2
> + ldr r2,[sp,#48]
> + and r12,r12,r3
> + add r4,r4,r8
> + vst1.32 {q8},[r1,:128]!
> + add r8,r8,r0,ror#2
> + eor r12,r12,r10
> + vext.8 q8,q3,q0,#4
> + add r7,r7,r2
> + eor r2,r5,r6
> + eor r0,r4,r4,ror#5
> + vext.8 q9,q1,q2,#4
> + add r8,r8,r12
> + and r2,r2,r4
> + eor r12,r0,r4,ror#19
> + vshr.u32 q10,q8,#7
> + eor r0,r8,r8,ror#11
> + eor r2,r2,r6
> + vadd.i32 q3,q3,q9
> + add r7,r7,r12,ror#6
> + eor r12,r8,r9
> + vshr.u32 q9,q8,#3
> + eor r0,r0,r8,ror#20
> + add r7,r7,r2
> + vsli.32 q10,q8,#25
> + ldr r2,[sp,#52]
> + and r3,r3,r12
> + vshr.u32 q11,q8,#18
> + add r11,r11,r7
> + add r7,r7,r0,ror#2
> + eor r3,r3,r9
> + veor q9,q9,q10
> + add r6,r6,r2
> + vsli.32 q11,q8,#14
> + eor r2,r4,r5
> + eor r0,r11,r11,ror#5
> + vshr.u32 d24,d5,#17
> + add r7,r7,r3
> + and r2,r2,r11
> + veor q9,q9,q11
> + eor r3,r0,r11,ror#19
> + eor r0,r7,r7,ror#11
> + vsli.32 d24,d5,#15
> + eor r2,r2,r5
> + add r6,r6,r3,ror#6
> + vshr.u32 d25,d5,#10
> + eor r3,r7,r8
> + eor r0,r0,r7,ror#20
> + vadd.i32 q3,q3,q9
> + add r6,r6,r2
> + ldr r2,[sp,#56]
> + veor d25,d25,d24
> + and r12,r12,r3
> + add r10,r10,r6
> + vshr.u32 d24,d5,#19
> + add r6,r6,r0,ror#2
> + eor r12,r12,r8
> + vsli.32 d24,d5,#13
> + add r5,r5,r2
> + eor r2,r11,r4
> + veor d25,d25,d24
> + eor r0,r10,r10,ror#5
> + add r6,r6,r12
> + vadd.i32 d6,d6,d25
> + and r2,r2,r10
> + eor r12,r0,r10,ror#19
> + vshr.u32 d24,d6,#17
> + eor r0,r6,r6,ror#11
> + eor r2,r2,r4
> + vsli.32 d24,d6,#15
> + add r5,r5,r12,ror#6
> + eor r12,r6,r7
> + vshr.u32 d25,d6,#10
> + eor r0,r0,r6,ror#20
> + add r5,r5,r2
> + veor d25,d25,d24
> + ldr r2,[sp,#60]
> + and r3,r3,r12
> + vshr.u32 d24,d6,#19
> + add r9,r9,r5
> + add r5,r5,r0,ror#2
> + eor r3,r3,r7
> + vld1.32 {q8},[r14,:128]!
> + add r4,r4,r2
> + vsli.32 d24,d6,#13
> + eor r2,r10,r11
> + eor r0,r9,r9,ror#5
> + veor d25,d25,d24
> + add r5,r5,r3
> + and r2,r2,r9
> + vadd.i32 d7,d7,d25
> + eor r3,r0,r9,ror#19
> + eor r0,r5,r5,ror#11
> + vadd.i32 q8,q8,q3
> + eor r2,r2,r11
> + add r4,r4,r3,ror#6
> + eor r3,r5,r6
> + eor r0,r0,r5,ror#20
> + add r4,r4,r2
> + ldr r2,[r14]
> + and r12,r12,r3
> + add r8,r8,r4
> + vst1.32 {q8},[r1,:128]!
> + add r4,r4,r0,ror#2
> + eor r12,r12,r6
> + teq r2,#0 @ check for K256 terminator
> + ldr r2,[sp,#0]
> + sub r1,r1,#64
> + bne .L_00_48
> +
> + ldr r1,[sp,#68]
> + ldr r0,[sp,#72]
> + sub r14,r14,#256 @ rewind r14
> + teq r1,r0
> + subeq r1,r1,#64 @ avoid SEGV
> + vld1.8 {q0},[r1]! @ load next input block
> + vld1.8 {q1},[r1]!
> + vld1.8 {q2},[r1]!
> + vld1.8 {q3},[r1]!
> + strne r1,[sp,#68]
> + mov r1,sp
> + add r11,r11,r2
> + eor r2,r9,r10
> + eor r0,r8,r8,ror#5
> + add r4,r4,r12
> + vld1.32 {q8},[r14,:128]!
> + and r2,r2,r8
> + eor r12,r0,r8,ror#19
> + eor r0,r4,r4,ror#11
> + eor r2,r2,r10
> + vrev32.8 q0,q0
> + add r11,r11,r12,ror#6
> + eor r12,r4,r5
> + eor r0,r0,r4,ror#20
> + add r11,r11,r2
> + vadd.i32 q8,q8,q0
> + ldr r2,[sp,#4]
> + and r3,r3,r12
> + add r7,r7,r11
> + add r11,r11,r0,ror#2
> + eor r3,r3,r5
> + add r10,r10,r2
> + eor r2,r8,r9
> + eor r0,r7,r7,ror#5
> + add r11,r11,r3
> + and r2,r2,r7
> + eor r3,r0,r7,ror#19
> + eor r0,r11,r11,ror#11
> + eor r2,r2,r9
> + add r10,r10,r3,ror#6
> + eor r3,r11,r4
> + eor r0,r0,r11,ror#20
> + add r10,r10,r2
> + ldr r2,[sp,#8]
> + and r12,r12,r3
> + add r6,r6,r10
> + add r10,r10,r0,ror#2
> + eor r12,r12,r4
> + add r9,r9,r2
> + eor r2,r7,r8
> + eor r0,r6,r6,ror#5
> + add r10,r10,r12
> + and r2,r2,r6
> + eor r12,r0,r6,ror#19
> + eor r0,r10,r10,ror#11
> + eor r2,r2,r8
> + add r9,r9,r12,ror#6
> + eor r12,r10,r11
> + eor r0,r0,r10,ror#20
> + add r9,r9,r2
> + ldr r2,[sp,#12]
> + and r3,r3,r12
> + add r5,r5,r9
> + add r9,r9,r0,ror#2
> + eor r3,r3,r11
> + add r8,r8,r2
> + eor r2,r6,r7
> + eor r0,r5,r5,ror#5
> + add r9,r9,r3
> + and r2,r2,r5
> + eor r3,r0,r5,ror#19
> + eor r0,r9,r9,ror#11
> + eor r2,r2,r7
> + add r8,r8,r3,ror#6
> + eor r3,r9,r10
> + eor r0,r0,r9,ror#20
> + add r8,r8,r2
> + ldr r2,[sp,#16]
> + and r12,r12,r3
> + add r4,r4,r8
> + add r8,r8,r0,ror#2
> + eor r12,r12,r10
> + vst1.32 {q8},[r1,:128]!
> + add r7,r7,r2
> + eor r2,r5,r6
> + eor r0,r4,r4,ror#5
> + add r8,r8,r12
> + vld1.32 {q8},[r14,:128]!
> + and r2,r2,r4
> + eor r12,r0,r4,ror#19
> + eor r0,r8,r8,ror#11
> + eor r2,r2,r6
> + vrev32.8 q1,q1
> + add r7,r7,r12,ror#6
> + eor r12,r8,r9
> + eor r0,r0,r8,ror#20
> + add r7,r7,r2
> + vadd.i32 q8,q8,q1
> + ldr r2,[sp,#20]
> + and r3,r3,r12
> + add r11,r11,r7
> + add r7,r7,r0,ror#2
> + eor r3,r3,r9
> + add r6,r6,r2
> + eor r2,r4,r5
> + eor r0,r11,r11,ror#5
> + add r7,r7,r3
> + and r2,r2,r11
> + eor r3,r0,r11,ror#19
> + eor r0,r7,r7,ror#11
> + eor r2,r2,r5
> + add r6,r6,r3,ror#6
> + eor r3,r7,r8
> + eor r0,r0,r7,ror#20
> + add r6,r6,r2
> + ldr r2,[sp,#24]
> + and r12,r12,r3
> + add r10,r10,r6
> + add r6,r6,r0,ror#2
> + eor r12,r12,r8
> + add r5,r5,r2
> + eor r2,r11,r4
> + eor r0,r10,r10,ror#5
> + add r6,r6,r12
> + and r2,r2,r10
> + eor r12,r0,r10,ror#19
> + eor r0,r6,r6,ror#11
> + eor r2,r2,r4
> + add r5,r5,r12,ror#6
> + eor r12,r6,r7
> + eor r0,r0,r6,ror#20
> + add r5,r5,r2
> + ldr r2,[sp,#28]
> + and r3,r3,r12
> + add r9,r9,r5
> + add r5,r5,r0,ror#2
> + eor r3,r3,r7
> + add r4,r4,r2
> + eor r2,r10,r11
> + eor r0,r9,r9,ror#5
> + add r5,r5,r3
> + and r2,r2,r9
> + eor r3,r0,r9,ror#19
> + eor r0,r5,r5,ror#11
> + eor r2,r2,r11
> + add r4,r4,r3,ror#6
> + eor r3,r5,r6
> + eor r0,r0,r5,ror#20
> + add r4,r4,r2
> + ldr r2,[sp,#32]
> + and r12,r12,r3
> + add r8,r8,r4
> + add r4,r4,r0,ror#2
> + eor r12,r12,r6
> + vst1.32 {q8},[r1,:128]!
> + add r11,r11,r2
> + eor r2,r9,r10
> + eor r0,r8,r8,ror#5
> + add r4,r4,r12
> + vld1.32 {q8},[r14,:128]!
> + and r2,r2,r8
> + eor r12,r0,r8,ror#19
> + eor r0,r4,r4,ror#11
> + eor r2,r2,r10
> + vrev32.8 q2,q2
> + add r11,r11,r12,ror#6
> + eor r12,r4,r5
> + eor r0,r0,r4,ror#20
> + add r11,r11,r2
> + vadd.i32 q8,q8,q2
> + ldr r2,[sp,#36]
> + and r3,r3,r12
> + add r7,r7,r11
> + add r11,r11,r0,ror#2
> + eor r3,r3,r5
> + add r10,r10,r2
> + eor r2,r8,r9
> + eor r0,r7,r7,ror#5
> + add r11,r11,r3
> + and r2,r2,r7
> + eor r3,r0,r7,ror#19
> + eor r0,r11,r11,ror#11
> + eor r2,r2,r9
> + add r10,r10,r3,ror#6
> + eor r3,r11,r4
> + eor r0,r0,r11,ror#20
> + add r10,r10,r2
> + ldr r2,[sp,#40]
> + and r12,r12,r3
> + add r6,r6,r10
> + add r10,r10,r0,ror#2
> + eor r12,r12,r4
> + add r9,r9,r2
> + eor r2,r7,r8
> + eor r0,r6,r6,ror#5
> + add r10,r10,r12
> + and r2,r2,r6
> + eor r12,r0,r6,ror#19
> + eor r0,r10,r10,ror#11
> + eor r2,r2,r8
> + add r9,r9,r12,ror#6
> + eor r12,r10,r11
> + eor r0,r0,r10,ror#20
> + add r9,r9,r2
> + ldr r2,[sp,#44]
> + and r3,r3,r12
> + add r5,r5,r9
> + add r9,r9,r0,ror#2
> + eor r3,r3,r11
> + add r8,r8,r2
> + eor r2,r6,r7
> + eor r0,r5,r5,ror#5
> + add r9,r9,r3
> + and r2,r2,r5
> + eor r3,r0,r5,ror#19
> + eor r0,r9,r9,ror#11
> + eor r2,r2,r7
> + add r8,r8,r3,ror#6
> + eor r3,r9,r10
> + eor r0,r0,r9,ror#20
> + add r8,r8,r2
> + ldr r2,[sp,#48]
> + and r12,r12,r3
> + add r4,r4,r8
> + add r8,r8,r0,ror#2
> + eor r12,r12,r10
> + vst1.32 {q8},[r1,:128]!
> + add r7,r7,r2
> + eor r2,r5,r6
> + eor r0,r4,r4,ror#5
> + add r8,r8,r12
> + vld1.32 {q8},[r14,:128]!
> + and r2,r2,r4
> + eor r12,r0,r4,ror#19
> + eor r0,r8,r8,ror#11
> + eor r2,r2,r6
> + vrev32.8 q3,q3
> + add r7,r7,r12,ror#6
> + eor r12,r8,r9
> + eor r0,r0,r8,ror#20
> + add r7,r7,r2
> + vadd.i32 q8,q8,q3
> + ldr r2,[sp,#52]
> + and r3,r3,r12
> + add r11,r11,r7
> + add r7,r7,r0,ror#2
> + eor r3,r3,r9
> + add r6,r6,r2
> + eor r2,r4,r5
> + eor r0,r11,r11,ror#5
> + add r7,r7,r3
> + and r2,r2,r11
> + eor r3,r0,r11,ror#19
> + eor r0,r7,r7,ror#11
> + eor r2,r2,r5
> + add r6,r6,r3,ror#6
> + eor r3,r7,r8
> + eor r0,r0,r7,ror#20
> + add r6,r6,r2
> + ldr r2,[sp,#56]
> + and r12,r12,r3
> + add r10,r10,r6
> + add r6,r6,r0,ror#2
> + eor r12,r12,r8
> + add r5,r5,r2
> + eor r2,r11,r4
> + eor r0,r10,r10,ror#5
> + add r6,r6,r12
> + and r2,r2,r10
> + eor r12,r0,r10,ror#19
> + eor r0,r6,r6,ror#11
> + eor r2,r2,r4
> + add r5,r5,r12,ror#6
> + eor r12,r6,r7
> + eor r0,r0,r6,ror#20
> + add r5,r5,r2
> + ldr r2,[sp,#60]
> + and r3,r3,r12
> + add r9,r9,r5
> + add r5,r5,r0,ror#2
> + eor r3,r3,r7
> + add r4,r4,r2
> + eor r2,r10,r11
> + eor r0,r9,r9,ror#5
> + add r5,r5,r3
> + and r2,r2,r9
> + eor r3,r0,r9,ror#19
> + eor r0,r5,r5,ror#11
> + eor r2,r2,r11
> + add r4,r4,r3,ror#6
> + eor r3,r5,r6
> + eor r0,r0,r5,ror#20
> + add r4,r4,r2
> + ldr r2,[sp,#64]
> + and r12,r12,r3
> + add r8,r8,r4
> + add r4,r4,r0,ror#2
> + eor r12,r12,r6
> + vst1.32 {q8},[r1,:128]!
> + ldr r0,[r2,#0]
> + add r4,r4,r12 @ h+=Maj(a,b,c) from the past
> + ldr r12,[r2,#4]
> + ldr r3,[r2,#8]
> + ldr r1,[r2,#12]
> + add r4,r4,r0 @ accumulate
> + ldr r0,[r2,#16]
> + add r5,r5,r12
> + ldr r12,[r2,#20]
> + add r6,r6,r3
> + ldr r3,[r2,#24]
> + add r7,r7,r1
> + ldr r1,[r2,#28]
> + add r8,r8,r0
> + str r4,[r2],#4
> + add r9,r9,r12
> + str r5,[r2],#4
> + add r10,r10,r3
> + str r6,[r2],#4
> + add r11,r11,r1
> + str r7,[r2],#4
> + stmia r2,{r8-r11}
> +
> + movne r1,sp
> + ldrne r2,[sp,#0]
> + eorne r12,r12,r12
> + ldreq sp,[sp,#76] @ restore original sp
> + eorne r3,r5,r6
> + bne .L_00_48
> +
> + ldmia sp!,{r4-r12,pc}
> +ENDPROC(sha256_transform_neon)
> diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
> new file mode 100644
> index 0000000..698a498
> --- /dev/null
> +++ b/arch/arm/crypto/sha256_neon_glue.c
> @@ -0,0 +1,201 @@
> +/*
> + * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
> + * using NEON instructions.
> + *
> + * Copyright © 2015 Google Inc.
> + *
> + * This file is based on sha512_neon_glue.c:
> + * Copyright © 2014 Jussi Kivilinna <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License as published by the Free
> + * Software Foundation; either version 2 of the License, or (at your option)
> + * any later version.
> + *
> + */
> +
> +#include <crypto/internal/hash.h>
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/mm.h>
> +#include <linux/cryptohash.h>
> +#include <linux/types.h>
> +#include <linux/string.h>
> +#include <crypto/sha.h>
> +#include <asm/byteorder.h>
> +#include <asm/simd.h>
> +#include <asm/neon.h>
> +
> +asmlinkage void sha256_transform_neon(u32 *digest, const void *data,
> + unsigned int num_blks);
> +
> +
> +static int sha256_neon_init(struct shash_desc *desc)
> +{
> + struct sha256_state *sctx = shash_desc_ctx(desc);
> +
> + sctx->state[0] = SHA256_H0;
> + sctx->state[1] = SHA256_H1;
> + sctx->state[2] = SHA256_H2;
> + sctx->state[3] = SHA256_H3;
> + sctx->state[4] = SHA256_H4;
> + sctx->state[5] = SHA256_H5;
> + sctx->state[6] = SHA256_H6;
> + sctx->state[7] = SHA256_H7;
> + sctx->count = 0;
> +
> + return 0;
> +}
> +
> +static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
> + unsigned int len, unsigned int partial)
> +{
> + struct sha256_state *sctx = shash_desc_ctx(desc);
> + unsigned int done = 0;
> +
> + sctx->count += len;
> +
> + if (partial) {
> + done = SHA256_BLOCK_SIZE - partial;
> + memcpy(sctx->buf + partial, data, done);
> + sha256_transform_neon(sctx->state, sctx->buf, 1);
> + }
> +
> + if (len - done >= SHA256_BLOCK_SIZE) {
> + const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
> +
> + sha256_transform_neon(sctx->state, data + done, rounds);
> + done += rounds * SHA256_BLOCK_SIZE;
> + }
> +
> + memcpy(sctx->buf, data + done, len - done);
> +
> + return 0;
> +}
> +
> +static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
> + unsigned int len)
> +{
> + struct sha256_state *sctx = shash_desc_ctx(desc);
> + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
> + int res;
> +
> + /* Handle the fast case right here */
> + if (partial + len < SHA256_BLOCK_SIZE) {
> + sctx->count += len;
> + memcpy(sctx->buf + partial, data, len);
> +
> + return 0;
> + }
> +
> + if (!may_use_simd()) {
> + res = crypto_sha256_update(desc, data, len);
> + } else {
> + kernel_neon_begin();
> + res = __sha256_neon_update(desc, data, len, partial);
> + kernel_neon_end();
> + }
> +
> + return res;
> +}
> +
> +/* Add padding and return the message digest. */
> +static int sha256_neon_final(struct shash_desc *desc, u8 *out)
> +{
> + struct sha256_state *sctx = shash_desc_ctx(desc);
> + unsigned int i, index, padlen;
> + __be32 *dst = (__be32 *)out;
> + __be64 bits;
> + static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
> +
> + /* save number of bits */
> + bits = cpu_to_be64(sctx->count << 3);
> +
> + /* Pad out to 56 mod 64 and append length */
> + index = sctx->count % SHA256_BLOCK_SIZE;
> + padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
> +
> + if (!may_use_simd()) {
> + crypto_sha256_update(desc, padding, padlen);
> + crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
> + } else {
> + kernel_neon_begin();
> + /* We need to fill a whole block for __sha256_neon_update() */
> + if (padlen <= 56) {
> + sctx->count += padlen;
> + memcpy(sctx->buf + index, padding, padlen);
> + } else {
> + __sha256_neon_update(desc, padding, padlen, index);
> + }
> + __sha256_neon_update(desc, (const u8 *)&bits,
> + sizeof(bits), 56);
> + kernel_neon_end();
> + }
> +
> + /* Store state in digest */
> + for (i = 0; i < 8; i++)
> + dst[i] = cpu_to_be32(sctx->state[i]);
> +
> + /* Wipe context */
> + memset(sctx, 0, sizeof(*sctx));
> +
> + return 0;
> +}
> +
> +static int sha256_neon_export(struct shash_desc *desc, void *out)
> +{
> + struct sha256_state *sctx = shash_desc_ctx(desc);
> +
> + memcpy(out, sctx, sizeof(*sctx));
> +
> + return 0;
> +}
> +
> +static int sha256_neon_import(struct shash_desc *desc, const void *in)
> +{
> + struct sha256_state *sctx = shash_desc_ctx(desc);
> +
> + memcpy(sctx, in, sizeof(*sctx));
> +
> + return 0;
> +}
> +
> +static struct shash_alg alg = {
> + .digestsize = SHA256_DIGEST_SIZE,
> + .init = sha256_neon_init,
> + .update = sha256_neon_update,
> + .final = sha256_neon_final,
> + .export = sha256_neon_export,
> + .import = sha256_neon_import,
> + .descsize = sizeof(struct sha256_state),
> + .statesize = sizeof(struct sha256_state),
> + .base = {
> + .cra_name = "sha256",
> + .cra_driver_name = "sha256-neon",
> + .cra_priority = 350,
> + .cra_flags = CRYPTO_ALG_TYPE_SHASH,
> + .cra_blocksize = SHA256_BLOCK_SIZE,
> + .cra_module = THIS_MODULE,
> + }
> +};
> +

You can also implement SHA-224 using the same core transform, it's
just some trivial glue code.

> +static int __init sha256_neon_mod_init(void)
> +{
> + if (!cpu_has_neon())
> + return -ENODEV;
> +
> + return crypto_register_shash(&alg);
> +}
> +
> +static void __exit sha256_neon_mod_fini(void)
> +{
> + crypto_unregister_shash(&alg);
> +}
> +
> +module_init(sha256_neon_mod_init);
> +module_exit(sha256_neon_mod_fini);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, NEON accelerated");
> +
> +MODULE_ALIAS("sha256");
> diff --git a/crypto/Kconfig b/crypto/Kconfig
> index 50f4da4..0505523 100644
> --- a/crypto/Kconfig
> +++ b/crypto/Kconfig
> @@ -610,6 +610,18 @@ config CRYPTO_SHA256
> This code also includes SHA-224, a 224 bit hash with 112 bits
> of security against collision attacks.
>
> +config CRYPTO_SHA256_ARM_NEON
> + tristate "SHA256 digest algorithm (ARM NEON)"
> + depends on ARM && KERNEL_MODE_NEON && !CPU_BIG_ENDIAN
> + select CRYPTO_SHA256
> + select CRYPTO_HASH
> + help
> + SHA-256 secure hash standard (DFIPS 180-2) implemented
> + using ARM NEON instructions, when available.
> +
> + This version of SHA implements a 256 bit hash with 128 bits of
> + security against collision attacks.
> +

Could you please rebase this onto Herbert's cryptodev tree and move
this to arch/arm/crypto/Kconfig?

> config CRYPTO_SHA256_SPARC64
> tristate "SHA224 and SHA256 digest algorithm (SPARC64)"
> depends on SPARC64

Regards,
Ard.

2015-03-16 16:23:08

by Sami Tolvanen

[permalink] [raw]

Subject: Re: [PATCH] arm: crypto: Add NEON optimized SHA-256

On Mon, Mar 16, 2015 at 05:08:03PM +0100, Ard Biesheuvel wrote:
> Have you tested this code with the tcrypt.ko module?

I have not, but I can look into it.

> Did you talk to Andy about the license? I don't think this is
> permissible for the kernel as-is.

Unless I have misunderstood something, the license at the Cryptogams
website includes an option to license the code under the GNU GPL.

However, I can certainly contact Andy to clarify his intentions.

> This is broken on thumb-2, use adr instead

> You can also implement SHA-224 using the same core transform, it's
> just some trivial glue code.

> Could you please rebase this onto Herbert's cryptodev tree and move
> this to arch/arm/crypto/Kconfig?

Thanks for the comments, I will submit a second version once we have
a clarification on the license.

Sami

2015-03-17 06:56:03

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCH] arm: crypto: Add NEON optimized SHA-256

On 16 March 2015 at 17:23, Sami Tolvanen <[email protected]> wrote:
> On Mon, Mar 16, 2015 at 05:08:03PM +0100, Ard Biesheuvel wrote:
>> Have you tested this code with the tcrypt.ko module?
>
> I have not, but I can look into it.
>
>> Did you talk to Andy about the license? I don't think this is
>> permissible for the kernel as-is.
>
> Unless I have misunderstood something, the license at the Cryptogams
> website includes an option to license the code under the GNU GPL.
>
> However, I can certainly contact Andy to clarify his intentions.
>
>> This is broken on thumb-2, use adr instead
>
>> You can also implement SHA-224 using the same core transform, it's
>> just some trivial glue code.
>
>> Could you please rebase this onto Herbert's cryptodev tree and move
>> this to arch/arm/crypto/Kconfig?
>
> Thanks for the comments, I will submit a second version once we have
> a clarification on the license.
>

I have tested your code with a Thumb2 kernel. With this patch folded,
it builds and passes the tcrypt.ko test for me

--- a/arch/arm/crypto/sha256-armv7-neon.S
+++ b/arch/arm/crypto/sha256-armv7-neon.S
@@ -10,7 +10,6 @@
#include <linux/linkage.h>

.text
-.code 32
.fpu neon

.type K256,%object
@@ -44,15 +43,15 @@ ENTRY(sha256_transform_neon)
* %r1: data
* %r2: nblks
*/
- sub r3,pc,#8 @ sha256_transform_neon
add r2,r1,r2,lsl#6 @ len to point at the end of inp

stmdb sp!,{r4-r12,lr}

mov r12,sp
- sub sp,sp,#16*4+16 @ alloca
- sub r14,r3,#256+32 @ K256
- bic sp,sp,#15 @ align for 128-bit stores
+ sub r14,sp,#16*4+16 @ alloca
+ bic r14,r14,#15 @ align for 128-bit stores
+ mov sp,r14
+ adr r14,K256

vld1.8 {q0},[r1]!
vld1.8 {q1},[r1]!

2015-03-17 15:09:40

by Andy Polyakov

[permalink] [raw]

Subject: Re: [PATCH] arm: crypto: Add NEON optimized SHA-256

Hi,

>>> Have you tested this code with the tcrypt.ko module?
>>
>> I have not, but I can look into it.
>>
>>> Did you talk to Andy about the license? I don't think this is
>>> permissible for the kernel as-is.
>>
>> Unless I have misunderstood something, the license at the Cryptogams
>> website includes an option to license the code under the GNU GPL.
>>
>> However, I can certainly contact Andy to clarify his intentions.

I have no problems with reusing assembly modules in kernel context. The
whole idea behind cryptogams initiative was exactly to reuse code in
different contexts. But I'd prefer if it's more of cooperative effort,
when we help each other to improve code, test on and tune for more
platforms single developer might have access to, cross-report problems,
etc. For this reason I'd prefer if it can be arranged in way similar to
bsaes-armv7 module, i.e. we work together on shared copy of module that
generates assembly that can be then compiled for OpenSSL or kernel. Is
it sensible? BTW, why stop at SHA256? There is SHA512 and NEON SHA1...

As for practicalities. In order to spare brain capacity for list
subscribers, it would probably be appropriate to work out details such
as naming of entry points in smaller group and present result when it's
ready to go and tests. I'll start by looking at Thumb-ification...

Cheers.

2015-03-17 15:21:25

by Sami Tolvanen

[permalink] [raw]

Subject: Re: [PATCH] arm: crypto: Add NEON optimized SHA-256

On Tue, Mar 17, 2015 at 04:09:40PM +0100, Andy Polyakov wrote:
> I have no problems with reusing assembly modules in kernel context.

Awesome, thank you for clarifying this.

> I'd prefer if it can be arranged in way similar to bsaes-armv7 module,
> i.e. we work together on shared copy of module that generates assembly
> that can be then compiled for OpenSSL or kernel. Is it sensible?

Sure, that sounds good to me.

> BTW, why stop at SHA256? There is SHA512 and NEON SHA1...

The kernel already has NEON SHA-1 and SHA-512, but for some reason is
lacking SHA-256. I have not tested how they compare to yours though.

Sami

2015-03-17 15:51:41

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCH] arm: crypto: Add NEON optimized SHA-256

On 17 March 2015 at 16:09, Andy Polyakov <[email protected]> wrote:
> Hi,
>
>>>> Have you tested this code with the tcrypt.ko module?
>>>
>>> I have not, but I can look into it.
>>>
>>>> Did you talk to Andy about the license? I don't think this is
>>>> permissible for the kernel as-is.
>>>
>>> Unless I have misunderstood something, the license at the Cryptogams
>>> website includes an option to license the code under the GNU GPL.
>>>
>>> However, I can certainly contact Andy to clarify his intentions.
>
> I have no problems with reusing assembly modules in kernel context. The
> whole idea behind cryptogams initiative was exactly to reuse code in
> different contexts. But I'd prefer if it's more of cooperative effort,
> when we help each other to improve code, test on and tune for more
> platforms single developer might have access to, cross-report problems,
> etc. For this reason I'd prefer if it can be arranged in way similar to
> bsaes-armv7 module, i.e. we work together on shared copy of module that
> generates assembly that can be then compiled for OpenSSL or kernel. Is

Yes, that is what I thought.

@Sami, this also means we should integrate the .pl file and the
generated .S. Please have a look at how I integrated Andy's
bsaes-armv7.pl module.

> it sensible? BTW, why stop at SHA256? There is SHA512 and NEON SHA1...
>
> As for practicalities. In order to spare brain capacity for list
> subscribers, it would probably be appropriate to work out details such
> as naming of entry points in smaller group and present result when it's
> ready to go and tests. I'll start by looking at Thumb-ification...
>

@Andy: the core code builds without problems, but for Thumb2 there are
some trivial changes to apply:
- '.code 32' needs to be made conditional
- use adr instead of pc arithmetic for the address of K256
- bic cannot use r13 as input/output, so it needs to go via another register
- some conditional instructions need 'it' instructions added (not
strictly necessary for the kernel, because it passes
-mimplicit-it-thumb on the assembler command line, but nice for
completeness)
(see patch in other reply)

I am happy to test and/or review stuff as well.

Thanks,
Ard.

2015-03-23 13:50:09

by Sami Tolvanen

[permalink] [raw]

Subject: [PATCHv2] arm: crypto: Add optimized SHA-256/224

Add Andy Polyakov's optimized assembly and NEON implementations for
SHA-256/224.

The sha256-armv4.pl script for generating the assembly code is from
OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.

Compared to sha256-generic these implementations have the following
tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):

bs b/u sha256-neon sha256-asm
16 16 x1.32 x1.19
64 16 x1.27 x1.15
64 64 x1.36 x1.20
256 16 x1.22 x1.11
256 64 x1.36 x1.19
256 256 x1.59 x1.23
1024 16 x1.21 x1.10
1024 256 x1.65 x1.23
1024 1024 x1.76 x1.25
2048 16 x1.21 x1.10
2048 256 x1.66 x1.23
2048 1024 x1.78 x1.25
2048 2048 x1.79 x1.25
4096 16 x1.20 x1.09
4096 256 x1.66 x1.23
4096 1024 x1.79 x1.26
4096 4096 x1.82 x1.26
8192 16 x1.20 x1.09
8192 256 x1.67 x1.23
8192 1024 x1.80 x1.26
8192 4096 x1.85 x1.28
8192 8192 x1.85 x1.27

Where bs refers to block size and b/u to bytes per update.

Signed-off-by: Sami Tolvanen <[email protected]>
Cc: Andy Polyakov <[email protected]>

---
Changes since v1:
Rebased to Herbert's cryptodev tree
Include sha256-armv4.pl and use it to generate sha256-core.S
Add integer-only assembly version as sha256-asm
Add support for SHA-224 to the glue code
Change priority for sha256/224-ce to 300

---
arch/arm/crypto/Kconfig | 7
arch/arm/crypto/Makefile | 8
arch/arm/crypto/sha2-ce-glue.c | 4
arch/arm/crypto/sha256-armv4.pl | 713 ++++++
arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
arch/arm/crypto/sha256_glue.c | 246 ++
arch/arm/crypto/sha256_glue.h | 23
arch/arm/crypto/sha256_neon_glue.c | 172 +
8 files changed, 3945 insertions(+), 3 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index d63f319..458729d 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
SHA-256 secure hash standard (DFIPS 180-2) implemented
using special ARMv8 Crypto Extensions.

+config CRYPTO_SHA256_ARM
+ tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
+ select CRYPTO_HASH
+ help
+ SHA-256 secure hash standard (DFIPS 180-2) implemented
+ using optimized ARM assembler and NEON, when available.
+
config CRYPTO_SHA512_ARM_NEON
tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
depends on KERNEL_MODE_NEON
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 9a273bd..ef46e89 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
@@ -16,6 +17,8 @@ aes-arm-y := aes-armv4.o aes_glue.o
aes-arm-bs-y := aesbs-core.o aesbs-glue.o
sha1-arm-y := sha1-armv4-large.o sha1_glue.o
sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
@@ -28,4 +31,7 @@ quiet_cmd_perl = PERL $@
$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
$(call cmd,perl)

-.PRECIOUS: $(obj)/aesbs-core.S
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+ $(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
index 9ffe8ad..0449eca 100644
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ b/arch/arm/crypto/sha2-ce-glue.c
@@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-ce",
- .cra_priority = 200,
+ .cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
@@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-ce",
- .cra_priority = 200,
+ .cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 0000000..4fee74d
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,713 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; $t0="r0";
+$inp="r1"; $t4="r1";
+$len="r2"; $t1="r2";
+$T1="r3"; $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ rev $t1,$t1
+#else
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ ldrb $t2,[$inp,#2]
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+ ldr $t2,[$Ktbl],#4 @ *K256++
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
+ eor $t1,$f,$g
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
+ and $t1,$t1,$e
+ add $h,$h,$t2 @ h+=K256[i]
+ eor $t1,$t1,$g @ Ch(e,f,g)
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
+#endif
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
+___
+ &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
+ ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+ sub $Ktbl,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
+ add $A,$A,$t0
+ ldr $t0,[$t3,#12]
+ add $B,$B,$t1
+ ldr $t1,[$t3,#16]
+ add $C,$C,$t2
+ ldr $t2,[$t3,#20]
+ add $D,$D,$t0
+ ldr $t0,[$t3,#24]
+ add $E,$E,$t1
+ ldr $t1,[$t3,#28]
+ add $F,$F,$t2
+ ldr $inp,[sp,#17*4] @ pull inp
+ ldr $t2,[sp,#18*4] @ pull inp+len
+ add $G,$G,$t0
+ add $H,$H,$t1
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+ cmp $inp,$t2
+ sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub $H,sp,#16*4+16
+ adr $Ktbl,K256
+ bic $H,$H,#15 @ align for 128-bit stores
+ mov $t2,sp
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ it eq
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ it ne
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ ittte ne
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 0000000..683f1cc
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2775 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <[email protected]> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r14,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ magic
+ eor r12,r12,r12
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 0
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 0
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 0==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 1
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 1
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 1==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 2
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 2
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 2==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 3
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 3
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 3==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 4
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 4
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 4==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 5
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 5==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 6
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 6
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 6==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 7
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 7==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 8
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 8
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 8==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 9
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 9
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 9==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 10
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 10
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 10==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 11
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 11
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 11==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 12
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 12
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 12==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 13
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 13
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 13==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 14
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 14
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 14==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 15
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ rev r2,r2
+#else
+ @ ldrb r2,[r1,#3] @ 15
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 15==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+ @ ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#14*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#0*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#9*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 16==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#15*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#1*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#10*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 17==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#0*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#2*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#11*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 18==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#1*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#3*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#12*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 19==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#2*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#4*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#13*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 20==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#3*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#5*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#14*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 21==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#4*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#6*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#15*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 22==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#5*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#7*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#0*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 23==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#6*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#8*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#1*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 24==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#7*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#9*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#2*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 25==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#8*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#10*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#3*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 26==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#9*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#11*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#4*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 27==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#10*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#12*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#5*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 28==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#11*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#13*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#6*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 29==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#12*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#14*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#7*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 30==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#13*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#15*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#8*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 31==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r0,[r3,#0]
+ ldr r2,[r3,#4]
+ ldr r12,[r3,#8]
+ add r4,r4,r0
+ ldr r0,[r3,#12]
+ add r5,r5,r2
+ ldr r2,[r3,#16]
+ add r6,r6,r12
+ ldr r12,[r3,#20]
+ add r7,r7,r0
+ ldr r0,[r3,#24]
+ add r8,r8,r2
+ ldr r2,[r3,#28]
+ add r9,r9,r12
+ ldr r1,[sp,#17*4] @ pull inp
+ ldr r12,[sp,#18*4] @ pull inp+len
+ add r10,r10,r0
+ add r11,r11,r2
+ stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+ cmp r1,r12
+ sub r14,r14,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#19*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub r11,sp,#16*4+16
+ adr r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
+ mov r12,sp
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+ vrev32.8 q0,q0 @ yes, even on
+ str r0,[sp,#64]
+ vrev32.8 q1,q1 @ big-endian
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4-r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b .L_00_48
+
+.align 4
+.L_00_48:
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne .L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ it eq
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ it ne
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8-r11}
+
+ ittte ne
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne r3,r5,r6
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {q0,q1},[r0]
+# ifdef __thumb2__
+ adr r3,.LARMv8
+ sub r3,r3,#.LARMv8-K256
+# else
+ adrl r3,K256
+# endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {q8-q9},[r1]!
+ vld1.8 {q10-q11},[r1]!
+ vld1.32 {q12},[r3]!
+ vrev32.8 q8,q8
+ vrev32.8 q9,q9
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vmov q14,q0 @ offload
+ vmov q15,q1
+ teq r1,r2
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vld1.32 {q13},[r3]
+ vadd.i32 q12,q12,q10
+ sub r3,r3,#256-16 @ rewind
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vadd.i32 q13,q13,q11
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vadd.i32 q0,q0,q14
+ vadd.i32 q1,q1,q15
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {q0,q1},[r0]
+
+ bx lr @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <[email protected]>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644
index 0000000..25ceba6
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.c
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA224_H0;
+ sctx->state[1] = SHA224_H1;
+ sctx->state[2] = SHA224_H2;
+ sctx->state[3] = SHA224_H3;
+ sctx->state[4] = SHA224_H4;
+ sctx->state[5] = SHA224_H5;
+ sctx->state[6] = SHA224_H6;
+ sctx->state[7] = SHA224_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+ unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ /* We need to fill a whole block for __sha256_update */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_update(desc, padding, padlen, index);
+ }
+ __sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_update,
+ .final = sha256_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_update,
+ .final = sha224_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int __init sha256_mod_init(void)
+{
+ int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+ if (res < 0)
+ return res;
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+ res = crypto_register_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+
+ if (res < 0)
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+ }
+
+ return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+ crypto_unregister_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644
index 0000000..0312f4f
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.h
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 0000000..d0f168d
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ * Copyright ? 2014 Jussi Kivilinna <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order_neon(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = __sha256_update(desc, data, len, partial);
+ } else {
+ kernel_neon_begin();
+ res = __sha256_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ if (!may_use_simd()) {
+ sha256_update(desc, padding, padlen);
+ sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha256_neon_update() */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_neon_update(desc, padding, padlen, index);
+ }
+ __sha256_neon_update(desc, (const u8 *)&bits,
+ sizeof(bits), 56);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memzero_explicit(sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_neon_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_neon_update,
+ .final = sha256_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_neon_update,
+ .final = sha224_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };

2015-03-23 18:26:04

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

(resending due to size bounce)

On 23 March 2015 at 14:50, Sami Tolvanen <[email protected]> wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
>
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>
> bs b/u sha256-neon sha256-asm
> 16 16 x1.32 x1.19
> 64 16 x1.27 x1.15
> 64 64 x1.36 x1.20
> 256 16 x1.22 x1.11
> 256 64 x1.36 x1.19
> 256 256 x1.59 x1.23
> 1024 16 x1.21 x1.10
> 1024 256 x1.65 x1.23
> 1024 1024 x1.76 x1.25
> 2048 16 x1.21 x1.10
> 2048 256 x1.66 x1.23
> 2048 1024 x1.78 x1.25
> 2048 2048 x1.79 x1.25
> 4096 16 x1.20 x1.09
> 4096 256 x1.66 x1.23
> 4096 1024 x1.79 x1.26
> 4096 4096 x1.82 x1.26
> 8192 16 x1.20 x1.09
> 8192 256 x1.67 x1.23
> 8192 1024 x1.80 x1.26
> 8192 4096 x1.85 x1.28
> 8192 8192 x1.85 x1.27
>
> Where bs refers to block size and b/u to bytes per update.
>
> Signed-off-by: Sami Tolvanen <[email protected]>
> Cc: Andy Polyakov <[email protected]>
>

This builds fine and passes the tcrypt.ko tests in ARM and Thumb2 and
even in big-endian (ARM) mode, so

Tested-by: Ard Biesheuvel <[email protected]>
Reviewed-by: Ard Biesheuvel <[email protected]>

Nice work!

Ard.

> ---
> Changes since v1:
> Rebased to Herbert's cryptodev tree
> Include sha256-armv4.pl and use it to generate sha256-core.S
> Add integer-only assembly version as sha256-asm
> Add support for SHA-224 to the glue code
> Change priority for sha256/224-ce to 300
>
> ---
> arch/arm/crypto/Kconfig | 7
> arch/arm/crypto/Makefile | 8
> arch/arm/crypto/sha2-ce-glue.c | 4
> arch/arm/crypto/sha256-armv4.pl | 713 ++++++
> arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
> arch/arm/crypto/sha256_glue.c | 246 ++
> arch/arm/crypto/sha256_glue.h | 23
> arch/arm/crypto/sha256_neon_glue.c | 172 +
> 8 files changed, 3945 insertions(+), 3 deletions(-)
>

2015-03-24 11:33:09

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On Mon, Mar 23, 2015 at 01:50:09PM +0000, Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
>
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>
> bs b/u sha256-neon sha256-asm
> 16 16 x1.32 x1.19
> 64 16 x1.27 x1.15
> 64 64 x1.36 x1.20
> 256 16 x1.22 x1.11
> 256 64 x1.36 x1.19
> 256 256 x1.59 x1.23
> 1024 16 x1.21 x1.10
> 1024 256 x1.65 x1.23
> 1024 1024 x1.76 x1.25
> 2048 16 x1.21 x1.10
> 2048 256 x1.66 x1.23
> 2048 1024 x1.78 x1.25
> 2048 2048 x1.79 x1.25
> 4096 16 x1.20 x1.09
> 4096 256 x1.66 x1.23
> 4096 1024 x1.79 x1.26
> 4096 4096 x1.82 x1.26
> 8192 16 x1.20 x1.09
> 8192 256 x1.67 x1.23
> 8192 1024 x1.80 x1.26
> 8192 4096 x1.85 x1.28
> 8192 8192 x1.85 x1.27
>
> Where bs refers to block size and b/u to bytes per update.
>
> Signed-off-by: Sami Tolvanen <[email protected]>
> Cc: Andy Polyakov <[email protected]>

Your patch didn't make it to the linux-crypto list and therefore
it never got into patchwork. Can you please find out why and
resend it?

Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2015-03-24 11:33:53

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 24 March 2015 at 12:32, Herbert Xu <[email protected]> wrote:
> On Mon, Mar 23, 2015 at 01:50:09PM +0000, Sami Tolvanen wrote:
>> Add Andy Polyakov's optimized assembly and NEON implementations for
>> SHA-256/224.
>>
>> The sha256-armv4.pl script for generating the assembly code is from
>> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>>
>> Compared to sha256-generic these implementations have the following
>> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>>
>> bs b/u sha256-neon sha256-asm
>> 16 16 x1.32 x1.19
>> 64 16 x1.27 x1.15
>> 64 64 x1.36 x1.20
>> 256 16 x1.22 x1.11
>> 256 64 x1.36 x1.19
>> 256 256 x1.59 x1.23
>> 1024 16 x1.21 x1.10
>> 1024 256 x1.65 x1.23
>> 1024 1024 x1.76 x1.25
>> 2048 16 x1.21 x1.10
>> 2048 256 x1.66 x1.23
>> 2048 1024 x1.78 x1.25
>> 2048 2048 x1.79 x1.25
>> 4096 16 x1.20 x1.09
>> 4096 256 x1.66 x1.23
>> 4096 1024 x1.79 x1.26
>> 4096 4096 x1.82 x1.26
>> 8192 16 x1.20 x1.09
>> 8192 256 x1.67 x1.23
>> 8192 1024 x1.80 x1.26
>> 8192 4096 x1.85 x1.28
>> 8192 8192 x1.85 x1.27
>>
>> Where bs refers to block size and b/u to bytes per update.
>>
>> Signed-off-by: Sami Tolvanen <[email protected]>
>> Cc: Andy Polyakov <[email protected]>
>
> Your patch didn't make it to the linux-crypto list and therefore
> it never got into patchwork. Can you please find out why and
> resend it?
>

Most likely because it is so big ...

2015-03-24 11:35:11

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On Mon, Mar 23, 2015 at 07:26:03PM +0100, Ard Biesheuvel wrote:
> (resending due to size bounce)

Aha that's why the patch didn't make it through. Can it be split
up?

Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2015-03-24 11:40:50

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 24 March 2015 at 12:35, Herbert Xu <[email protected]> wrote:
> On Mon, Mar 23, 2015 at 07:26:03PM +0100, Ard Biesheuvel wrote:
>> (resending due to size bounce)
>
> Aha that's why the patch didn't make it through. Can it be split
> up?

Not so easily. It consists (among other things) of a .pl file that
generates a .S file, but to prevent introducing a build time
dependency on perl, the .S file is included as a .S_shipped file. That
is the big one.

i suppose we could add the .S_shipped file in a separate patch, but
I'd prefer to keep it as is

2015-03-24 11:46:18

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On Tue, Mar 24, 2015 at 12:40:50PM +0100, Ard Biesheuvel wrote:
>
> Not so easily. It consists (among other things) of a .pl file that
> generates a .S file, but to prevent introducing a build time
> dependency on perl, the .S file is included as a .S_shipped file. That
> is the big one.
>
> i suppose we could add the .S_shipped file in a separate patch, but
> I'd prefer to keep it as is

OK then this will have to go up on a website somewhere and then
the URL can be posted to the list for review.

Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2015-03-24 11:57:34

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 24 March 2015 at 12:46, Herbert Xu <[email protected]> wrote:
> On Tue, Mar 24, 2015 at 12:40:50PM +0100, Ard Biesheuvel wrote:
>>
>> Not so easily. It consists (among other things) of a .pl file that
>> generates a .S file, but to prevent introducing a build time
>> dependency on perl, the .S file is included as a .S_shipped file. That
>> is the big one.
>>
>> i suppose we could add the .S_shipped file in a separate patch, but
>> I'd prefer to keep it as is
>
> OK then this will have to go up on a website somewhere and then
> the URL can be posted to the list for review.
>

You can pull it from here if you like
https://git.linaro.org/people/ard.biesheuvel/linux-arm.git/shortlog/refs/tags/arm-sha256-neon
(rebased onto your cryptodev-2.6/master branch as of 30 mins ago)

Does that work or you?

Thanks,
Ard.

2015-03-24 12:27:02

by Jean-Christophe PLAGNIOL-VILLARD

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 13:50 Mon 23 Mar , Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
>
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>
> bs b/u sha256-neon sha256-asm
> 16 16 x1.32 x1.19
> 64 16 x1.27 x1.15
> 64 64 x1.36 x1.20
> 256 16 x1.22 x1.11
> 256 64 x1.36 x1.19
> 256 256 x1.59 x1.23
> 1024 16 x1.21 x1.10
> 1024 256 x1.65 x1.23
> 1024 1024 x1.76 x1.25
> 2048 16 x1.21 x1.10
> 2048 256 x1.66 x1.23
> 2048 1024 x1.78 x1.25
> 2048 2048 x1.79 x1.25
> 4096 16 x1.20 x1.09
> 4096 256 x1.66 x1.23
> 4096 1024 x1.79 x1.26
> 4096 4096 x1.82 x1.26
> 8192 16 x1.20 x1.09
> 8192 256 x1.67 x1.23
> 8192 1024 x1.80 x1.26
> 8192 4096 x1.85 x1.28
> 8192 8192 x1.85 x1.27
>
> Where bs refers to block size and b/u to bytes per update.
>
> Signed-off-by: Sami Tolvanen <[email protected]>
> Cc: Andy Polyakov <[email protected]>
>
> ---
> Changes since v1:
> Rebased to Herbert's cryptodev tree
> Include sha256-armv4.pl and use it to generate sha256-core.S
> Add integer-only assembly version as sha256-asm
> Add support for SHA-224 to the glue code
> Change priority for sha256/224-ce to 300
>
> ---
> arch/arm/crypto/Kconfig | 7
> arch/arm/crypto/Makefile | 8
> arch/arm/crypto/sha2-ce-glue.c | 4
> arch/arm/crypto/sha256-armv4.pl | 713 ++++++
> arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
> arch/arm/crypto/sha256_glue.c | 246 ++
> arch/arm/crypto/sha256_glue.h | 23
> arch/arm/crypto/sha256_neon_glue.c | 172 +
> 8 files changed, 3945 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
> index d63f319..458729d 100644
> --- a/arch/arm/crypto/Kconfig
> +++ b/arch/arm/crypto/Kconfig
> @@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
> SHA-256 secure hash standard (DFIPS 180-2) implemented
> using special ARMv8 Crypto Extensions.
>
> +config CRYPTO_SHA256_ARM
> + tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
> + select CRYPTO_HASH
> + help
> + SHA-256 secure hash standard (DFIPS 180-2) implemented
> + using optimized ARM assembler and NEON, when available.
> +
> config CRYPTO_SHA512_ARM_NEON
> tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
> depends on KERNEL_MODE_NEON
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index 9a273bd..ef46e89 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
> obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
> obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
> obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
> +obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
> obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
> obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
> obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
> @@ -16,6 +17,8 @@ aes-arm-y := aes-armv4.o aes_glue.o
> aes-arm-bs-y := aesbs-core.o aesbs-glue.o
> sha1-arm-y := sha1-armv4-large.o sha1_glue.o
> sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
> +sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
> +sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
> sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
> sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
> sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
> @@ -28,4 +31,7 @@ quiet_cmd_perl = PERL $@
> $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
> $(call cmd,perl)
>
> -.PRECIOUS: $(obj)/aesbs-core.S
> +$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
> + $(call cmd,perl)
> +
> +.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
> diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
> index 9ffe8ad..0449eca 100644
> --- a/arch/arm/crypto/sha2-ce-glue.c
> +++ b/arch/arm/crypto/sha2-ce-glue.c
> @@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
> .base = {
> .cra_name = "sha224",
> .cra_driver_name = "sha224-ce",
> - .cra_priority = 200,
> + .cra_priority = 300,
> .cra_flags = CRYPTO_ALG_TYPE_SHASH,
> .cra_blocksize = SHA256_BLOCK_SIZE,
> .cra_module = THIS_MODULE,
> @@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
> .base = {
> .cra_name = "sha256",
> .cra_driver_name = "sha256-ce",
> - .cra_priority = 200,
> + .cra_priority = 300,
> .cra_flags = CRYPTO_ALG_TYPE_SHASH,
> .cra_blocksize = SHA256_BLOCK_SIZE,
> .cra_module = THIS_MODULE,
> diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
> new file mode 100644
> index 0000000..4fee74d
> --- /dev/null
> +++ b/arch/arm/crypto/sha256-armv4.pl
> @@ -0,0 +1,713 @@
> +#!/usr/bin/env perl
> +
> +# ====================================================================
> +# Written by Andy Polyakov <[email protected]> for the OpenSSL
> +# project. The module is, however, dual licensed under OpenSSL and
> +# CRYPTOGAMS licenses depending on where you obtain it. For further
> +# details see http://www.openssl.org/~appro/cryptogams/.
> +#
> +# Permission to use under GPL terms is granted.
> +# ====================================================================
> +
> +# SHA256 block procedure for ARMv4. May 2007.
> +
> +# Performance is ~2x better than gcc 3.4 generated code and in "abso-
> +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
> +# byte [on single-issue Xscale PXA250 core].
> +
> +# July 2010.
> +#
> +# Rescheduling for dual-issue pipeline resulted in 22% improvement on
> +# Cortex A8 core and ~20 cycles per processed byte.
> +
> +# February 2011.
> +#
> +# Profiler-assisted and platform-specific optimization resulted in 16%
> +# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
> +
> +# September 2013.
> +#
> +# Add NEON implementation. On Cortex A8 it was measured to process one
> +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
> +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
> +# code (meaning that latter performs sub-optimally, nothing was done
> +# about it).
> +
> +# May 2014.
> +#
> +# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
> +
> +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
> +open STDOUT,">$output";
> +
> +$ctx="r0"; $t0="r0";
> +$inp="r1"; $t4="r1";
> +$len="r2"; $t1="r2";
> +$T1="r3"; $t3="r3";
> +$A="r4";
> +$B="r5";
> +$C="r6";
> +$D="r7";
> +$E="r8";
> +$F="r9";
> +$G="r10";
> +$H="r11";
> +@V=($A,$B,$C,$D,$E,$F,$G,$H);
> +$t2="r12";
> +$Ktbl="r14";
> +
> +@Sigma0=( 2,13,22);
> +@Sigma1=( 6,11,25);
> +@sigma0=( 7,18, 3);
> +@sigma1=(17,19,10);
> +
> +sub BODY_00_15 {
> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
> +
> +$code.=<<___ if ($i<16);
> +#if __ARM_ARCH__>=7
> + @ ldr $t1,[$inp],#4 @ $i
> +# if $i==15
> + str $inp,[sp,#17*4] @ make room for $t4
> +# endif
> + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
> + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
> + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
> + rev $t1,$t1
> +#else
> + @ ldrb $t1,[$inp,#3] @ $i
> + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
> + ldrb $t2,[$inp,#2]
> + ldrb $t0,[$inp,#1]
> + orr $t1,$t1,$t2,lsl#8
> + ldrb $t2,[$inp],#4
> + orr $t1,$t1,$t0,lsl#16
> +# if $i==15
> + str $inp,[sp,#17*4] @ make room for $t4
> +# endif
> + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
> + orr $t1,$t1,$t2,lsl#24
> + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
> +#endif
> +___
> +$code.=<<___;
> + ldr $t2,[$Ktbl],#4 @ *K256++
> + add $h,$h,$t1 @ h+=X[i]
> + str $t1,[sp,#`$i%16`*4]
> + eor $t1,$f,$g
> + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
> + and $t1,$t1,$e
> + add $h,$h,$t2 @ h+=K256[i]
> + eor $t1,$t1,$g @ Ch(e,f,g)
> + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
> + add $h,$h,$t1 @ h+=Ch(e,f,g)
> +#if $i==31
> + and $t2,$t2,#0xff
> + cmp $t2,#0xf2 @ done?
> +#endif
> +#if $i<15
> +# if __ARM_ARCH__>=7
> + ldr $t1,[$inp],#4 @ prefetch
> +# else
> + ldrb $t1,[$inp,#3]
> +# endif
> + eor $t2,$a,$b @ a^b, b^c in next round
> +#else
> + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
> + eor $t2,$a,$b @ a^b, b^c in next round
> + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
> +#endif
> + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
> + and $t3,$t3,$t2 @ (b^c)&=(a^b)
> + add $d,$d,$h @ d+=h
> + eor $t3,$t3,$b @ Maj(a,b,c)
> + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
> + @ add $h,$h,$t3 @ h+=Maj(a,b,c)
> +___
> + ($t2,$t3)=($t3,$t2);
> +}
> +
> +sub BODY_16_XX {
> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
> +
> +$code.=<<___;
> + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
> + @ ldr $t4,[sp,#`($i+14)%16`*4]
> + mov $t0,$t1,ror#$sigma0[0]
> + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
> + mov $t2,$t4,ror#$sigma1[0]
> + eor $t0,$t0,$t1,ror#$sigma0[1]
> + eor $t2,$t2,$t4,ror#$sigma1[1]
> + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
> + ldr $t1,[sp,#`($i+0)%16`*4]
> + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
> + ldr $t4,[sp,#`($i+9)%16`*4]
> +
> + add $t2,$t2,$t0
> + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
> + add $t1,$t1,$t2
> + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
> + add $t1,$t1,$t4 @ X[i]
> +___
> + &BODY_00_15(@_);
> +}
> +
> +$code=<<___;
> +#ifndef __KERNEL__
> +# include "arm_arch.h"
> +#else
> +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
> +# define __ARM_MAX_ARCH__ 7
I'm not sure this will work for kernel that is for older arm
> +#endif
> +
> +.text
> +#if __ARM_ARCH__<7
> +.code 32
> +#else
> +.syntax unified
> +# ifdef __thumb2__
> +.thumb
> +# else
> +.code 32
> +# endif
> +#endif
> +
> +.type K256,%object
> +.align 5
> +K256:
> +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
> +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
> +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
> +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
> +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
> +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
> +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
> +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
> +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
> +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
> +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
> +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
> +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
> +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
> +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
> +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
> +.size K256,.-K256
> +.word 0 @ terminator
> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
> +.LOPENSSL_armcap:
> +.word OPENSSL_armcap_P-sha256_block_data_order
> +#endif
> +.align 5
> +
> +.global sha256_block_data_order
> +.type sha256_block_data_order,%function
> +sha256_block_data_order:
> +#if __ARM_ARCH__<7
> + sub r3,pc,#8 @ sha256_block_data_order
> +#else
> + adr r3,sha256_block_data_order
> +#endif
> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
> + ldr r12,.LOPENSSL_armcap
> + ldr r12,[r3,r12] @ OPENSSL_armcap_P
> + tst r12,#ARMV8_SHA256
> + bne .LARMv8
> + tst r12,#ARMV7_NEON
> + bne .LNEON
> +#endif
> + add $len,$inp,$len,lsl#6 @ len to point at the end of inp
> + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
> + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
> + sub $Ktbl,r3,#256+32 @ K256
> + sub sp,sp,#16*4 @ alloca(X[16])
> +.Loop:
> +# if __ARM_ARCH__>=7
> + ldr $t1,[$inp],#4
> +# else
> + ldrb $t1,[$inp,#3]
> +# endif
> + eor $t3,$B,$C @ magic
> + eor $t2,$t2,$t2
> +___
> +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
> +$code.=".Lrounds_16_xx:\n";
> +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
> +$code.=<<___;
> +#if __ARM_ARCH__>=7
> + ite eq @ Thumb2 thing, sanity check in ARM
> +#endif
> + ldreq $t3,[sp,#16*4] @ pull ctx
> + bne .Lrounds_16_xx
> +
> + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
> + ldr $t0,[$t3,#0]
> + ldr $t1,[$t3,#4]
> + ldr $t2,[$t3,#8]
> + add $A,$A,$t0
> + ldr $t0,[$t3,#12]
> + add $B,$B,$t1
> + ldr $t1,[$t3,#16]
> + add $C,$C,$t2
> + ldr $t2,[$t3,#20]
> + add $D,$D,$t0
> + ldr $t0,[$t3,#24]
> + add $E,$E,$t1
> + ldr $t1,[$t3,#28]
> + add $F,$F,$t2
> + ldr $inp,[sp,#17*4] @ pull inp
> + ldr $t2,[sp,#18*4] @ pull inp+len
> + add $G,$G,$t0
> + add $H,$H,$t1
> + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
> + cmp $inp,$t2
> + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
> + bne .Loop
> +
> + add sp,sp,#`16+3`*4 @ destroy frame
> +#if __ARM_ARCH__>=5
> + ldmia sp!,{r4-r11,pc}
> +#else
> + ldmia sp!,{r4-r11,lr}
> + tst lr,#1
> + moveq pc,lr @ be binary compatible with V4, yet
> + bx lr @ interoperable with Thumb ISA:-)
> +#endif
> +.size sha256_block_data_order,.-sha256_block_data_order
> +___
> +######################################################################
> +# NEON stuff
> +#
> +{{{
> +my @X=map("q$_",(0..3));
> +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
> +my $Xfer=$t4;
> +my $j=0;
> +
> +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
> +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
> +
> +sub AUTOLOAD() # thunk [simplified] x86-style perlasm
> +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
> + my $arg = pop;
> + $arg = "#$arg" if ($arg*1 eq $arg);
> + $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
> +}
> +
> +sub Xupdate()
> +{ use integer;
> + my $body = shift;
> + my @insns = (&$body,&$body,&$body,&$body);
> + my ($a,$b,$c,$d,$e,$f,$g,$h);
> +
> + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T2,$T0,$sigma0[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T1,$T0,$sigma0[2]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T2,$T0,32-$sigma0[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T3,$T0,$sigma0[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T1,$T1,$T2);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T3,$T0,32-$sigma0[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T1,$T1,$T3); # sigma0(X[1..4])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T5,$T5,$T4);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T5,$T5,$T4); # sigma1(X[14..15])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T5,$T5,$T4);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vld1_32 ("{$T0}","[$Ktbl,:128]!");
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T5,$T5,$T4); # sigma1(X[16..17])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 ($T0,$T0,@X[0]);
> + while($#insns>=2) { eval(shift(@insns)); }
> + &vst1_32 ("{$T0}","[$Xfer,:128]!");
> + eval(shift(@insns));
> + eval(shift(@insns));
> +
> + push(@X,shift(@X)); # "rotate" X[]
> +}
> +
> +sub Xpreload()
> +{ use integer;
> + my $body = shift;
> + my @insns = (&$body,&$body,&$body,&$body);
> + my ($a,$b,$c,$d,$e,$f,$g,$h);
> +
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vld1_32 ("{$T0}","[$Ktbl,:128]!");
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vrev32_8 (@X[0],@X[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 ($T0,$T0,@X[0]);
> + foreach (@insns) { eval; } # remaining instructions
> + &vst1_32 ("{$T0}","[$Xfer,:128]!");
> +
> + push(@X,shift(@X)); # "rotate" X[]
> +}
> +
> +sub body_00_15 () {
> + (
> + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
> + '&add ($h,$h,$t1)', # h+=X[i]+K[i]
> + '&eor ($t1,$f,$g)',
> + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
> + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
> + '&and ($t1,$t1,$e)',
> + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
> + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
> + '&eor ($t1,$t1,$g)', # Ch(e,f,g)
> + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
> + '&eor ($t2,$a,$b)', # a^b, b^c in next round
> + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
> + '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
> + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
> + '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
> + '&ldr ($t1,"[sp,#64]") if ($j==31)',
> + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
> + '&add ($d,$d,$h)', # d+=h
> + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
> + '&eor ($t3,$t3,$b)', # Maj(a,b,c)
> + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> + )
> +}
> +
> +$code.=<<___;
> +#if __ARM_MAX_ARCH__>=7
this will be compile on armv4 but gcc will not allow it

we need to drop the neon code for older non v7 build

Best Regards,
J.

2015-03-24 12:42:38

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 24 March 2015 at 13:27, Jean-Christophe PLAGNIOL-VILLARD
<[email protected]> wrote:
> On 13:50 Mon 23 Mar , Sami Tolvanen wrote:
>> Add Andy Polyakov's optimized assembly and NEON implementations for
>> SHA-256/224.
>>
>> The sha256-armv4.pl script for generating the assembly code is from
>> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>>
>> Compared to sha256-generic these implementations have the following
>> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>>
>> bs b/u sha256-neon sha256-asm
>> 16 16 x1.32 x1.19
>> 64 16 x1.27 x1.15
>> 64 64 x1.36 x1.20
>> 256 16 x1.22 x1.11
>> 256 64 x1.36 x1.19
>> 256 256 x1.59 x1.23
>> 1024 16 x1.21 x1.10
>> 1024 256 x1.65 x1.23
>> 1024 1024 x1.76 x1.25
>> 2048 16 x1.21 x1.10
>> 2048 256 x1.66 x1.23
>> 2048 1024 x1.78 x1.25
>> 2048 2048 x1.79 x1.25
>> 4096 16 x1.20 x1.09
>> 4096 256 x1.66 x1.23
>> 4096 1024 x1.79 x1.26
>> 4096 4096 x1.82 x1.26
>> 8192 16 x1.20 x1.09
>> 8192 256 x1.67 x1.23
>> 8192 1024 x1.80 x1.26
>> 8192 4096 x1.85 x1.28
>> 8192 8192 x1.85 x1.27
>>
>> Where bs refers to block size and b/u to bytes per update.
>>
>> Signed-off-by: Sami Tolvanen <[email protected]>
>> Cc: Andy Polyakov <[email protected]>
>>
>> ---
>> Changes since v1:
>> Rebased to Herbert's cryptodev tree
>> Include sha256-armv4.pl and use it to generate sha256-core.S
>> Add integer-only assembly version as sha256-asm
>> Add support for SHA-224 to the glue code
>> Change priority for sha256/224-ce to 300
>>
>> ---
>> arch/arm/crypto/Kconfig | 7
>> arch/arm/crypto/Makefile | 8
>> arch/arm/crypto/sha2-ce-glue.c | 4
>> arch/arm/crypto/sha256-armv4.pl | 713 ++++++
>> arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
>> arch/arm/crypto/sha256_glue.c | 246 ++
>> arch/arm/crypto/sha256_glue.h | 23
>> arch/arm/crypto/sha256_neon_glue.c | 172 +
>> 8 files changed, 3945 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
>> index d63f319..458729d 100644
>> --- a/arch/arm/crypto/Kconfig
>> +++ b/arch/arm/crypto/Kconfig
>> @@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
>> SHA-256 secure hash standard (DFIPS 180-2) implemented
>> using special ARMv8 Crypto Extensions.
>>
>> +config CRYPTO_SHA256_ARM
>> + tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
>> + select CRYPTO_HASH
>> + help
>> + SHA-256 secure hash standard (DFIPS 180-2) implemented
>> + using optimized ARM assembler and NEON, when available.
>> +
>> config CRYPTO_SHA512_ARM_NEON
>> tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
>> depends on KERNEL_MODE_NEON
>> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
>> index 9a273bd..ef46e89 100644
>> --- a/arch/arm/crypto/Makefile
>> +++ b/arch/arm/crypto/Makefile
>> @@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
>> obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
>> obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
>> obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
>> +obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
>> obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
>> obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
>> obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
>> @@ -16,6 +17,8 @@ aes-arm-y := aes-armv4.o aes_glue.o
>> aes-arm-bs-y := aesbs-core.o aesbs-glue.o
>> sha1-arm-y := sha1-armv4-large.o sha1_glue.o
>> sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
>> +sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
>> +sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
>> sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
>> sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
>> sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
>> @@ -28,4 +31,7 @@ quiet_cmd_perl = PERL $@
>> $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
>> $(call cmd,perl)
>>
>> -.PRECIOUS: $(obj)/aesbs-core.S
>> +$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
>> + $(call cmd,perl)
>> +
>> +.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
>> diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
>> index 9ffe8ad..0449eca 100644
>> --- a/arch/arm/crypto/sha2-ce-glue.c
>> +++ b/arch/arm/crypto/sha2-ce-glue.c
>> @@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
>> .base = {
>> .cra_name = "sha224",
>> .cra_driver_name = "sha224-ce",
>> - .cra_priority = 200,
>> + .cra_priority = 300,
>> .cra_flags = CRYPTO_ALG_TYPE_SHASH,
>> .cra_blocksize = SHA256_BLOCK_SIZE,
>> .cra_module = THIS_MODULE,
>> @@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
>> .base = {
>> .cra_name = "sha256",
>> .cra_driver_name = "sha256-ce",
>> - .cra_priority = 200,
>> + .cra_priority = 300,
>> .cra_flags = CRYPTO_ALG_TYPE_SHASH,
>> .cra_blocksize = SHA256_BLOCK_SIZE,
>> .cra_module = THIS_MODULE,
>> diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
>> new file mode 100644
>> index 0000000..4fee74d
>> --- /dev/null
>> +++ b/arch/arm/crypto/sha256-armv4.pl
>> @@ -0,0 +1,713 @@
>> +#!/usr/bin/env perl
>> +
>> +# ====================================================================
>> +# Written by Andy Polyakov <[email protected]> for the OpenSSL
>> +# project. The module is, however, dual licensed under OpenSSL and
>> +# CRYPTOGAMS licenses depending on where you obtain it. For further
>> +# details see http://www.openssl.org/~appro/cryptogams/.
>> +#
>> +# Permission to use under GPL terms is granted.
>> +# ====================================================================
>> +
>> +# SHA256 block procedure for ARMv4. May 2007.
>> +
>> +# Performance is ~2x better than gcc 3.4 generated code and in "abso-
>> +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
>> +# byte [on single-issue Xscale PXA250 core].
>> +
>> +# July 2010.
>> +#
>> +# Rescheduling for dual-issue pipeline resulted in 22% improvement on
>> +# Cortex A8 core and ~20 cycles per processed byte.
>> +
>> +# February 2011.
>> +#
>> +# Profiler-assisted and platform-specific optimization resulted in 16%
>> +# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
>> +
>> +# September 2013.
>> +#
>> +# Add NEON implementation. On Cortex A8 it was measured to process one
>> +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
>> +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
>> +# code (meaning that latter performs sub-optimally, nothing was done
>> +# about it).
>> +
>> +# May 2014.
>> +#
>> +# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
>> +
>> +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
>> +open STDOUT,">$output";
>> +
>> +$ctx="r0"; $t0="r0";
>> +$inp="r1"; $t4="r1";
>> +$len="r2"; $t1="r2";
>> +$T1="r3"; $t3="r3";
>> +$A="r4";
>> +$B="r5";
>> +$C="r6";
>> +$D="r7";
>> +$E="r8";
>> +$F="r9";
>> +$G="r10";
>> +$H="r11";
>> +@V=($A,$B,$C,$D,$E,$F,$G,$H);
>> +$t2="r12";
>> +$Ktbl="r14";
>> +
>> +@Sigma0=( 2,13,22);
>> +@Sigma1=( 6,11,25);
>> +@sigma0=( 7,18, 3);
>> +@sigma1=(17,19,10);
>> +
>> +sub BODY_00_15 {
>> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
>> +
>> +$code.=<<___ if ($i<16);
>> +#if __ARM_ARCH__>=7
>> + @ ldr $t1,[$inp],#4 @ $i
>> +# if $i==15
>> + str $inp,[sp,#17*4] @ make room for $t4
>> +# endif
>> + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
>> + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
>> + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
>> + rev $t1,$t1
>> +#else
>> + @ ldrb $t1,[$inp,#3] @ $i
>> + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
>> + ldrb $t2,[$inp,#2]
>> + ldrb $t0,[$inp,#1]
>> + orr $t1,$t1,$t2,lsl#8
>> + ldrb $t2,[$inp],#4
>> + orr $t1,$t1,$t0,lsl#16
>> +# if $i==15
>> + str $inp,[sp,#17*4] @ make room for $t4
>> +# endif
>> + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
>> + orr $t1,$t1,$t2,lsl#24
>> + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
>> +#endif
>> +___
>> +$code.=<<___;
>> + ldr $t2,[$Ktbl],#4 @ *K256++
>> + add $h,$h,$t1 @ h+=X[i]
>> + str $t1,[sp,#`$i%16`*4]
>> + eor $t1,$f,$g
>> + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
>> + and $t1,$t1,$e
>> + add $h,$h,$t2 @ h+=K256[i]
>> + eor $t1,$t1,$g @ Ch(e,f,g)
>> + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
>> + add $h,$h,$t1 @ h+=Ch(e,f,g)
>> +#if $i==31
>> + and $t2,$t2,#0xff
>> + cmp $t2,#0xf2 @ done?
>> +#endif
>> +#if $i<15
>> +# if __ARM_ARCH__>=7
>> + ldr $t1,[$inp],#4 @ prefetch
>> +# else
>> + ldrb $t1,[$inp,#3]
>> +# endif
>> + eor $t2,$a,$b @ a^b, b^c in next round
>> +#else
>> + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
>> + eor $t2,$a,$b @ a^b, b^c in next round
>> + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
>> +#endif
>> + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
>> + and $t3,$t3,$t2 @ (b^c)&=(a^b)
>> + add $d,$d,$h @ d+=h
>> + eor $t3,$t3,$b @ Maj(a,b,c)
>> + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
>> + @ add $h,$h,$t3 @ h+=Maj(a,b,c)
>> +___
>> + ($t2,$t3)=($t3,$t2);
>> +}
>> +
>> +sub BODY_16_XX {
>> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
>> +
>> +$code.=<<___;
>> + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
>> + @ ldr $t4,[sp,#`($i+14)%16`*4]
>> + mov $t0,$t1,ror#$sigma0[0]
>> + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
>> + mov $t2,$t4,ror#$sigma1[0]
>> + eor $t0,$t0,$t1,ror#$sigma0[1]
>> + eor $t2,$t2,$t4,ror#$sigma1[1]
>> + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
>> + ldr $t1,[sp,#`($i+0)%16`*4]
>> + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
>> + ldr $t4,[sp,#`($i+9)%16`*4]
>> +
>> + add $t2,$t2,$t0
>> + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
>> + add $t1,$t1,$t2
>> + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
>> + add $t1,$t1,$t4 @ X[i]
>> +___
>> + &BODY_00_15(@_);
>> +}
>> +
>> +$code=<<___;
>> +#ifndef __KERNEL__
>> +# include "arm_arch.h"
>> +#else
>> +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
>> +# define __ARM_MAX_ARCH__ 7
> I'm not sure this will work for kernel that is for older arm

This code is only used at runtime if a NEON is detected on the current CPU

>> +#endif
>> +
>> +.text
>> +#if __ARM_ARCH__<7
>> +.code 32
>> +#else
>> +.syntax unified
>> +# ifdef __thumb2__
>> +.thumb
>> +# else
>> +.code 32
>> +# endif
>> +#endif
>> +
>> +.type K256,%object
>> +.align 5
>> +K256:
>> +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
>> +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
>> +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
>> +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
>> +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
>> +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
>> +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
>> +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
>> +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
>> +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
>> +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
>> +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
>> +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
>> +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
>> +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
>> +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
>> +.size K256,.-K256
>> +.word 0 @ terminator
>> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
>> +.LOPENSSL_armcap:
>> +.word OPENSSL_armcap_P-sha256_block_data_order
>> +#endif
>> +.align 5
>> +
>> +.global sha256_block_data_order
>> +.type sha256_block_data_order,%function
>> +sha256_block_data_order:
>> +#if __ARM_ARCH__<7
>> + sub r3,pc,#8 @ sha256_block_data_order
>> +#else
>> + adr r3,sha256_block_data_order
>> +#endif
>> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
>> + ldr r12,.LOPENSSL_armcap
>> + ldr r12,[r3,r12] @ OPENSSL_armcap_P
>> + tst r12,#ARMV8_SHA256
>> + bne .LARMv8
>> + tst r12,#ARMV7_NEON
>> + bne .LNEON
>> +#endif
>> + add $len,$inp,$len,lsl#6 @ len to point at the end of inp
>> + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
>> + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
>> + sub $Ktbl,r3,#256+32 @ K256
>> + sub sp,sp,#16*4 @ alloca(X[16])
>> +.Loop:
>> +# if __ARM_ARCH__>=7
>> + ldr $t1,[$inp],#4
>> +# else
>> + ldrb $t1,[$inp,#3]
>> +# endif
>> + eor $t3,$B,$C @ magic
>> + eor $t2,$t2,$t2
>> +___
>> +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
>> +$code.=".Lrounds_16_xx:\n";
>> +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
>> +$code.=<<___;
>> +#if __ARM_ARCH__>=7
>> + ite eq @ Thumb2 thing, sanity check in ARM
>> +#endif
>> + ldreq $t3,[sp,#16*4] @ pull ctx
>> + bne .Lrounds_16_xx
>> +
>> + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
>> + ldr $t0,[$t3,#0]
>> + ldr $t1,[$t3,#4]
>> + ldr $t2,[$t3,#8]
>> + add $A,$A,$t0
>> + ldr $t0,[$t3,#12]
>> + add $B,$B,$t1
>> + ldr $t1,[$t3,#16]
>> + add $C,$C,$t2
>> + ldr $t2,[$t3,#20]
>> + add $D,$D,$t0
>> + ldr $t0,[$t3,#24]
>> + add $E,$E,$t1
>> + ldr $t1,[$t3,#28]
>> + add $F,$F,$t2
>> + ldr $inp,[sp,#17*4] @ pull inp
>> + ldr $t2,[sp,#18*4] @ pull inp+len
>> + add $G,$G,$t0
>> + add $H,$H,$t1
>> + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
>> + cmp $inp,$t2
>> + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
>> + bne .Loop
>> +
>> + add sp,sp,#`16+3`*4 @ destroy frame
>> +#if __ARM_ARCH__>=5
>> + ldmia sp!,{r4-r11,pc}
>> +#else
>> + ldmia sp!,{r4-r11,lr}
>> + tst lr,#1
>> + moveq pc,lr @ be binary compatible with V4, yet
>> + bx lr @ interoperable with Thumb ISA:-)
>> +#endif
>> +.size sha256_block_data_order,.-sha256_block_data_order
>> +___
>> +######################################################################
>> +# NEON stuff
>> +#
>> +{{{
>> +my @X=map("q$_",(0..3));
>> +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
>> +my $Xfer=$t4;
>> +my $j=0;
>> +
>> +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
>> +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
>> +
>> +sub AUTOLOAD() # thunk [simplified] x86-style perlasm
>> +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
>> + my $arg = pop;
>> + $arg = "#$arg" if ($arg*1 eq $arg);
>> + $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
>> +}
>> +
>> +sub Xupdate()
>> +{ use integer;
>> + my $body = shift;
>> + my @insns = (&$body,&$body,&$body,&$body);
>> + my ($a,$b,$c,$d,$e,$f,$g,$h);
>> +
>> + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vshr_u32 ($T2,$T0,$sigma0[0]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vshr_u32 ($T1,$T0,$sigma0[2]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vsli_32 ($T2,$T0,32-$sigma0[0]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vshr_u32 ($T3,$T0,$sigma0[1]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &veor ($T1,$T1,$T2);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vsli_32 ($T3,$T0,32-$sigma0[1]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &veor ($T1,$T1,$T3); # sigma0(X[1..4])
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &veor ($T5,$T5,$T4);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &veor ($T5,$T5,$T4); # sigma1(X[14..15])
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &veor ($T5,$T5,$T4);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vld1_32 ("{$T0}","[$Ktbl,:128]!");
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &veor ($T5,$T5,$T4); # sigma1(X[16..17])
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vadd_i32 ($T0,$T0,@X[0]);
>> + while($#insns>=2) { eval(shift(@insns)); }
>> + &vst1_32 ("{$T0}","[$Xfer,:128]!");
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> +
>> + push(@X,shift(@X)); # "rotate" X[]
>> +}
>> +
>> +sub Xpreload()
>> +{ use integer;
>> + my $body = shift;
>> + my @insns = (&$body,&$body,&$body,&$body);
>> + my ($a,$b,$c,$d,$e,$f,$g,$h);
>> +
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vld1_32 ("{$T0}","[$Ktbl,:128]!");
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vrev32_8 (@X[0],@X[0]);
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + eval(shift(@insns));
>> + &vadd_i32 ($T0,$T0,@X[0]);
>> + foreach (@insns) { eval; } # remaining instructions
>> + &vst1_32 ("{$T0}","[$Xfer,:128]!");
>> +
>> + push(@X,shift(@X)); # "rotate" X[]
>> +}
>> +
>> +sub body_00_15 () {
>> + (
>> + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
>> + '&add ($h,$h,$t1)', # h+=X[i]+K[i]
>> + '&eor ($t1,$f,$g)',
>> + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
>> + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
>> + '&and ($t1,$t1,$e)',
>> + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
>> + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
>> + '&eor ($t1,$t1,$g)', # Ch(e,f,g)
>> + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
>> + '&eor ($t2,$a,$b)', # a^b, b^c in next round
>> + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
>> + '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
>> + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
>> + '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
>> + '&ldr ($t1,"[sp,#64]") if ($j==31)',
>> + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
>> + '&add ($d,$d,$h)', # d+=h
>> + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>> + '&eor ($t3,$t3,$b)', # Maj(a,b,c)
>> + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>> + )
>> +}
>> +
>> +$code.=<<___;
>> +#if __ARM_MAX_ARCH__>=7
> this will be compile on armv4 but gcc will not allow it
>
> we need to drop the neon code for older non v7 build
>

The .arch and .fpu declarations ensure that it can be built regardless
of the platform you are compiling for, unless you have a really old
toolchain.
The glue code ensures that the module can only be loaded if HWCAP_NEON is set.

Did you get errors trying to build it?

2015-03-24 13:06:42

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
<[email protected]> wrote:
> >> + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
>> >> + '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
>> >> + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
>> >> + '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
>> >> + '&ldr ($t1,"[sp,#64]") if ($j==31)',
>> >> + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
>> >> + '&add ($d,$d,$h)', # d+=h
>> >> + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>> >> + '&eor ($t3,$t3,$b)', # Maj(a,b,c)
>> >> + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>> >> + )
>> >> +}
>> >> +
>> >> +$code.=<<___;
>> >> +#if __ARM_MAX_ARCH__>=7
>> > this will be compile on armv4 but gcc will not allow it
>> >
>> > we need to drop the neon code for older non v7 build
>> >
>>
>> The .arch and .fpu declarations ensure that it can be built regardless
>> of the platform you are compiling for, unless you have a really old
>> toolchain.
> I known but does not work for me
>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>>
>> Did you get errors trying to build it?
>
> yes I do
>
> I use
>
> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
> (prerelease)
> Copyright (C) 2013 Free Software Foundation, Inc.
> This is free software; see the source for copying conditions. There is NO
> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>
> so it's not that old
>

Could you share the error log please?

2015-03-24 13:05:11

by Jean-Christophe PLAGNIOL-VILLARD

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

>> + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
> >> + '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
> >> + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
> >> + '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
> >> + '&ldr ($t1,"[sp,#64]") if ($j==31)',
> >> + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
> >> + '&add ($d,$d,$h)', # d+=h
> >> + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
> >> + '&eor ($t3,$t3,$b)', # Maj(a,b,c)
> >> + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> >> + )
> >> +}
> >> +
> >> +$code.=<<___;
> >> +#if __ARM_MAX_ARCH__>=7
> > this will be compile on armv4 but gcc will not allow it
> >
> > we need to drop the neon code for older non v7 build
> >
>
> The .arch and .fpu declarations ensure that it can be built regardless
> of the platform you are compiling for, unless you have a really old
> toolchain.
I known but does not work for me
> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>
> Did you get errors trying to build it?

yes I do

I use

arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
(prerelease)
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

so it's not that old

Best Regards,
J.

2015-03-24 14:46:59

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 24 March 2015 at 14:06, Ard Biesheuvel <[email protected]> wrote:
> On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
> <[email protected]> wrote:
>> >> + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
>>> >> + '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
>>> >> + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
>>> >> + '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
>>> >> + '&ldr ($t1,"[sp,#64]") if ($j==31)',
>>> >> + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
>>> >> + '&add ($d,$d,$h)', # d+=h
>>> >> + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>>> >> + '&eor ($t3,$t3,$b)', # Maj(a,b,c)
>>> >> + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>>> >> + )
>>> >> +}
>>> >> +
>>> >> +$code.=<<___;
>>> >> +#if __ARM_MAX_ARCH__>=7
>>> > this will be compile on armv4 but gcc will not allow it
>>> >
>>> > we need to drop the neon code for older non v7 build
>>> >
>>>
>>> The .arch and .fpu declarations ensure that it can be built regardless
>>> of the platform you are compiling for, unless you have a really old
>>> toolchain.
>> I known but does not work for me
>>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>>>
>>> Did you get errors trying to build it?
>>
>> yes I do
>>
>> I use
>>
>> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
>> (prerelease)
>> Copyright (C) 2013 Free Software Foundation, Inc.
>> This is free software; see the source for copying conditions. There is NO
>> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>>
>> so it's not that old
>>
>
> Could you share the error log please?

OK, I spotted one issue with this code:

arch/arm/crypto/sha256-core.S: Assembler messages:
arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
after fixup

This is caused by the fact that, when building the integer-only code
for an older architecture, the conditional compilation produces a
slightly bigger preceding function, and the symbol K256 is out of
range for the adr instruction.

@Jean-Christophe: is that the same problem that you hit?

@Andy: I propose we do something similar as in the bsaes code:

#ifdef __thumb__
#define adrl adr
#endif

and replace the offending line with

adrl r14,K256

@Herbert: we will need to respin this, so please don't pull it yet.

Regards,
--
Ard.

2015-03-24 17:05:43

by Jean-Christophe PLAGNIOL-VILLARD

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 15:46 Tue 24 Mar , Ard Biesheuvel wrote:
> On 24 March 2015 at 14:06, Ard Biesheuvel <[email protected]> wrote:
> > On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
> > <[email protected]> wrote:
> >> >> + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
> >>> >> + '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
> >>> >> + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
> >>> >> + '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
> >>> >> + '&ldr ($t1,"[sp,#64]") if ($j==31)',
> >>> >> + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
> >>> >> + '&add ($d,$d,$h)', # d+=h
> >>> >> + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
> >>> >> + '&eor ($t3,$t3,$b)', # Maj(a,b,c)
> >>> >> + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> >>> >> + )
> >>> >> +}
> >>> >> +
> >>> >> +$code.=<<___;
> >>> >> +#if __ARM_MAX_ARCH__>=7
> >>> > this will be compile on armv4 but gcc will not allow it
> >>> >
> >>> > we need to drop the neon code for older non v7 build
> >>> >
> >>>
> >>> The .arch and .fpu declarations ensure that it can be built regardless
> >>> of the platform you are compiling for, unless you have a really old
> >>> toolchain.
> >> I known but does not work for me
> >>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
> >>>
> >>> Did you get errors trying to build it?
> >>
> >> yes I do
> >>
> >> I use
> >>
> >> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
> >> (prerelease)
> >> Copyright (C) 2013 Free Software Foundation, Inc.
> >> This is free software; see the source for copying conditions. There is NO
> >> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
> >>
> >> so it's not that old
> >>
> >
> > Could you share the error log please?
>
> OK, I spotted one issue with this code:
>
> arch/arm/crypto/sha256-core.S: Assembler messages:
> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
> after fixup

yes exactly
>
> This is caused by the fact that, when building the integer-only code
> for an older architecture, the conditional compilation produces a
> slightly bigger preceding function, and the symbol K256 is out of
> range for the adr instruction.

Yeap I see that too when debuging
>
> @Jean-Christophe: is that the same problem that you hit?
>
> @Andy: I propose we do something similar as in the bsaes code:
>
> #ifdef __thumb__
> #define adrl adr
> #endif
>
> and replace the offending line with
>
> adrl r14,K256

Acked-by: Jean-Christophe PLAGNIOL-VILLARD <[email protected]>
Tested-by: Jean-Christophe PLAGNIOL-VILLARD <[email protected]>

on arm926ejs

Best Regards,
J.

2015-03-24 17:40:30

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 24 March 2015 at 18:05, Jean-Christophe PLAGNIOL-VILLARD
<[email protected]> wrote:
> On 15:46 Tue 24 Mar , Ard Biesheuvel wrote:
>> On 24 March 2015 at 14:06, Ard Biesheuvel <[email protected]> wrote:
>> > On 24 March 2015 at 14:05, Jean-Christophe PLAGNIOL-VILLARD
>> > <[email protected]> wrote:
>> >> >> + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
>> >>> >> + '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
>> >>> >> + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
>> >>> >> + '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
>> >>> >> + '&ldr ($t1,"[sp,#64]") if ($j==31)',
>> >>> >> + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
>> >>> >> + '&add ($d,$d,$h)', # d+=h
>> >>> >> + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
>> >>> >> + '&eor ($t3,$t3,$b)', # Maj(a,b,c)
>> >>> >> + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
>> >>> >> + )
>> >>> >> +}
>> >>> >> +
>> >>> >> +$code.=<<___;
>> >>> >> +#if __ARM_MAX_ARCH__>=7
>> >>> > this will be compile on armv4 but gcc will not allow it
>> >>> >
>> >>> > we need to drop the neon code for older non v7 build
>> >>> >
>> >>>
>> >>> The .arch and .fpu declarations ensure that it can be built regardless
>> >>> of the platform you are compiling for, unless you have a really old
>> >>> toolchain.
>> >> I known but does not work for me
>> >>> The glue code ensures that the module can only be loaded if HWCAP_NEON is set.
>> >>>
>> >>> Did you get errors trying to build it?
>> >>
>> >> yes I do
>> >>
>> >> I use
>> >>
>> >> arm-none-linux-gnueabi-gcc (Sourcery CodeBench Lite 2014.05-29) 4.8.3 20140320
>> >> (prerelease)
>> >> Copyright (C) 2013 Free Software Foundation, Inc.
>> >> This is free software; see the source for copying conditions. There is NO
>> >> warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
>> >>
>> >> so it's not that old
>> >>
>> >
>> > Could you share the error log please?
>>
>> OK, I spotted one issue with this code:
>>
>> arch/arm/crypto/sha256-core.S: Assembler messages:
>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>> after fixup
>
> yes exactly
>>
>> This is caused by the fact that, when building the integer-only code
>> for an older architecture, the conditional compilation produces a
>> slightly bigger preceding function, and the symbol K256 is out of
>> range for the adr instruction.
>
> Yeap I see that too when debuging
>>
>> @Jean-Christophe: is that the same problem that you hit?
>>
>> @Andy: I propose we do something similar as in the bsaes code:
>>
>> #ifdef __thumb__
>> #define adrl adr
>> #endif
>>
>> and replace the offending line with
>>
>> adrl r14,K256
>
> Acked-by: Jean-Christophe PLAGNIOL-VILLARD <[email protected]>
> Tested-by: Jean-Christophe PLAGNIOL-VILLARD <[email protected]>
>

Thanks!

@Sami, Andy: we need to respin the whole patch, including updated
OpenSSL upstream commit id :-(

Regards,
Ard.

2015-03-24 18:18:03

by Sami Tolvanen

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On Tue, Mar 24, 2015 at 06:40:29PM +0100, Ard Biesheuvel wrote:
> @Sami, Andy: we need to respin the whole patch, including updated
> OpenSSL upstream commit id :-(

Sure, I will send v3 once the changes are in OpenSSL. Thanks for testing!
Do you still prefer everything in one patch?

Sami

2015-03-25 06:49:57

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 24 March 2015 at 19:17, Sami Tolvanen <[email protected]> wrote:
> On Tue, Mar 24, 2015 at 06:40:29PM +0100, Ard Biesheuvel wrote:
>> @Sami, Andy: we need to respin the whole patch, including updated
>> OpenSSL upstream commit id :-(
>
> Sure, I will send v3 once the changes are in OpenSSL. Thanks for testing!
> Do you still prefer everything in one patch?
>

If it is easily done, you could add the .S_swapped file and the
related Makefile rule in a separate followup patch, yes.
But then, the second one might still be too big ...

2015-03-25 20:00:38

by Jean-Christophe PLAGNIOL-VILLARD

[permalink] [raw]

Subject: Re: [PATCH] arm: crypto: Add NEON optimized SHA-256

On 15:48 Mon 16 Mar , Sami Tolvanen wrote:
> Add Andy Polyakov's NEON optimized SHA-256 implementation.
>
> On Nexus 6, this implementation is ~2x faster than sha256-generic.

do you plan to add the sha512 from openssl too?

Whould be nice so armv4 can get faster implementatio too

Best Regards,
J.

2015-03-27 10:42:55

by Andy Polyakov

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

_______________________________________________
linux-arm-kernel mailing list
[email protected]
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

Attachments:

sha256-armv4.diff (838.00 B)
(No filename) (176.00 B)
Download all attachments

2015-03-27 10:44:46

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 27 March 2015 at 11:42, Andy Polyakov <[email protected]> wrote:
>>> Could you share the error log please?
>>
>> OK, I spotted one issue with this code:
>>
>> arch/arm/crypto/sha256-core.S: Assembler messages:
>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>> after fixup
>>
>> This is caused by the fact that, when building the integer-only code
>> for an older architecture, the conditional compilation produces a
>> slightly bigger preceding function, and the symbol K256 is out of
>> range for the adr instruction.
>>
>> @Jean-Christophe: is that the same problem that you hit?
>>
>> @Andy: I propose we do something similar as in the bsaes code:
>>
>> #ifdef __thumb__
>> #define adrl adr
>> #endif
>>
>> and replace the offending line with
>>
>> adrl r14,K256
>
> Sorry about delay. Yes, that would do. I did test all combinations, but
> all "my" combinations, i.e. without __KERNEL__ defined :-( And without
> __KERNEL__ there are few extra instructions in integer-only subroutine
> that "push" instruction in question code toward higher address, so that
> constant was ffffefc0, which can be encoded. Anyway, I've chosen to add
> that #define next to .thumb directive. See attached.
>
> Ard, you have mentioned that you've verified it on big-endian, but I've
> spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
> guess that it worked for you either because it was NEON that was tested
> (it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
> case it uses endian-neutral byte-by-byte data load). Can you confirm either?
>

I need to double check that, but my suspicion is that it was the latter.

2015-03-27 18:07:01

by Ard Biesheuvel

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

On 27 March 2015 at 11:44, Ard Biesheuvel <[email protected]> wrote:
> On 27 March 2015 at 11:42, Andy Polyakov <[email protected]> wrote:
>>>> Could you share the error log please?
>>>
>>> OK, I spotted one issue with this code:
>>>
>>> arch/arm/crypto/sha256-core.S: Assembler messages:
>>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>>> after fixup
>>>
>>> This is caused by the fact that, when building the integer-only code
>>> for an older architecture, the conditional compilation produces a
>>> slightly bigger preceding function, and the symbol K256 is out of
>>> range for the adr instruction.
>>>
>>> @Jean-Christophe: is that the same problem that you hit?
>>>
>>> @Andy: I propose we do something similar as in the bsaes code:
>>>
>>> #ifdef __thumb__
>>> #define adrl adr
>>> #endif
>>>
>>> and replace the offending line with
>>>
>>> adrl r14,K256
>>
>> Sorry about delay. Yes, that would do. I did test all combinations, but
>> all "my" combinations, i.e. without __KERNEL__ defined :-( And without
>> __KERNEL__ there are few extra instructions in integer-only subroutine
>> that "push" instruction in question code toward higher address, so that
>> constant was ffffefc0, which can be encoded. Anyway, I've chosen to add
>> that #define next to .thumb directive. See attached.
>>
>> Ard, you have mentioned that you've verified it on big-endian, but I've
>> spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
>> guess that it worked for you either because it was NEON that was tested
>> (it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
>> case it uses endian-neutral byte-by-byte data load). Can you confirm either?
>>
>
> I need to double check that, but my suspicion is that it was the latter.

Indeed, if I build for v7 I get

[ 0.269418] 00000000: 4e a5 c5 08 a6 56 6e 76 24 05 43 f8 fe b0 6f d4
[ 0.275261] 00000010: 57 77 7b e3 95 49 c4 01 64 36 af da 65 d2 33 0e
[ 0.281031] alg: hash: Test 1 failed for sha224-asm
[ 0.285315] 00000000: 9d 6a 5d e9 e1 6c 39 99 c7 14 84 0f 47 77 1f 36
[ 0.290912] 00000010: dc c2 97 a7 bd ef aa c3 6c 95 15 ae

which is indeed the integer code failing, and your attached patch fixes it.

2015-03-29 13:27:27

by Andy Polyakov

[permalink] [raw]

Subject: Re: [PATCHv2] arm: crypto: Add optimized SHA-256/224

>>>> arch/arm/crypto/sha256-core.S: Assembler messages:
>>>> arch/arm/crypto/sha256-core.S:1847: Error: invalid constant (ffffefb0)
>>>> after fixup
>>>>
>>>> This is caused by the fact that, when building the integer-only code
>>>> for an older architecture, the conditional compilation produces a
>>>> slightly bigger preceding function, and the symbol K256 is out of
>>>> range for the adr instruction.
>>>>
>>>> ... and replace the offending line with
>>>>
>>>> adrl r14,K256
>>>
>>> Ard, you have mentioned that you've verified it on big-endian, but I've
>>> spotted little-endian dependency (see #ifndef __ARMEB__ in attached). I
>>> guess that it worked for you either because it was NEON that was tested
>>> (it does work as is) or __LINUX_ARM_ARCH__ was less than 7 (in which
>>> case it uses endian-neutral byte-by-byte data load). Can you confirm either?
>
> Indeed, if I build for v7 I get
>
> [ 0.269418] 00000000: 4e a5 c5 08 a6 56 6e 76 24 05 43 f8 fe b0 6f d4
> [ 0.275261] 00000010: 57 77 7b e3 95 49 c4 01 64 36 af da 65 d2 33 0e
> [ 0.281031] alg: hash: Test 1 failed for sha224-asm
> [ 0.285315] 00000000: 9d 6a 5d e9 e1 6c 39 99 c7 14 84 0f 47 77 1f 36
> [ 0.290912] 00000010: dc c2 97 a7 bd ef aa c3 6c 95 15 ae
>
> which is indeed the integer code failing, and your attached patch fixes it.

Committed as
http://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff;h=51f8d095562f36cdaa6893597b5c609e943b0565.

2015-03-30 08:37:27

by Sami Tolvanen

[permalink] [raw]

Subject: [PATCHv3] arm: crypto: Add optimized SHA-256/224

Add Andy Polyakov's optimized assembly and NEON implementations for
SHA-256/224.

The sha256-armv4.pl script for generating the assembly code is from
OpenSSL commit 51f8d095562f36cdaa6893597b5c609e943b0565.

Compared to sha256-generic these implementations have the following
tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):

bs b/u sha256-neon sha256-asm
16 16 x1.32 x1.19
64 16 x1.27 x1.15
64 64 x1.36 x1.20
256 16 x1.22 x1.11
256 64 x1.36 x1.19
256 256 x1.59 x1.23
1024 16 x1.21 x1.10
1024 256 x1.65 x1.23
1024 1024 x1.76 x1.25
2048 16 x1.21 x1.10
2048 256 x1.66 x1.23
2048 1024 x1.78 x1.25
2048 2048 x1.79 x1.25
4096 16 x1.20 x1.09
4096 256 x1.66 x1.23
4096 1024 x1.79 x1.26
4096 4096 x1.82 x1.26
8192 16 x1.20 x1.09
8192 256 x1.67 x1.23
8192 1024 x1.80 x1.26
8192 4096 x1.85 x1.28
8192 8192 x1.85 x1.27

Where bs refers to block size and b/u to bytes per update.

Signed-off-by: Sami Tolvanen <[email protected]>
Cc: Andy Polyakov <[email protected]>

---
Changes since v2:
Updated sha256-armv4.pl and sha256-core.S_shipped

Changes since v1:
Rebased to Herbert's cryptodev tree
Include sha256-armv4.pl and use it to generate sha256-core.S
Add integer-only assembly version as sha256-asm
Add support for SHA-224 to the glue code
Change priority for sha256/224-ce to 300

---
arch/arm/crypto/Kconfig | 7
arch/arm/crypto/Makefile | 8
arch/arm/crypto/sha2-ce-glue.c | 4
arch/arm/crypto/sha256-armv4.pl | 713 ++++++
arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
arch/arm/crypto/sha256_glue.c | 246 ++
arch/arm/crypto/sha256_glue.h | 23
arch/arm/crypto/sha256_neon_glue.c | 172 +
8 files changed, 3945 insertions(+), 3 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index d63f319..458729d 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
SHA-256 secure hash standard (DFIPS 180-2) implemented
using special ARMv8 Crypto Extensions.

+config CRYPTO_SHA256_ARM
+ tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
+ select CRYPTO_HASH
+ help
+ SHA-256 secure hash standard (DFIPS 180-2) implemented
+ using optimized ARM assembler and NEON, when available.
+
config CRYPTO_SHA512_ARM_NEON
tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
depends on KERNEL_MODE_NEON
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 9a273bd..ef46e89 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
@@ -16,6 +17,8 @@ aes-arm-y := aes-armv4.o aes_glue.o
aes-arm-bs-y := aesbs-core.o aesbs-glue.o
sha1-arm-y := sha1-armv4-large.o sha1_glue.o
sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
+sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
@@ -28,4 +31,7 @@ quiet_cmd_perl = PERL $@
$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
$(call cmd,perl)

-.PRECIOUS: $(obj)/aesbs-core.S
+$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
+ $(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
index 9ffe8ad..0449eca 100644
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ b/arch/arm/crypto/sha2-ce-glue.c
@@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-ce",
- .cra_priority = 200,
+ .cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
@@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-ce",
- .cra_priority = 200,
+ .cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
new file mode 100644
index 0000000..fac0533
--- /dev/null
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -0,0 +1,716 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; $t0="r0";
+$inp="r1"; $t4="r1";
+$len="r2"; $t1="r2";
+$T1="r3"; $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
+ rev $t1,$t1
+# endif
+#else
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ ldrb $t2,[$inp,#2]
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+ ldr $t2,[$Ktbl],#4 @ *K256++
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
+ eor $t1,$f,$g
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
+ and $t1,$t1,$e
+ add $h,$h,$t2 @ h+=K256[i]
+ eor $t1,$t1,$g @ Ch(e,f,g)
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
+#endif
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
+___
+ &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
+ ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+ sub $Ktbl,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
+ add $A,$A,$t0
+ ldr $t0,[$t3,#12]
+ add $B,$B,$t1
+ ldr $t1,[$t3,#16]
+ add $C,$C,$t2
+ ldr $t2,[$t3,#20]
+ add $D,$D,$t0
+ ldr $t0,[$t3,#24]
+ add $E,$E,$t1
+ ldr $t1,[$t3,#28]
+ add $F,$F,$t2
+ ldr $inp,[sp,#17*4] @ pull inp
+ ldr $t2,[sp,#18*4] @ pull inp+len
+ add $G,$G,$t0
+ add $H,$H,$t1
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+ cmp $inp,$t2
+ sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub $H,sp,#16*4+16
+ adrl $Ktbl,K256
+ bic $H,$H,#15 @ align for 128-bit stores
+ mov $t2,sp
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ it eq
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ it ne
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ ittte ne
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
new file mode 100644
index 0000000..555a1a8
--- /dev/null
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -0,0 +1,2808 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <[email protected]> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+# define adrl adr
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,sha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r14,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ magic
+ eor r12,r12,r12
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 0
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 0
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 0==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 1
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 1
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 1==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 2
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 2
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 2==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 3
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 3
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 3==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 4
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 4
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 4==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 5
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 5==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 6
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 6
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 6==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 7
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 7==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 8
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 8
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 8==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 9
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 9
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 9==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 10
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 10
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 10==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 11
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 11
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 11==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 12
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 12
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 12==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 13
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 13
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 13==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 14
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 14
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 14==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 15
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 15
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 15==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+ @ ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#14*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#0*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#9*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 16==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#15*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#1*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#10*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 17==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#0*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#2*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#11*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 18==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#1*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#3*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#12*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 19==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#2*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#4*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#13*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 20==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#3*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#5*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#14*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 21==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#4*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#6*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#15*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 22==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#5*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#7*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#0*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 23==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#6*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#8*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#1*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 24==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#7*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#9*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#2*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 25==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#8*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#10*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#3*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 26==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#9*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#11*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#4*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 27==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#10*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#12*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#5*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 28==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#11*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#13*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#6*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 29==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#12*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#14*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#7*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 30==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#13*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#15*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#8*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 31==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r0,[r3,#0]
+ ldr r2,[r3,#4]
+ ldr r12,[r3,#8]
+ add r4,r4,r0
+ ldr r0,[r3,#12]
+ add r5,r5,r2
+ ldr r2,[r3,#16]
+ add r6,r6,r12
+ ldr r12,[r3,#20]
+ add r7,r7,r0
+ ldr r0,[r3,#24]
+ add r8,r8,r2
+ ldr r2,[r3,#28]
+ add r9,r9,r12
+ ldr r1,[sp,#17*4] @ pull inp
+ ldr r12,[sp,#18*4] @ pull inp+len
+ add r10,r10,r0
+ add r11,r11,r2
+ stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+ cmp r1,r12
+ sub r14,r14,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#19*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub r11,sp,#16*4+16
+ adrl r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
+ mov r12,sp
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+ vrev32.8 q0,q0 @ yes, even on
+ str r0,[sp,#64]
+ vrev32.8 q1,q1 @ big-endian
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4-r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b .L_00_48
+
+.align 4
+.L_00_48:
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne .L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ it eq
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ it ne
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8-r11}
+
+ ittte ne
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne r3,r5,r6
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {q0,q1},[r0]
+# ifdef __thumb2__
+ adr r3,.LARMv8
+ sub r3,r3,#.LARMv8-K256
+# else
+ adrl r3,K256
+# endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {q8-q9},[r1]!
+ vld1.8 {q10-q11},[r1]!
+ vld1.32 {q12},[r3]!
+ vrev32.8 q8,q8
+ vrev32.8 q9,q9
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vmov q14,q0 @ offload
+ vmov q15,q1
+ teq r1,r2
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vld1.32 {q13},[r3]
+ vadd.i32 q12,q12,q10
+ sub r3,r3,#256-16 @ rewind
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vadd.i32 q13,q13,q11
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vadd.i32 q0,q0,q14
+ vadd.i32 q1,q1,q15
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {q0,q1},[r0]
+
+ bx lr @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <[email protected]>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
new file mode 100644
index 0000000..25ceba6
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.c
@@ -0,0 +1,246 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using optimized ARM assembler and NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha256_ssse3_glue.c:
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+int sha256_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int sha224_init(struct shash_desc *desc)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ sctx->state[0] = SHA224_H0;
+ sctx->state[1] = SHA224_H1;
+ sctx->state[2] = SHA224_H2;
+ sctx->state[3] = SHA224_H3;
+ sctx->state[4] = SHA224_H4;
+ sctx->state[5] = SHA224_H5;
+ sctx->state[6] = SHA224_H6;
+ sctx->state[7] = SHA224_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int __sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len,
+ unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+int sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ return __sha256_update(desc, data, len, partial);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ /* We need to fill a whole block for __sha256_update */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_update(desc, padding, padlen, index);
+ }
+ __sha256_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+int sha256_export(struct shash_desc *desc, void *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+int sha256_import(struct shash_desc *desc, const void *in)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static struct shash_alg algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_update,
+ .final = sha256_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_update,
+ .final = sha224_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-asm",
+ .cra_priority = 150,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int __init sha256_mod_init(void)
+{
+ int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
+
+ if (res < 0)
+ return res;
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+ res = crypto_register_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+
+ if (res < 0)
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+ }
+
+ return res;
+}
+
+static void __exit sha256_mod_fini(void)
+{
+ crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+ crypto_unregister_shashes(sha256_neon_algs,
+ ARRAY_SIZE(sha256_neon_algs));
+}
+
+module_init(sha256_mod_init);
+module_exit(sha256_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
new file mode 100644
index 0000000..0312f4f
--- /dev/null
+++ b/arch/arm/crypto/sha256_glue.h
@@ -0,0 +1,23 @@
+#ifndef _CRYPTO_SHA256_GLUE_H
+#define _CRYPTO_SHA256_GLUE_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern struct shash_alg sha256_neon_algs[2];
+
+extern int sha256_init(struct shash_desc *desc);
+
+extern int sha224_init(struct shash_desc *desc);
+
+extern int __sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial);
+
+extern int sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len);
+
+extern int sha256_export(struct shash_desc *desc, void *out);
+
+extern int sha256_import(struct shash_desc *desc, const void *in);
+
+#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
new file mode 100644
index 0000000..d0f168d
--- /dev/null
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -0,0 +1,172 @@
+/*
+ * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright ? 2015 Google Inc.
+ *
+ * This file is based on sha512_neon_glue.c:
+ * Copyright ? 2014 Jussi Kivilinna <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+#include "sha256_glue.h"
+
+asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
+ unsigned int num_blks);
+
+
+static int __sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, unsigned int partial)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int done = 0;
+
+ sctx->count += len;
+
+ if (partial) {
+ done = SHA256_BLOCK_SIZE - partial;
+ memcpy(sctx->buf + partial, data, done);
+ sha256_block_data_order_neon(sctx->state, sctx->buf, 1);
+ }
+
+ if (len - done >= SHA256_BLOCK_SIZE) {
+ const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+ sha256_block_data_order_neon(sctx->state, data + done, rounds);
+ done += rounds * SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(sctx->buf, data + done, len - done);
+
+ return 0;
+}
+
+static int sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+ int res;
+
+ /* Handle the fast case right here */
+ if (partial + len < SHA256_BLOCK_SIZE) {
+ sctx->count += len;
+ memcpy(sctx->buf + partial, data, len);
+
+ return 0;
+ }
+
+ if (!may_use_simd()) {
+ res = __sha256_update(desc, data, len, partial);
+ } else {
+ kernel_neon_begin();
+ res = __sha256_neon_update(desc, data, len, partial);
+ kernel_neon_end();
+ }
+
+ return res;
+}
+
+/* Add padding and return the message digest. */
+static int sha256_neon_final(struct shash_desc *desc, u8 *out)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+ unsigned int i, index, padlen;
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+ /* save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64 and append length */
+ index = sctx->count % SHA256_BLOCK_SIZE;
+ padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
+
+ if (!may_use_simd()) {
+ sha256_update(desc, padding, padlen);
+ sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+ } else {
+ kernel_neon_begin();
+ /* We need to fill a whole block for __sha256_neon_update() */
+ if (padlen <= 56) {
+ sctx->count += padlen;
+ memcpy(sctx->buf + index, padding, padlen);
+ } else {
+ __sha256_neon_update(desc, padding, padlen, index);
+ }
+ __sha256_neon_update(desc, (const u8 *)&bits,
+ sizeof(bits), 56);
+ kernel_neon_end();
+ }
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Wipe context */
+ memzero_explicit(sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha224_neon_final(struct shash_desc *desc, u8 *out)
+{
+ u8 D[SHA256_DIGEST_SIZE];
+
+ sha256_neon_final(desc, D);
+
+ memcpy(out, D, SHA224_DIGEST_SIZE);
+ memzero_explicit(D, SHA256_DIGEST_SIZE);
+
+ return 0;
+}
+
+struct shash_alg sha256_neon_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_init,
+ .update = sha256_neon_update,
+ .final = sha256_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_init,
+ .update = sha256_neon_update,
+ .final = sha224_neon_final,
+ .export = sha256_export,
+ .import = sha256_import,
+ .descsize = sizeof(struct sha256_state),
+ .statesize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-neon",
+ .cra_priority = 250,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };

2015-04-01 12:43:44

[permalink] [raw]

Subject: Re: [PATCHv3] arm: crypto: Add optimized SHA-256/224

On Mon, Mar 30, 2015 at 09:37:27AM +0100, Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
>
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 51f8d095562f36cdaa6893597b5c609e943b0565.
>
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):

I cannot merge this because none of your emails are making it
to the mailing list so nobody except those on the cc list can
see the patch.

Please refer to section 7 in Documentation/SubmittingPatches.

Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2015-04-01 13:19:29

by Sami Tolvanen

[permalink] [raw]

Subject: Re: [PATCHv3] arm: crypto: Add optimized SHA-256/224

On Wed, Apr 01, 2015 at 08:43:25PM +0800, Herbert Xu wrote:
> I cannot merge this because none of your emails are making it
> to the mailing list so nobody except those on the cc list can
> see the patch.
>
> Please refer to section 7 in Documentation/SubmittingPatches.

Please find a hosted version of the patch here:

https://raw.githubusercontent.com/samitolvanen/kernelpatches/master/sha256.v3.patch

Sami

2015-04-03 10:04:45

[permalink] [raw]

Subject: Re: [PATCHv3] arm: crypto: Add optimized SHA-256/224

On Wed, Apr 01, 2015 at 02:19:24PM +0100, Sami Tolvanen wrote:
> On Wed, Apr 01, 2015 at 08:43:25PM +0800, Herbert Xu wrote:
> > I cannot merge this because none of your emails are making it
> > to the mailing list so nobody except those on the cc list can
> > see the patch.
> >
> > Please refer to section 7 in Documentation/SubmittingPatches.
>
> Please find a hosted version of the patch here:
>
> https://raw.githubusercontent.com/samitolvanen/kernelpatches/master/sha256.v3.patch

Patch applied. Thanks!
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt