From: Ard Biesheuvel Subject: [PATCH 4/5] crypto/arm64: sha3 - new implementation based on special instructions Date: Fri, 12 Jan 2018 13:15:21 +0000 Message-ID: <20180112131522.25663-5-ard.biesheuvel@linaro.org> References: <20180112131522.25663-1-ard.biesheuvel@linaro.org> Cc: herbert@gondor.apana.org.au, will.deacon@arm.com, catalin.marinas@arm.com, steve.capper@linaro.org, jgarzik@redhat.com, Ard Biesheuvel To: linux-arm-kernel@lists.infradead.org, linux-crypto@vger.kernel.org Return-path: Received: from mail-wr0-f195.google.com ([209.85.128.195]:36910 "EHLO mail-wr0-f195.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933581AbeALNPr (ORCPT ); Fri, 12 Jan 2018 08:15:47 -0500 Received: by mail-wr0-f195.google.com with SMTP id f8so5283386wre.4 for ; Fri, 12 Jan 2018 05:15:46 -0800 (PST) In-Reply-To: <20180112131522.25663-1-ard.biesheuvel@linaro.org> Sender: linux-crypto-owner@vger.kernel.org List-ID: Implement the various flavours of SHA3 using the new optional EOR3/RAX1/XAR/BCAX instructions introduced by ARMv8.2. 
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> --- arch/arm64/crypto/Kconfig | 6 + arch/arm64/crypto/Makefile | 3 + arch/arm64/crypto/sha3-ce-core.S | 224 ++++++++++++++++++++ arch/arm64/crypto/sha3-ce-glue.c | 156 ++++++++++++++ 4 files changed, 389 insertions(+) diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index aad288f4b9de..4f2974687606 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -35,6 +35,12 @@ config CRYPTO_SHA512_ARM64_CE select CRYPTO_HASH select CRYPTO_SHA512_ARM64 +config CRYPTO_SHA3_ARM64_CE + tristate "SHA3 digest algorithm (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SHA3 + config CRYPTO_GHASH_ARM64_CE tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index d7573d31d397..04eaf8b78816 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -17,6 +17,9 @@ sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o +obj-$(CONFIG_CRYPTO_SHA3_ARM64_CE) += sha3-ce.o +sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o + obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S new file mode 100644 index 000000000000..b0b3d68ef3d3 --- /dev/null +++ b/arch/arm64/crypto/sha3-ce-core.S @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include /* NOTE(review): header name stripped in this copy; presumably <linux/linkage.h> (ENTRY/ENDPROC) - confirm */ +#include /* NOTE(review): presumably <asm/assembler.h> (adr_l) - confirm */ + + .text + + .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 /* define .Lv<N>.2d and .Lv<N>.16b as the number N, for use by the .inst macros below */ + .set .Lv\b\().2d, \b + .set .Lv\b\().16b, \b + .endr + + .macro eor3, rd, rn, ra, rm /* each macro below emits its instruction as a raw .inst word; register numbers are placed at bits 0, 5, 10 and 16 */ + .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) + .endm + + .macro rax1, rd, rn, rm + .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro bcax, rd, rn, ra, rm /* note: the third macro argument lands in bits 10-14 and the fourth in bits 16-20 */ + .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) + .endm + + .macro xar, rd, rn, rm, imm6 /* imm6 is encoded at bits 10-15 */ + .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16) + .endm + + /* + * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int md_len); + */ +ENTRY(sha3_ce_transform) + /* load state: the 25 64-bit lanes at st go into v0-v24, one lane per register */ + mov x8, x0 + ld1 { v0.1d- v3.1d}, [x8], #32 + ld1 { v4.1d- v7.1d}, [x8], #32 + ld1 { v8.1d-v11.1d}, [x8], #32 + ld1 {v12.1d-v15.1d}, [x8], #32 + ld1 {v16.1d-v19.1d}, [x8], #32 + ld1 {v20.1d-v23.1d}, [x8], #32 + ld1 {v24.1d}, [x8] + +0: sub w2, w2, #1 + mov w8, #24 + adr_l x9, .Lsha3_rcon + + /* load input: XOR the first 56 bytes of the block into lanes v0-v6 */ + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v31.8b}, [x1], #24 + eor v0.8b, v0.8b, v25.8b + eor v1.8b, v1.8b, v26.8b + eor v2.8b, v2.8b, v27.8b + eor v3.8b, v3.8b, v28.8b + eor v4.8b, v4.8b, v29.8b + eor v5.8b, v5.8b, v30.8b + eor v6.8b, v6.8b, v31.8b + + tbnz x3, #6, 2f // bit 6 of the digest size (x3) set? SHA3-512 + + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v30.8b}, [x1], #16 + eor v7.8b, v7.8b, v25.8b + eor v8.8b, v8.8b, v26.8b + eor v9.8b, v9.8b, v27.8b + eor v10.8b, v10.8b, v28.8b + eor v11.8b, v11.8b, v29.8b + eor v12.8b, v12.8b, v30.8b + + tbnz x3, #4, 1f // bit 4 set? SHA3-384 or SHA3-224 + + // SHA3-256: four more lanes, 136 input bytes per block in total + ld1 {v25.8b-v28.8b}, [x1], #32 + eor v13.8b, v13.8b, v25.8b + eor v14.8b, v14.8b, v26.8b + eor v15.8b, v15.8b, v27.8b + eor v16.8b, v16.8b, v28.8b + b 3f + +1: tbz x3, #2, 3f // bit 2 cleared? 
SHA-384 + + // SHA3-224: five more lanes, 144 input bytes per block in total + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b}, [x1], #8 + eor v13.8b, v13.8b, v25.8b + eor v14.8b, v14.8b, v26.8b + eor v15.8b, v15.8b, v27.8b + eor v16.8b, v16.8b, v28.8b + eor v17.8b, v17.8b, v29.8b + b 3f + + // SHA3-512: two more lanes, 72 input bytes per block in total +2: ld1 {v25.8b-v26.8b}, [x1], #16 + eor v7.8b, v7.8b, v25.8b + eor v8.8b, v8.8b, v26.8b + +3: sub w8, w8, #1 // top of one round; w8 counts the rounds still to run + + eor3 v25.16b, v0.16b, v5.16b, v10.16b // v25-v29: each combines the five lanes v(c), v(c+5), ... v(c+20) + eor3 v25.16b, v25.16b, v15.16b, v20.16b + eor3 v26.16b, v1.16b, v6.16b, v11.16b + eor3 v26.16b, v26.16b, v16.16b, v21.16b + eor3 v27.16b, v2.16b, v7.16b, v12.16b + eor3 v27.16b, v27.16b, v17.16b, v22.16b + eor3 v28.16b, v3.16b, v8.16b, v13.16b + eor3 v28.16b, v28.16b, v18.16b, v23.16b + eor3 v29.16b, v4.16b, v9.16b, v14.16b + eor3 v29.16b, v29.16b, v19.16b, v24.16b + + rax1 v30.2d, v29.2d, v26.2d // bc[0] + rax1 v31.2d, v27.2d, v29.2d // bc[3] + rax1 v29.2d, v25.2d, v27.2d // bc[1] + rax1 v27.2d, v28.2d, v25.2d // bc[4] + rax1 v25.2d, v26.2d, v28.2d // bc[2] + + eor v0.8b, v0.8b, v30.8b + mov v26.16b, v1.16b // keep the old v1: it is overwritten next but still read by the last xar + xar v1.2d, v6.2d, v29.2d, (64 - 44) + xar v6.2d, v9.2d, v27.2d, (64 - 20) + xar v9.2d, v22.2d, v25.2d, (64 - 61) + xar v22.2d, v14.2d, v27.2d, (64 - 39) + xar v14.2d, v20.2d, v30.2d, (64 - 18) + xar v20.2d, v2.2d, v25.2d, (64 - 62) + xar v2.2d, v12.2d, v25.2d, (64 - 43) + xar v12.2d, v13.2d, v31.2d, (64 - 25) + xar v13.2d, v19.2d, v27.2d, (64 - 8) + xar v19.2d, v23.2d, v31.2d, (64 - 56) + xar v23.2d, v15.2d, v30.2d, (64 - 41) + xar v15.2d, v4.2d, v27.2d, (64 - 27) + xar v4.2d, v24.2d, v27.2d, (64 - 14) + xar v24.2d, v21.2d, v29.2d, (64 - 2) + xar v21.2d, v8.2d, v31.2d, (64 - 55) + xar v8.2d, v16.2d, v29.2d, (64 - 45) + xar v16.2d, v5.2d, v30.2d, (64 - 36) + xar v5.2d, v3.2d, v31.2d, (64 - 28) + xar v3.2d, v18.2d, v31.2d, (64 - 21) + xar v18.2d, v17.2d, v25.2d, (64 - 15) + xar v17.2d, v11.2d, v29.2d, (64 - 10) + xar v11.2d, v7.2d, v25.2d, (64 - 6) + xar v7.2d, v10.2d, v30.2d, (64 - 3) + xar v10.2d, v26.2d, v29.2d, (64 - 1) + + ld1 {v27.1d}, [x9], #8 // fetch this round's constant and advance x9 + + bcax 
v25.16b, v0.16b, v1.16b, v2.16b + bcax v26.16b, v1.16b, v2.16b, v3.16b + bcax v2.16b, v2.16b, v3.16b, v4.16b + bcax v3.16b, v3.16b, v4.16b, v0.16b + bcax v4.16b, v4.16b, v0.16b, v1.16b + mov v0.16b, v25.16b /* the first two results of each group go via v25/v26 so the old lanes stay readable above */ + mov v1.16b, v26.16b + + bcax v25.16b, v5.16b, v6.16b, v7.16b + bcax v26.16b, v6.16b, v7.16b, v8.16b + bcax v7.16b, v7.16b, v8.16b, v9.16b + bcax v8.16b, v8.16b, v9.16b, v5.16b + bcax v9.16b, v9.16b, v5.16b, v6.16b + mov v5.16b, v25.16b + mov v6.16b, v26.16b + + bcax v25.16b, v10.16b, v11.16b, v12.16b + bcax v26.16b, v11.16b, v12.16b, v13.16b + bcax v12.16b, v12.16b, v13.16b, v14.16b + bcax v13.16b, v13.16b, v14.16b, v10.16b + bcax v14.16b, v14.16b, v10.16b, v11.16b + mov v10.16b, v25.16b + mov v11.16b, v26.16b + + bcax v25.16b, v15.16b, v16.16b, v17.16b + bcax v26.16b, v16.16b, v17.16b, v18.16b + bcax v17.16b, v17.16b, v18.16b, v19.16b + bcax v18.16b, v18.16b, v19.16b, v15.16b + bcax v19.16b, v19.16b, v15.16b, v16.16b + mov v15.16b, v25.16b + mov v16.16b, v26.16b + + bcax v25.16b, v20.16b, v21.16b, v22.16b + bcax v26.16b, v21.16b, v22.16b, v23.16b + bcax v22.16b, v22.16b, v23.16b, v24.16b + bcax v23.16b, v23.16b, v24.16b, v20.16b + bcax v24.16b, v24.16b, v20.16b, v21.16b + mov v20.16b, v25.16b + mov v21.16b, v26.16b + + eor v0.8b, v0.8b, v27.8b /* XOR the round constant loaded from .Lsha3_rcon into lane 0 */ + + cbnz w8, 3b /* loop until all 24 rounds have run */ + cbnz w2, 0b /* then continue with the next input block, if any */ + + /* save state: write v0-v24 back to the 25 lanes at st */ + mov x8, x0 + st1 { v0.1d- v3.1d}, [x8], #32 + st1 { v4.1d- v7.1d}, [x8], #32 + st1 { v8.1d-v11.1d}, [x8], #32 + st1 {v12.1d-v15.1d}, [x8], #32 + st1 {v16.1d-v19.1d}, [x8], #32 + st1 {v20.1d-v23.1d}, [x8], #32 + st1 {v24.1d}, [x8] + ret +ENDPROC(sha3_ce_transform) + + .section ".rodata", "a" + .align 4 +.Lsha3_rcon: /* round constants; sha3_ce_transform reads one per round through x9 */ + .quad 0x0000000000000001, 0x0000000000008082 + .quad 0x800000000000808a, 0x8000000080008000 + .quad 0x000000000000808b, 0x0000000080000001 + .quad 0x8000000080008081, 0x8000000000008009 + .quad 0x000000000000008a, 0x0000000000000088 + .quad 0x0000000080008009, 0x000000008000000a + .quad 0x000000008000808b, 0x800000000000008b + .quad 0x8000000000008089, 
0x8000000000008003 + .quad 0x8000000000008002, 0x8000000000000080 + .quad 0x000000000000800a, 0x800000008000000a + .quad 0x8000000080008081, 0x8000000000008080 + .quad 0x0000000080000001, 0x8000000080008008 diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c new file mode 100644 index 000000000000..a81377c16f1c --- /dev/null +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha3-ce-glue.c - glue code for SHA3 using the ARMv8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include /* NOTE(review): the header names of these eight #include lines were stripped from this copy */ +#include +#include +#include +#include +#include +#include +#include + +MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks, + int md_len); /* assembly routine from sha3-ce-core.S; callers below pass the digest size as md_len */ + +static int sha3_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ /* Absorb len bytes from data: full blocks go through sha3_ce_transform(), any tail is buffered in sctx->buf. Always returns 0 on this path. */ + struct sha3_state *sctx = shash_desc_ctx(desc); + unsigned int digest_size = crypto_shash_digestsize(desc->tfm); + + if (!may_use_simd()) /* SIMD not usable here: defer to crypto_sha3_update() */ + return crypto_sha3_update(desc, data, len); + + if ((sctx->partial + len) >= sctx->rsiz) { /* buffered + new data fill at least one block of rsiz bytes */ + int blocks; + + if (sctx->partial) { /* top up the buffered partial block and absorb it first */ + int p = sctx->rsiz - sctx->partial; + + memcpy(sctx->buf + sctx->partial, data, p); + kernel_neon_begin(); + sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); + kernel_neon_end(); + + data += p; + len -= p; + sctx->partial = 0; + } + + blocks = len / sctx->rsiz; + len %= sctx->rsiz; /* len now holds only the tail that does not fill a block */ + + if (blocks) { /* absorb whole blocks straight from the caller's buffer */ + kernel_neon_begin(); + sha3_ce_transform(sctx->st, data, blocks, digest_size); + kernel_neon_end(); + data += blocks * sctx->rsiz; + } + } + + if (len) { /* keep the remaining tail for a later update/final call */ + memcpy(sctx->buf + sctx->partial, data, len); + sctx->partial += len; + } + return 0; +} + +static int 
sha3_ce_final(struct shash_desc *desc, u8 *out) +{ /* Pad the buffered data, absorb that last block, copy digest_size bytes of state to out (little-endian), then clear the state. */ + struct sha3_state *sctx = shash_desc_ctx(desc); + unsigned int digest_size = crypto_shash_digestsize(desc->tfm); + __le64 *digest = (__le64 *)out; + int i; + + if (!may_use_simd()) /* SIMD not usable here: defer to crypto_sha3_final() */ + return crypto_sha3_final(desc, out); + + sctx->buf[sctx->partial++] = 0x06; /* SHA-3 domain-separation suffix plus first padding bit */ + memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial); + sctx->buf[sctx->rsiz - 1] |= 0x80; /* final padding bit, in the last byte of the block */ + + kernel_neon_begin(); + sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); + kernel_neon_end(); + + for (i = 0; i < digest_size / 8; i++) + put_unaligned_le64(sctx->st[i], digest++); + + if (digest_size & 4) /* digest ends with a 4-byte remainder: emit the low 32 bits of the next lane */ + put_unaligned_le32(sctx->st[i], (__le32 *)digest); + + *sctx = (struct sha3_state){}; /* clear the whole hash state before returning */ + return 0; +} + +static struct shash_alg algs[] = { { + .digestsize = SHA3_224_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_ce_update, + .final = sha3_ce_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-224", + .base.cra_driver_name = "sha3-224-ce", + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, /* NOTE(review): no .base.cra_priority is set in any of the four entries - confirm these drivers outrank the generic sha3 */ + .base.cra_blocksize = SHA3_224_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA3_256_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_ce_update, + .final = sha3_ce_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-256", + .base.cra_driver_name = "sha3-256-ce", + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA3_256_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA3_384_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_ce_update, + .final = sha3_ce_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-384", + .base.cra_driver_name = "sha3-384-ce", + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA3_384_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA3_512_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_ce_update, + .final = sha3_ce_final, + .descsize = 
sizeof(struct sha3_state), + .base.cra_name = "sha3-512", + .base.cra_driver_name = "sha3-512-ce", + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA3_512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int __init sha3_ce_mod_init(void) +{ /* register all four SHA3 variants defined in algs[] */ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha3_ce_mod_fini(void) +{ /* unregister everything sha3_ce_mod_init() registered */ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_cpu_feature_match(SHA3, sha3_ce_mod_init); /* NOTE(review): presumably runs the init only on CPUs advertising the SHA3 feature - confirm */ +module_exit(sha3_ce_mod_fini); -- 2.11.0