From: chandramouli narayanan
Subject: Re: [PATCH 1/1] SHA1 transform: x86_64 AVX2 optimization -v3
Date: Tue, 18 Mar 2014 17:46:42 -0700
Message-ID: <1395190002.2367.17.camel@pegasus.jf.intel.com>
References: <1395187947.2367.9.camel@pegasus.jf.intel.com>
In-Reply-To: <1395187947.2367.9.camel@pegasus.jf.intel.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: 7bit
Cc: davem@davemloft.net, hpa@zytor.com, ilya.albrekht@intel.com,
    maxim.locktyukhin@intel.com, ronen.zohar@intel.com,
    wajdi.k.feghali@intel.com, tim.c.chen@linux.intel.com,
    linux-crypto@vger.kernel.org
To: herbert@gondor.apana.org.au

Sorry, there seems to be a problem with the patch. Let me retest the
patch as posted to the list and repost.

thanks
- mouli

On Tue, 2014-03-18 at 17:12 -0700, chandramouli narayanan wrote:
> This patch adds an x86_64 AVX2 optimization of the SHA1 transform to
> the crypto subsystem. It has been tested with the 3.14.0-rc1 kernel.
>
> On a Haswell desktop, with turbo disabled and all CPUs running at
> maximum frequency, tcrypt shows AVX2 performance improvements over the
> AVX implementation ranging from 3% for 256-byte updates to 16% for
> 1024-byte updates.
>
> The patch adds sha1_transform_avx2() plus the glue, build and
> configuration changes needed to hook the AVX2-optimized SHA1 transform
> into the crypto subsystem.
>
> Changes from the initial version of this patch, based on feedback from
> the community:
> a) Check for BMI2 in addition to AVX2 support, since
>    __sha1_transform_avx2() uses rorx.
> b) Since the module build already depends on 64BIT, the corresponding
>    check in the code is redundant and has been dropped.
> c) Coding style cleanup.
> d) Simplified the assembly code where macros were used repetitively.
>
> With regard to cleaning up the sha1-ssse3 module configuration along
> lines similar to Camellia:
>
> On a cursory look at the Camellia implementation, there are separate
> modules for AVX and AVX2. sha1-ssse3, however, is a single module that
> provides the available optimizations (SSSE3/AVX/AVX2) of the low-level
> SHA1 transform function, overriding the transform with the best variant
> the CPU supports. In the AVX2 case, for performance reasons that depend
> on the data block size, either the AVX or the AVX2 transform function
> is chosen at run-time, whichever suits best. The Makefile change
> therefore only appends the AVX2 transform object to the existing
> module, and the Kconfig build support is left as is.
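
As a quick reference while reading the diff: with the patch applied, the
run-time selection described above reduces to roughly the following
(condensed from the sha1_ssse3_glue.c hunks below, with the
CONFIG_AS_AVX/CONFIG_AS_AVX2 guards and the failure message dropped for
readability):

static void __sha1_transform_avx2(u32 *digest, const char *data,
                                  unsigned int rounds)
{
        /* AVX2 only pays off from 4 blocks (4*64 bytes) per update on */
        if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
                sha1_transform_avx2(digest, data, rounds);
        else
                sha1_transform_avx(digest, data, rounds);
}

static int __init sha1_ssse3_mod_init(void)
{
        char *algo_name;

        if (cpu_has_ssse3) {                    /* baseline */
                sha1_transform_asm = sha1_transform_ssse3;
                algo_name = "SSSE3";
        }
        if (avx_usable()) {                     /* AVX overrides SSSE3 */
                sha1_transform_asm = sha1_transform_avx;
                algo_name = "AVX";
                /* rorx is a BMI2 instruction, so AVX2 alone is not enough */
                if (cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI2)) {
                        sha1_transform_asm = __sha1_transform_avx2;
                        algo_name = "AVX2";
                }
        }
        if (sha1_transform_asm) {
                pr_info("Using %s optimized SHA-1 implementation\n",
                        algo_name);
                return crypto_register_shash(&alg);
        }
        return -ENODEV;
}

The #ifdef CONFIG_AS_AVX/CONFIG_AS_AVX2 guards in the actual hunks only
drop the branches the assembler cannot build; they do not change this
selection logic.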
> > Signed-off-by: Chandramouli Narayanan > > diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile > index 6ba54d6..61d6e28 100644 > --- a/arch/x86/crypto/Makefile > +++ b/arch/x86/crypto/Makefile > @@ -79,6 +79,9 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o > aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o > ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o > sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o > +ifeq ($(avx2_supported),yes) > +sha1-ssse3-y += sha1_avx2_x86_64_asm.o > +endif > crc32c-intel-y := crc32c-intel_glue.o > crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o > crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o > diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c > index 4a11a9d..bdd6295 100644 > --- a/arch/x86/crypto/sha1_ssse3_glue.c > +++ b/arch/x86/crypto/sha1_ssse3_glue.c > @@ -10,6 +10,7 @@ > * Copyright (c) Andrew McDonald > * Copyright (c) Jean-Francois Dive > * Copyright (c) Mathias Krause > + * Copyright (c) Chandramouli Narayanan > * > * This program is free software; you can redistribute it and/or modify it > * under the terms of the GNU General Public License as published by the Free > @@ -39,6 +40,12 @@ asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, > asmlinkage void sha1_transform_avx(u32 *digest, const char *data, > unsigned int rounds); > #endif > +#ifdef CONFIG_AS_AVX2 > +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ > + > +asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, > + unsigned int rounds); > +#endif > > static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); > > @@ -165,6 +172,19 @@ static int sha1_ssse3_import(struct shash_desc *desc, const void *in) > return 0; > } > > +#ifdef CONFIG_AS_AVX2 > +static void __sha1_transform_avx2(u32 *digest, const char *data, > + unsigned int rounds) > +{ > + > + /* Select the optimal transform based on data block size */ > + if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) > + sha1_transform_avx2(digest, data, rounds); > + else > + sha1_transform_avx(digest, data, rounds); > +} > +#endif > + > static struct shash_alg alg = { > .digestsize = SHA1_DIGEST_SIZE, > .init = sha1_ssse3_init, > @@ -189,7 +209,11 @@ static bool __init avx_usable(void) > { > u64 xcr0; > > +#if defined(CONFIG_AS_AVX2) > + if (!cpu_has_avx || !cpu_has_avx2 || !cpu_has_osxsave) > +#else > if (!cpu_has_avx || !cpu_has_osxsave) > +#endif > return false; > > xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); > @@ -205,23 +229,35 @@ static bool __init avx_usable(void) > > static int __init sha1_ssse3_mod_init(void) > { > + char *algo_name; > /* test for SSSE3 first */ > - if (cpu_has_ssse3) > + if (cpu_has_ssse3) { > sha1_transform_asm = sha1_transform_ssse3; > + algo_name = "SSSE3"; > + } > > #ifdef CONFIG_AS_AVX > /* allow AVX to override SSSE3, it's a little faster */ > - if (avx_usable()) > - sha1_transform_asm = sha1_transform_avx; > + if (avx_usable()) { > + if (cpu_has_avx) { > + sha1_transform_asm = sha1_transform_avx; > + algo_name = "AVX"; > + } > +#ifdef CONFIG_AS_AVX2 > + if (cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI2)) { > + /* allow AVX2 to override AVX, it's a little faster */ > + sha1_transform_asm = __sha1_transform_avx2; > + algo_name = "AVX2"; > + } > +#endif > + } > #endif > > if (sha1_transform_asm) { > - pr_info("Using %s optimized SHA-1 implementation\n", > - sha1_transform_asm == sha1_transform_ssse3 ? 
"SSSE3" > - : "AVX"); > + pr_info("Using %s optimized SHA-1 implementation\n", algo_name); > return crypto_register_shash(&alg); > } > - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); > + pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n"); > > return -ENODEV; > } > diff --git a/crypto/Kconfig b/crypto/Kconfig > index 7bcb70d..ce4012a 100644 > --- a/crypto/Kconfig > +++ b/crypto/Kconfig > @@ -491,14 +491,14 @@ config CRYPTO_SHA1 > SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2). > > config CRYPTO_SHA1_SSSE3 > - tristate "SHA1 digest algorithm (SSSE3/AVX)" > + tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2)" > depends on X86 && 64BIT > select CRYPTO_SHA1 > select CRYPTO_HASH > help > SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented > using Supplemental SSE3 (SSSE3) instructions or Advanced Vector > - Extensions (AVX), when available. > + Extensions (AVX/AVX2), when available. > > config CRYPTO_SHA256_SSSE3 > tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2)" > diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S > new file mode 100644 > index 0000000..559eb6c > --- /dev/null > +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S > @@ -0,0 +1,706 @@ > +/* > + * Implement fast SHA-1 with AVX2 instructions. (x86_64) > + * > + * This file is provided under a dual BSD/GPLv2 license. When using or > + * redistributing this file, you may do so under either license. > + * > + * GPL LICENSE SUMMARY > + * > + * Copyright(c) 2014 Intel Corporation. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of version 2 of the GNU General Public License as > + * published by the Free Software Foundation. > + * > + * This program is distributed in the hope that it will be useful, but > + * WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + * > + * Contact Information: > + * Ilya Albrekht > + * Maxim Locktyukhin > + * Ronen Zohar > + * Chandramouli Narayanan > + * > + * BSD LICENSE > + * > + * Copyright(c) 2014 Intel Corporation. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > + * > + */ > + > +/* > + * > + * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. > + * > + *This implementation is based on the previous SSSE3 release: > + *Visit http://software.intel.com/en-us/articles/ > + *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ > + * > + *Updates 20-byte SHA-1 record in 'hash' for even number of > + *'num_blocks' consecutive 64-byte blocks > + * > + *extern "C" void sha1_transform_avx2( > + * int *hash, const char* input, size_t num_blocks ); > + */ > + > +#include > + > +#define CTX %rdi /* arg1 */ > +#define BUF %rsi /* arg2 */ > +#define CNT %rdx /* arg3 */ > + > +#define REG_A %ecx > +#define REG_B %esi > +#define REG_C %edi > +#define REG_D %eax > +#define REG_E %edx > +#define REG_TB %ebx > +#define REG_TA %r12d > +#define REG_RA %rcx > +#define REG_RB %rsi > +#define REG_RC %rdi > +#define REG_RD %rax > +#define REG_RE %rdx > +#define REG_RTA %r12 > +#define REG_RTB %rbx > +#define REG_T1 %ebp > +#define xmm_mov vmovups > +#define avx2_zeroupper vzeroupper > +#define RND_F1 1 > +#define RND_F2 2 > +#define RND_F3 3 > + > +.macro REGALLOC > + .set A, REG_A > + .set B, REG_B > + .set C, REG_C > + .set D, REG_D > + .set E, REG_E > + .set TB, REG_TB > + .set TA, REG_TA > + > + .set RA, REG_RA > + .set RB, REG_RB > + .set RC, REG_RC > + .set RD, REG_RD > + .set RE, REG_RE > + > + .set RTA, REG_RTA > + .set RTB, REG_RTB > + > + .set T1, REG_T1 > +.endm > + > +#define K_BASE %r8 > +#define HASH_PTR %r9 > +#define BUFFER_PTR %r10 > +#define BUFFER_PTR2 %r13 > +#define BUFFER_END %r11 > + > +#define PRECALC_BUF %r14 > +#define WK_BUF %r15 > + > +#define W_TMP %xmm0 > +#define WY_TMP %ymm0 > +#define WY_TMP2 %ymm9 > + > +# AVX2 variables > +#define WY0 %ymm3 > +#define WY4 %ymm5 > +#define WY08 %ymm7 > +#define WY12 %ymm8 > +#define WY16 %ymm12 > +#define WY20 %ymm13 > +#define WY24 %ymm14 > +#define WY28 %ymm15 > + > +#define YMM_SHUFB_BSWAP %ymm10 > + > +/* Keep 2 iterations precalculated at a time: > + * - 80 DWORDs per iteration * 2 > + */ > +#define W_SIZE (80*2*2 +16) > + > +#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) > +#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) > + > + > +.macro UPDATE_HASH hash, val > + add \hash, \val > + mov \val, \hash > +.endm > + > +.macro PRECALC_RESET_WY > + .set WY_00, WY0 > + .set WY_04, WY4 > + .set WY_08, WY08 > + .set WY_12, WY12 > + .set WY_16, WY16 > + .set WY_20, WY20 > + .set WY_24, WY24 > + .set WY_28, WY28 > + .set WY_32, WY_00 > +.endm > + > +.macro PRECALC_ROTATE_WY > + /* Rotate macros */ > + .set WY_32, WY_28 > + .set WY_28, WY_24 > + .set WY_24, WY_20 > + .set WY_20, WY_16 > + .set WY_16, WY_12 > + .set WY_12, WY_08 > + .set WY_08, WY_04 > + .set WY_04, WY_00 > + .set WY_00, WY_32 > + > + /* Define register aliases */ > + .set WY, WY_00 > + .set WY_minus_04, WY_04 > + .set WY_minus_08, WY_08 > + .set WY_minus_12, WY_12 > + .set WY_minus_16, WY_16 > + .set WY_minus_20, WY_20 > + .set 
WY_minus_24, WY_24 > + .set WY_minus_28, WY_28 > + .set WY_minus_32, WY > +.endm > + > +.macro PRECALC_00_15 > + .if (i == 0) # Initialize and rotate registers > + PRECALC_RESET_WY > + PRECALC_ROTATE_WY > + .endif > + > + /* message scheduling pre-compute for rounds 0-15 */ > + .if ((i & 7) == 0) > + /* > + * blended AVX2 and ALU instruction scheduling > + * 1 vector iteration per 8 rounds > + */ > + vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP > + .elseif ((i & 7) == 1) > + vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ > + WY_TMP, WY_TMP > + .elseif ((i & 7) == 2) > + vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY > + .elseif ((i & 7) == 4) > + vpaddd K_XMM(K_BASE), WY, WY_TMP > + .elseif ((i & 7) == 7) > + vmovdqu WY_TMP, PRECALC_WK(i&~7) > + > + PRECALC_ROTATE_WY > + .endif > +.endm > + > +.macro PRECALC_16_31 > + /* > + * message scheduling pre-compute for rounds 16-31 > + * calculating last 32 w[i] values in 8 XMM registers > + * pre-calculate K+w[i] values and store to mem > + * for later load by ALU add instruction > + * > + * "brute force" vectorization for rounds 16-31 only > + * due to w[i]->w[i-3] dependency > + */ > + .if ((i & 7) == 0) > + /* > + * blended AVX2 and ALU instruction scheduling > + * 1 vector iteration per 8 rounds > + */ > + vpalignr $8, WY_minus_16, WY_minus_12, WY /* w[i-14] */ > + vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ > + .elseif ((i & 7) == 1) > + vpxor WY_minus_08, WY, WY > + vpxor WY_minus_16, WY_TMP, WY_TMP > + .elseif ((i & 7) == 2) > + vpxor WY_TMP, WY, WY > + vpslldq $12, WY, WY_TMP2 > + .elseif ((i & 7) == 3) > + vpslld $1, WY, WY_TMP > + vpsrld $31, WY, WY > + .elseif ((i & 7) == 4) > + vpor WY, WY_TMP, WY_TMP > + vpslld $2, WY_TMP2, WY > + .elseif ((i & 7) == 5) > + vpsrld $30, WY_TMP2, WY_TMP2 > + vpxor WY, WY_TMP, WY_TMP > + .elseif ((i & 7) == 7) > + vpxor WY_TMP2, WY_TMP, WY > + vpaddd K_XMM(K_BASE), WY, WY_TMP > + vmovdqu WY_TMP, PRECALC_WK(i&~7) > + > + PRECALC_ROTATE_WY > + .endif > +.endm > + > +.macro PRECALC_32_79 > + /* > + * in SHA-1 specification: > + * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 > + * instead we do equal: > + * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 > + * allows more efficient vectorization > + * since w[i]=>w[i-3] dependency is broken > + */ > + > + .if ((i & 7) == 0) > + /* > + * blended AVX2 and ALU instruction scheduling > + * 1 vector iteration per 8 rounds > + */ > + vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP > + .elseif ((i & 7) == 1) > + /* W is W_minus_32 before xor */ > + vpxor WY_minus_28, WY, WY > + .elseif ((i & 7) == 2) > + vpxor WY_minus_16, WY_TMP, WY_TMP > + .elseif ((i & 7) == 3) > + vpxor WY_TMP, WY, WY > + .elseif ((i & 7) == 4) > + vpslld $2, WY, WY_TMP > + .elseif ((i & 7) == 5) > + vpsrld $30, WY, WY > + vpor WY, WY_TMP, WY > + .elseif ((i & 7) == 7) > + vpaddd K_XMM(K_BASE), WY, WY_TMP > + vmovdqu WY_TMP, PRECALC_WK(i&~7) > + > + PRECALC_ROTATE_WY > + .endif > +.endm > + > +.macro PRECALC r, s > + .set i, \r > + > + .if (i < 40) > + .set K_XMM, 32*0 > + .elseif (i < 80) > + .set K_XMM, 32*1 > + .elseif (i < 120) > + .set K_XMM, 32*2 > + .else > + .set K_XMM, 32*3 > + .endif > + > + .if (i<32) > + PRECALC_00_15 \s > + .elseif (i<64) > + PRECALC_16_31 \s > + .elseif (i < 160) > + PRECALC_32_79 \s > + .endif > +.endm > + > +.macro ROTATE_STATE > + .set T_REG, E > + .set E, D > + .set D, C > + .set C, B > + .set B, TB > + .set TB, A > + .set A, T_REG > + > + .set T_REG, RE > + .set RE, RD > + .set RD, RC > + .set RC, RB > + .set RB, RTB > + .set RTB, RA > + 
.set RA, T_REG > +.endm > + > +/* Macro relies on saved ROUND_Fx */ > + > +.macro RND_FUN f, r > + .if (\f == RND_F1) > + ROUND_F1 \r > + .elseif (\f == RND_F2) > + ROUND_F2 \r > + .elseif (\f == RND_F3) > + ROUND_F3 \r > + .endif > +.endm > + > +.macro RR r > + .set round_id, (\r % 80) > + > + .if (round_id == 0) /* Precalculate F for first round */ > + .set ROUND_FUNC, RND_F1 > + mov B, TB > + > + rorx $(32-30), B, B /* b>>>2 */ > + andn D, TB, T1 > + and C, TB > + xor T1, TB > + .endif > + > + RND_FUN ROUND_FUNC, \r > + ROTATE_STATE > + > + .if (round_id == 18) > + .set ROUND_FUNC, RND_F2 > + .elseif (round_id == 38) > + .set ROUND_FUNC, RND_F3 > + .elseif (round_id == 58) > + .set ROUND_FUNC, RND_F2 > + .endif > + > + .set round_id, ( (\r+1) % 80) > + > + RND_FUN ROUND_FUNC, (\r+1) > + ROTATE_STATE > +.endm > + > +.macro ROUND_F1 r > + add WK(\r), E > + > + andn C, A, T1 /* ~b&d */ > + lea (RE,RTB), E /* Add F from the previous round */ > + > + rorx $(32-5), A, TA /* T2 = A >>> 5 */ > + rorx $(32-30),A, TB /* b>>>2 for next round */ > + > + PRECALC (\r) /* msg scheduling for next 2 blocks */ > + > + /* Calculate F for the next round > + * (b & c) ^ andn[b, d] > + */ > + and B, A /* b&c */ > + xor T1, A /* F1 = (b&c) ^ (~b&d) */ > + > + lea (RE,RTA), E /* E += A >>> 5 */ > +.endm > + > +.macro ROUND_F2 r > + add WK(\r), E > + lea (RE,RTB), E /* Add F from the previous round */ > + > + /* Calculate F for the next round */ > + rorx $(32-5), A, TA /* T2 = A >>> 5 */ > + .if ((round_id) < 79) > + rorx $(32-30), A, TB /* b>>>2 for next round */ > + .endif > + PRECALC (\r) /* msg scheduling for next 2 blocks */ > + > + .if ((round_id) < 79) > + xor B, A > + .endif > + > + add TA, E /* E += A >>> 5 */ > + > + .if ((round_id) < 79) > + xor C, A > + .endif > +.endm > + > +.macro ROUND_F3 r > + add WK(\r), E > + PRECALC (\r) /* msg scheduling for next 2 blocks */ > + > + lea (RE,RTB), E /* Add F from the previous round */ > + > + mov B, T1 > + or A, T1 > + > + rorx $(32-5), A, TA /* T2 = A >>> 5 */ > + rorx $(32-30), A, TB /* b>>>2 for next round */ > + > + /* Calculate F for the next round > + * (b and c) or (d and (b or c)) > + */ > + and C, T1 > + and B, A > + or T1, A > + > + add TA, E /* E += A >>> 5 */ > + > +.endm > + > +/* > + * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining > + */ > +.macro SHA1_PIPELINED_MAIN_BODY > + > + REGALLOC > + > + mov (HASH_PTR), A > + mov 4(HASH_PTR), B > + mov 8(HASH_PTR), C > + mov 12(HASH_PTR), D > + mov 16(HASH_PTR), E > + > + mov %rsp, PRECALC_BUF > + lea (2*4*80+32)(%rsp), WK_BUF > + > + # Precalc WK for first 2 blocks > + PRECALC_OFFSET = 0 > + .set i, 0 > + .rept 160 > + PRECALC i > + .set i, i + 1 > + .endr > + PRECALC_OFFSET = 128 > + xchg WK_BUF, PRECALC_BUF > + > + .align 32 > +_loop: > + /* code loops through more than one block > + * we use K_BASE value as a signal of a last block, > + * it is set below by: cmovae BUFFER_PTR, K_BASE > + */ > + cmp K_BASE, BUFFER_PTR > + jne _begin > + .align 32 > + jmp _end > + .align 32 > +_begin: > + > + /* Do first block > + * rounds: 0,2,4,6,8 > + */ > + .set j, 0 > + .rept 5 > + RR j > + .set j, j+2 > + .endr > + > + jmp _loop0 > +_loop0: > + > + /* rounds: > + * 10,12,14,16,18 > + * 20,22,24,26,28 > + * 30,32,34,36,38 > + * 40,42,44,46,48 > + * 50,52,54,56,58 > + */ > + .rept 25 > + RR j > + .set j, j+2 > + .endr > + > + add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ > + cmp BUFFER_END, BUFFER_PTR /* is current block the last one? 
*/ > + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ > + > + /* rounds > + * 60,62,64,66,68 > + * 70,72,74,76,78 > + */ > + .rept 10 > + RR j > + .set j, j+2 > + .endr > + > + UPDATE_HASH (HASH_PTR), A > + UPDATE_HASH 4(HASH_PTR), TB > + UPDATE_HASH 8(HASH_PTR), C > + UPDATE_HASH 12(HASH_PTR), D > + UPDATE_HASH 16(HASH_PTR), E > + > + cmp K_BASE, BUFFER_PTR /* is current block the last one? */ > + je _loop > + > + mov TB, B > + > + /* Process second block */ > + /* rounds > + * 0+80, 2+80, 4+80, 6+80, 8+80 > + * 10+80,12+80,14+80,16+80,18+80 > + */ > + > + .set j, 0 > + .rept 10 > + RR j+80 > + .set j, j+2 > + .endr > + > + jmp _loop1 > +_loop1: > + /* rounds > + * 20+80,22+80,24+80,26+80,28+80 > + * 30+80,32+80,34+80,36+80,38+80 > + */ > + .rept 10 > + RR j+80 > + .set j, j+2 > + .endr > + > + jmp _loop2 > +_loop2: > + > + /* rounds > + * 40+80,42+80,44+80,46+80,48+80 > + * 50+80,52+80,54+80,56+80,58+80 > + */ > + .rept 10 > + RR j+80 > + .set j, j+2 > + .endr > + > + add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ > + > + cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ > + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ > + > + jmp _loop3 > +_loop3: > + > + /* rounds > + * 60+80,62+80,64+80,66+80,68+80 > + * 70+80,72+80,74+80,76+80,78+80 > + */ > + .rept 10 > + RR j+80 > + .set j, j+2 > + .endr > + > + UPDATE_HASH (HASH_PTR), A > + UPDATE_HASH 4(HASH_PTR), TB > + UPDATE_HASH 8(HASH_PTR), C > + UPDATE_HASH 12(HASH_PTR), D > + UPDATE_HASH 16(HASH_PTR), E > + > + /* Reset state for AVX2 reg permutation */ > + mov A, TA > + mov TB, A > + mov C, TB > + mov E, C > + mov D, B > + mov TA, D > + > + REGALLOC > + > + xchg WK_BUF, PRECALC_BUF > + > + jmp _loop > + > + .align 32 > + _end: > + > +.endm > +/* > + * macro implements SHA-1 function's body for several 64-byte blocks > + * param: function's name > + */ > +.macro SHA1_VECTOR_ASM name > + ENTRY(\name) > + .align 4096 > + > + push %rbx > + push %rbp > + push %r12 > + push %r13 > + push %r14 > + push %r15 > + > + RESERVE_STACK = (W_SIZE*4 + 8+24) > + > + /* Align stack */ > + mov %rsp, %rbx > + and $(0x1000-1), %rbx > + sub $(8+32), %rbx > + sub %rbx, %rsp > + push %rbx > + sub $RESERVE_STACK, %rsp > + > + avx2_zeroupper > + > + lea K_XMM_AR(%rip), K_BASE > + > + mov CTX, HASH_PTR > + mov BUF, BUFFER_PTR > + lea 64(BUF), BUFFER_PTR2 > + > + shl $6, CNT /* mul by 64 */ > + add BUF, CNT > + add $64, CNT > + mov CNT, BUFFER_END > + > + cmp BUFFER_END, BUFFER_PTR2 > + cmovae K_BASE, BUFFER_PTR2 > + > + xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP > + > + SHA1_PIPELINED_MAIN_BODY > + > + avx2_zeroupper > + > + add $RESERVE_STACK, %rsp > + pop %rbx > + add %rbx, %rsp > + > + pop %r15 > + pop %r14 > + pop %r13 > + pop %r12 > + pop %rbp > + pop %rbx > + > + ret > + > + ENDPROC(\name) > +.endm > +/* > + */ > +.section .rodata > + > +#define K1 0x5a827999 > +#define K2 0x6ed9eba1 > +#define K3 0x8f1bbcdc > +#define K4 0xca62c1d6 > + > +.align 128 > +K_XMM_AR: > + .long K1, K1, K1, K1 > + .long K1, K1, K1, K1 > + .long K2, K2, K2, K2 > + .long K2, K2, K2, K2 > + .long K3, K3, K3, K3 > + .long K3, K3, K3, K3 > + .long K4, K4, K4, K4 > + .long K4, K4, K4, K4 > + > +BSWAP_SHUFB_CTL: > + .long 0x00010203 > + .long 0x04050607 > + .long 0x08090a0b > + .long 0x0c0d0e0f > + .long 0x00010203 > + .long 0x04050607 > + .long 0x08090a0b > + .long 0x0c0d0e0f > + > +/* > + */ > +.text > + > +SHA1_VECTOR_ASM sha1_transform_avx2 >
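
PS: for anyone reviewing the message-schedule rewrite in PRECALC_32_79
above, the identity it relies on can be sanity-checked with a few lines
of stand-alone user-space C (illustrative only, not part of the patch):

/*
 * Check that, for i >= 32,
 *   w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * equals
 *   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which is the form PRECALC_32_79 vectorizes.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t rol(uint32_t x, int n)
{
        return (x << n) | (x >> (32 - n));
}

int main(void)
{
        uint32_t w1[80], w2[80];
        int i;

        srand(1);
        for (i = 0; i < 16; i++)
                w1[i] = w2[i] = (uint32_t)rand() ^ ((uint32_t)rand() << 16);

        /* reference schedule, straight from the SHA-1 spec */
        for (i = 16; i < 80; i++)
                w1[i] = rol(w1[i-3] ^ w1[i-8] ^ w1[i-14] ^ w1[i-16], 1);

        /* rewritten schedule: spec form up to w[31], rol-2 form after */
        for (i = 16; i < 32; i++)
                w2[i] = rol(w2[i-3] ^ w2[i-8] ^ w2[i-14] ^ w2[i-16], 1);
        for (i = 32; i < 80; i++)
                w2[i] = rol(w2[i-6] ^ w2[i-16] ^ w2[i-28] ^ w2[i-32], 2);

        for (i = 0; i < 80; i++) {
                if (w1[i] != w2[i]) {
                        printf("mismatch at w[%d]\n", i);
                        return 1;
                }
        }
        printf("schedules match\n");
        return 0;
}

The two forms agree because expanding w[i-3], w[i-8], w[i-14] and
w[i-16] one more step makes the intermediate terms cancel pairwise,
which is what breaks the w[i] -> w[i-3] dependency mentioned in the
comment and allows more efficient vectorization.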