From: chandramouli narayanan
Subject: [PATCH 1/2] SHA1 transform: x86_64 AVX2 optimization - assembly code
Date: Thu, 27 Feb 2014 09:06:55 -0800
Message-ID: <1393520815.7495.88.camel@pegasus.jf.intel.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: 7bit
Cc: ilya.albrekht@intel.com, maxim.locktyukhin@intel.com, ronen.zohar@intel.com,
	wajdi.k.feghali@intel.com, tim.c.chen@linux.intel.com,
	linux-crypto@vger.kernel.org
To: herbert@gondor.apana.org.au, davem@davemloft.net, hpa@zytor.com
Return-path:
Received: from mga09.intel.com ([134.134.136.24]:55879 "EHLO mga09.intel.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1751063AbaB0RGU (ORCPT ); Thu, 27 Feb 2014 12:06:20 -0500
Sender: linux-crypto-owner@vger.kernel.org
List-ID:

This patch adds an x86_64 AVX2 optimization of the SHA-1 transform to the
kernel crypto support. The patch has been tested with the 3.14.0-rc1 kernel.

On a Haswell desktop, with turbo disabled and all CPUs running at maximum
frequency, tcrypt shows an AVX2 performance improvement over the AVX
implementation ranging from 3% for 256-byte updates to 16% for 1024-byte
updates.

Signed-off-by: Chandramouli Narayanan

diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
new file mode 100644
index 0000000..2f71294
--- /dev/null
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -0,0 +1,732 @@
+/*
+ Implement fast SHA-1 with AVX2 instructions. (x86_64)
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2014 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Ilya Albrekht
+ Maxim Locktyukhin
+ Ronen Zohar
+ Chandramouli Narayanan
+
+ BSD LICENSE
+
+ Copyright(c) 2014 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+ * Neither the name of Intel Corporation nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#---------------------
+#
+# SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+#
+# This implementation is based on the previous SSSE3 release:
+# Visit http://software.intel.com/en-us/articles/
+# and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
+#
+# Updates the 20-byte SHA-1 record in 'hash' for an even number of
+# 'num_blocks' consecutive 64-byte blocks.
+#
+# extern "C" void sha1_transform_avx2(
+#	int *hash, const char *input, size_t num_blocks );
+#
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+#define	CTX	%rdi	/* arg1 */
+#define	BUF	%rsi	/* arg2 */
+#define	CNT	%rdx	/* arg3 */
+
+#define	REG_A	%ecx
+#define	REG_B	%esi
+#define	REG_C	%edi
+#define	REG_D	%eax
+#define	REG_E	%edx
+#define	REG_TB	%ebx
+#define	REG_TA	%r12d
+#define	REG_RA	%rcx
+#define	REG_RB	%rsi
+#define	REG_RC	%rdi
+#define	REG_RD	%rax
+#define	REG_RE	%rdx
+#define	REG_RTA	%r12
+#define	REG_RTB	%rbx
+#define	REG_T1	%ebp
+#define	xmm_mov		vmovups
+#define	avx2_zeroupper	vzeroupper
+#define	RND_F1	1
+#define	RND_F2	2
+#define	RND_F3	3
+
+.macro REGALLOC
+	.set A, REG_A
+	.set B, REG_B
+	.set C, REG_C
+	.set D, REG_D
+	.set E, REG_E
+	.set TB, REG_TB
+	.set TA, REG_TA
+
+	.set RA, REG_RA
+	.set RB, REG_RB
+	.set RC, REG_RC
+	.set RD, REG_RD
+	.set RE, REG_RE
+
+	.set RTA, REG_RTA
+	.set RTB, REG_RTB
+
+	.set T1, REG_T1
+.endm
+
+#define K_BASE		%r8
+#define HASH_PTR	%r9
+#define BUFFER_PTR	%r10
+#define BUFFER_PTR2	%r13
+#define BUFFER_END	%r11
+
+#define PRECALC_BUF	%r14
+#define WK_BUF		%r15
+
+#define W_TMP		%xmm0
+#define WY_TMP		%ymm0
+#define WY_TMP2		%ymm9
+
+# AVX2 variables
+#define WY0		%ymm3
+#define WY4		%ymm5
+#define WY08		%ymm7
+#define WY12		%ymm8
+#define WY16		%ymm12
+#define WY20		%ymm13
+#define WY24		%ymm14
+#define WY28		%ymm15
+
+#define YMM_SHUFB_BSWAP	%ymm10
+
+# Keep 2 iterations precalculated at a time:
+#   - 80 DWORDs per iteration * 2
+#define W_SIZE		(80*2*2 +16)
+
+#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
+
+#-----------------------------------------------------------------------------
+#
+# macro implements SHA-1 function's body for several 64-byte blocks
+# param: function's name
+#
+.macro SHA1_VECTOR_ASM	name
+	ENTRY(\name)
+.align 4096
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	#FIXME: Save rsp
+
+	RESERVE_STACK = (W_SIZE*4 + 8+24)
+
+	# Align stack
+	mov	%rsp, %rbx
+	and	$(0x1000-1), %rbx
+	sub	$(8+32), %rbx
+	sub	%rbx, %rsp
+	push	%rbx
+	sub	$RESERVE_STACK, %rsp
+
+	avx2_zeroupper
+
+	lea	K_XMM_AR(%rip), K_BASE
+
+	mov	CTX, HASH_PTR
+	mov	BUF, BUFFER_PTR
+	lea	64(BUF), BUFFER_PTR2
+
+	shl	$6, CNT			# mul by 64
+	add	BUF, CNT
+	add	$64, CNT
+	mov	CNT, BUFFER_END
+
+	cmp	BUFFER_END, BUFFER_PTR2
+	cmovae	K_BASE, BUFFER_PTR2
+
+	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+	SHA1_PIPELINED_MAIN_BODY
+
+	avx2_zeroupper
+
+	add	$RESERVE_STACK, %rsp
+	pop	%rbx
+	add	%rbx, %rsp
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+
+	ret
+
+	ENDPROC(\name)
+.endm
+
+#--------------------------------------------
+# macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
+#
+.macro SHA1_PIPELINED_MAIN_BODY
+
+	REGALLOC
+
+	mov	(HASH_PTR), A
+	mov	4(HASH_PTR), B
+	mov	8(HASH_PTR), C
+	mov	12(HASH_PTR), D
+	mov	16(HASH_PTR), E
+
+	mov	%rsp, PRECALC_BUF
+	lea	(2*4*80+32)(%rsp), WK_BUF
+
+	# Precalc WK for first 2 blocks
+	PRECALC_OFFSET = 0
+	.set i, 0
+	.rept 160
+		PRECALC i
+		.set i, i + 1
+	.endr
+	PRECALC_OFFSET = 128
+	xchg	WK_BUF, PRECALC_BUF
+
+	.align 32
+_loop:
+	# code loops through more than one block
+	# we use K_BASE value as a signal of the last block,
+	# it is set below by: cmovae K_BASE, BUFFER_PTR
+	cmp	K_BASE, BUFFER_PTR
+	jne	_begin
+	.align 32
+	jmp	_end
+	.align 32
+_begin:
+
+	# Do first block
+	RR 0
+	RR 2
+	RR 4
+	RR 6
+	RR 8
+
+	jmp _loop0
+_loop0:
+
+	RR 10
+	RR 12
+	RR 14
+	RR 16
+	RR 18
+
+	RR 20
+	RR 22
+	RR 24
+	RR 26
+	RR 28
+
+	RR 30
+	RR 32
+	RR 34
+	RR 36
+	RR 38
+
+	RR 40
+	RR 42
+	RR 44
+	RR 46
+	RR 48
+
+	RR 50
+	RR 52
+	RR 54
+	RR 56
+	RR 58
+
+	add	$(2*64), BUFFER_PTR	# move to next odd-64-byte block
+	cmp	BUFFER_END, BUFFER_PTR	# is current block the last one?
+	cmovae	K_BASE, BUFFER_PTR	# signal the last iteration smartly
+
+	RR 60
+	RR 62
+	RR 64
+	RR 66
+	RR 68
+
+	RR 70
+	RR 72
+	RR 74
+	RR 76
+	RR 78
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	cmp	K_BASE, BUFFER_PTR	# is current block the last one?
+	je	_loop
+
+	mov	TB, B
+
+	# Process second block
+
+	RR 0+80
+	RR 2+80
+	RR 4+80
+	RR 6+80
+	RR 8+80
+
+	RR 10+80
+	RR 12+80
+	RR 14+80
+	RR 16+80
+	RR 18+80
+
+	jmp _loop1
+_loop1:
+
+	RR 20+80
+	RR 22+80
+	RR 24+80
+	RR 26+80
+	RR 28+80
+
+	RR 30+80
+	RR 32+80
+	RR 34+80
+	RR 36+80
+	RR 38+80
+
+	jmp _loop2
+_loop2:
+
+	RR 40+80
+	RR 42+80
+	RR 44+80
+	RR 46+80
+	RR 48+80
+
+	RR 50+80
+	RR 52+80
+	RR 54+80
+	RR 56+80
+	RR 58+80
+
+
+	add	$(2*64), BUFFER_PTR2	# move to next even-64-byte block
+
+	cmp	BUFFER_END, BUFFER_PTR2	# is current block the last one?
+	cmovae	K_BASE, BUFFER_PTR	# signal the last iteration smartly
+
+	jmp _loop3
+_loop3:
+
+	RR 60+80
+	RR 62+80
+	RR 64+80
+	RR 66+80
+	RR 68+80
+
+	RR 70+80
+	RR 72+80
+	RR 74+80
+	RR 76+80
+	RR 78+80
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	# Reset state for AVX2 reg permutation
+	mov	A, TA
+	mov	TB, A
+	mov	C, TB
+	mov	E, C
+	mov	D, B
+	mov	TA, D
+
+	REGALLOC
+
+	xchg	WK_BUF, PRECALC_BUF
+
+	jmp _loop
+
+	.align 32
+_end:
+
+.endm
+
+.macro UPDATE_HASH  hash, val
+	add	\hash, \val
+	mov	\val, \hash
+.endm
+
+.macro PRECALC r, s
+	.set i, \r
+
+	.if (i < 40)
+		.set K_XMM, 32*0
+	.elseif (i < 80)
+		.set K_XMM, 32*1
+	.elseif (i < 120)
+		.set K_XMM, 32*2
+	.else
+		.set K_XMM, 32*3
+	.endif
+
+	.if (i<32)
+		PRECALC_00_15	\s
+	.elseif (i<64)
+		PRECALC_16_31	\s
+	.elseif (i < 160)
+		PRECALC_32_79	\s
+	.endif
+.endm
+
+.macro PRECALC_RESET_WY
+	.set WY_00, WY0
+	.set WY_04, WY4
+	.set WY_08, WY08
+	.set WY_12, WY12
+	.set WY_16, WY16
+	.set WY_20, WY20
+	.set WY_24, WY24
+	.set WY_28, WY28
+	.set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+	# Rotate macros
+	.set WY_32, WY_28
+	.set WY_28, WY_24
+	.set WY_24, WY_20
+	.set WY_20, WY_16
+	.set WY_16, WY_12
+	.set WY_12, WY_08
+	.set WY_08, WY_04
+	.set WY_04, WY_00
+	.set WY_00, WY_32
+
+	# Define register aliases
+	.set WY, WY_00
+	.set WY_minus_04, WY_04
+	.set WY_minus_08, WY_08
+	.set WY_minus_12, WY_12
+	.set WY_minus_16, WY_16
+	.set WY_minus_20, WY_20
+	.set WY_minus_24, WY_24
+	.set WY_minus_28, WY_28
+	.set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+
+	.if (i == 0)	# Initialize and rotate registers
+		PRECALC_RESET_WY
+		PRECALC_ROTATE_WY
+	.endif
+
+	# message scheduling pre-compute for rounds 0-15
+	.if ((i & 7) == 0)
+		# blended AVX2 and ALU instruction scheduling
+		# 1 vector iteration per 8 rounds
+		vmovdqu	((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+	.elseif ((i & 7) == 1)
+		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2), WY_TMP, WY_TMP
+	.elseif ((i & 7) == 2)
+		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
+	.elseif ((i & 7) == 4)
+		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+	.elseif ((i & 7) == 7)
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+
+.endm
+
+.macro PRECALC_16_31
+	# message scheduling pre-compute for rounds 16-31
+	# calculating last 32 w[i] values in 8 XMM registers
+	# pre-calculate K+w[i] values and store to mem
+	# for later load by ALU add instruction
+	#
+	# "brute force" vectorization for rounds 16-31 only
+	# due to the w[i]->w[i-3] dependency
+	#
+	.if ((i & 7) == 0)
+		# blended AVX2 and ALU instruction scheduling
+		# 1 vector iteration per 8 rounds
+		vpalignr $8, WY_minus_16, WY_minus_12, WY	# w[i-14]
+		vpsrldq	$4, WY_minus_04, WY_TMP			# w[i-3]
+	.elseif ((i & 7) == 1)
+		vpxor	WY_minus_08, WY, WY
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 2)
+		vpxor	WY_TMP, WY, WY
+		vpslldq	$12, WY, WY_TMP2
+	.elseif ((i & 7) == 3)
+		vpslld	$1, WY, WY_TMP
+		vpsrld	$31, WY, WY
+	.elseif ((i & 7) == 4)
+		vpor	WY, WY_TMP, WY_TMP
+		vpslld	$2, WY_TMP2, WY
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY_TMP2, WY_TMP2
+		vpxor	WY, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 7)
+		vpxor	WY_TMP2, WY_TMP, WY
+		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC_32_79
+	# in SHA-1 specification:
+	# w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+	# instead we use the equivalent:
+	# w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+	# which allows more efficient vectorization
+	# since the w[i]=>w[i-3] dependency is broken
+	#
+	.if ((i & 7) == 0)
+		# blended AVX2 and ALU instruction scheduling
+		# 1 vector iteration per 8 rounds
+		vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
+	.elseif ((i & 7) == 1)
+		vpxor	WY_minus_28, WY, WY	# W is W_minus_32 before xor
+	.elseif ((i & 7) == 2)
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 3)
+		vpxor	WY_TMP, WY, WY
+	.elseif ((i & 7) == 4)
+		vpslld	$2, WY, WY_TMP
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY, WY
+		vpor	WY, WY_TMP, WY
+	.elseif ((i & 7) == 7)
+		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro ROTATE_STATE
+	.set T_REG, E
+	.set E, D
+	.set D, C
+	.set C, B
+	.set B, TB
+	.set TB, A
+	.set A, T_REG
+
+	.set T_REG, RE
+	.set RE, RD
+	.set RD, RC
+	.set RC, RB
+	.set RB, RTB
+	.set RTB, RA
+	.set RA, T_REG
+.endm
+
+# Macro relies on saved ROUND_Fx
+
+.macro RND_FUN f, r
+	.if (\f == RND_F1)
+		ROUND_F1	\r
+	.elseif (\f == RND_F2)
+		ROUND_F2	\r
+	.elseif (\f == RND_F3)
+		ROUND_F3	\r
+	.endif
+.endm
+
+.macro RR r
+	.set round_id, (\r % 80)
+
+	.if (round_id == 0)	# Precalculate F for the first round
+		.set ROUND_FUNC, RND_F1
+		mov	B, TB
+
+		rorx	$(32-30), B, B	# b>>>2
+		andn	D, TB, T1
+		and	C, TB
+		xor	T1, TB
+	.endif
+
+	RND_FUN ROUND_FUNC, \r
+	ROTATE_STATE
+
+	.if (round_id == 18)
+		.set ROUND_FUNC, RND_F2
+	.elseif (round_id == 38)
+		.set ROUND_FUNC, RND_F3
+	.elseif (round_id == 58)
+		.set ROUND_FUNC, RND_F2
+	.endif
+
+	.set round_id, ( (\r+1) % 80)
+
+	RND_FUN ROUND_FUNC, (\r+1)
+	ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+	add	WK(\r), E
+
+	andn	C, A, T1			# ~b&d
+	lea	(RE,RTB), E			# Add F from the previous round
+
+	rorx	$(32-5), A, TA			# T2 = A >>> 5
+	rorx	$(32-30), A, TB			# b>>>2 for next round
+
+	PRECALC	(\r)				# message scheduling for next 2 blocks
+
+	# Calculate F for the next round
+	# (b & c) ^ andn[b, d]
+	and	B, A				# b&c
+	xor	T1, A				# F1 = (b&c) ^ (~b&d)
+
+	lea	(RE,RTA), E			# E += A >>> 5
+.endm
+
+.macro ROUND_F2 r
+	add	WK(\r), E
+	lea	(RE,RTB), E			# Add F from the previous round
+
+	# Calculate F for the next round
+	rorx	$(32-5), A, TA			# T2 = A >>> 5
+	.if ((round_id) < 79)
+		rorx	$(32-30), A, TB		# b>>>2 for next round
+	.endif
+	PRECALC	(\r)				# message scheduling for next 2 blocks
+
+	.if ((round_id) < 79)
+		xor	B, A
+	.endif
+
+	add	TA, E				# E += A >>> 5
+
+	.if ((round_id) < 79)
+		xor	C, A
+	.endif
+.endm
+
+.macro ROUND_F3 r
+	add	WK(\r), E
+	PRECALC	(\r)				# message scheduling for next 2 blocks
+
+	lea	(RE,RTB), E			# Add F from the previous round
+
+	mov	B, T1
+	or	A, T1
+
+	rorx	$(32-5), A, TA			# T2 = A >>> 5
+	rorx	$(32-30), A, TB			# b>>>2 for next round
+
+	# Calculate F for the next round
+	# (b and c) or (d and (b or c))
+	and	C, T1
+	and	B, A
+	or	T1, A
+
+	add	TA, E				# E += A >>> 5
+
+.endm
+
+#----------------------
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+	.long K1, K1, K1, K1
+	.long K1, K1, K1, K1
+	.long K2, K2, K2, K2
+	.long K2, K2, K2, K2
+	.long K3, K3, K3, K3
+	.long K3, K3, K3, K3
+	.long K4, K4, K4, K4
+	.long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+
+##----------------------
+.text
+
+/* AVX2 optimized implementation:
+ *  extern "C" void sha1_transform_avx2(
+ *	int *hash, const char *input, size_t num_blocks );
+ */
+SHA1_VECTOR_ASM	sha1_transform_avx2
+#endif	/* CONFIG_AS_AVX2 */
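
For reference only (not part of the patch, and not the kernel glue code): the
F1/F2/F3 round functions, the K1..K4 constants and the message-schedule
identity quoted in PRECALC_32_79 above all correspond to the standard FIPS 180
SHA-1 definition. The scalar C sketch below, with a made-up helper name
sha1_block_ref, processes one 64-byte block and asserts the equivalent
recurrence w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 for i >= 32,
which is what lets the assembly compute the schedule eight dwords at a time in
YMM registers instead of chasing the w[i-3] dependency.

/*
 * Minimal scalar reference sketch, illustration only. It mirrors the
 * arithmetic of the assembly above (same round functions and constants),
 * not its two-block pipelining or the WK/PRECALC buffers.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

static void sha1_block_ref(uint32_t state[5], const unsigned char block[64])
{
	uint32_t w[80], a, b, c, d, e, f, k, t;
	int i;

	/* Message schedule: big-endian loads, then the standard recurrence */
	for (i = 0; i < 16; i++)
		w[i] = ((uint32_t)block[4 * i] << 24) |
		       ((uint32_t)block[4 * i + 1] << 16) |
		       ((uint32_t)block[4 * i + 2] << 8) |
		       (uint32_t)block[4 * i + 3];
	for (i = 16; i < 80; i++)
		w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

	/* The w[i-3]-free form used by PRECALC_32_79 gives the same values */
	for (i = 32; i < 80; i++)
		assert(w[i] == rol32(w[i - 6] ^ w[i - 16] ^
				     w[i - 28] ^ w[i - 32], 2));

	a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4];

	for (i = 0; i < 80; i++) {
		if (i < 20) {				/* ROUND_F1 */
			f = (b & c) ^ (~b & d);
			k = 0x5a827999;			/* K1 */
		} else if (i < 40) {			/* ROUND_F2 */
			f = b ^ c ^ d;
			k = 0x6ed9eba1;			/* K2 */
		} else if (i < 60) {			/* ROUND_F3 */
			f = (b & c) | (d & (b | c));
			k = 0x8f1bbcdc;			/* K3 */
		} else {				/* ROUND_F2 again */
			f = b ^ c ^ d;
			k = 0xca62c1d6;			/* K4 */
		}

		t = rol32(a, 5) + f + e + k + w[i];
		e = d;
		d = c;
		c = rol32(b, 30);
		b = a;
		a = t;
	}

	/* Same effect as the UPDATE_HASH macro */
	state[0] += a; state[1] += b; state[2] += c;
	state[3] += d; state[4] += e;
}

Feeding the standard padded "abc" block through such a reference yields the
FIPS 180 example digest a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d; the
assembly additionally interleaves two blocks (BUFFER_PTR and BUFFER_PTR2) and
precalculates the K+w values one iteration ahead, which the sketch does not
attempt to model.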