From: chandramouli narayanan
Subject: [PATCH 1/2] SHA1 transform: x86_64 AVX2 optimization - assembly code
Date: Thu, 27 Feb 2014 09:06:55 -0800
Message-ID: <1393520815.7495.88.camel@pegasus.jf.intel.com>
Mime-Version: 1.0
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: 7bit
Cc: ilya.albrekht@intel.com, maxim.locktyukhin@intel.com, ronen.zohar@intel.com,
	wajdi.k.feghali@intel.com, tim.c.chen@linux.intel.com,
	linux-crypto@vger.kernel.org
To: herbert@gondor.apana.org.au, davem@davemloft.net, hpa@zytor.com
Return-path:
Received: from mga09.intel.com ([134.134.136.24]:55879 "EHLO mga09.intel.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1751063AbaB0RGU (ORCPT ); Thu, 27 Feb 2014 12:06:20 -0500
Sender: linux-crypto-owner@vger.kernel.org
List-ID:

This patch adds an x86_64 AVX2 optimization of the SHA-1 transform to the
kernel crypto support. The patch has been tested with the 3.14.0-rc1 kernel.

On a Haswell desktop, with turbo disabled and all CPUs running at maximum
frequency, tcrypt shows an AVX2 performance improvement over the AVX
implementation ranging from 3% for 256-byte updates to 16% for 1024-byte
updates.

Signed-off-by: Chandramouli Narayanan

diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
new file mode 100644
index 0000000..2f71294
--- /dev/null
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -0,0 +1,732 @@
+/*
+ Implement fast SHA-1 with AVX2 instructions. (x86_64)
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2014 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Ilya Albrekht
+ Maxim Locktyukhin
+ Ronen Zohar
+ Chandramouli Narayanan
+
+ BSD LICENSE
+
+ Copyright(c) 2014 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+ * Neither the name of Intel Corporation nor the names of its
+   contributors may be used to endorse or promote products derived
+   from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#---------------------
+#
+# SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+#
+# This implementation is based on the previous SSSE3 release:
+# Visit http://software.intel.com/en-us/articles/
+# and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
+#
+# Updates the 20-byte SHA-1 record in 'hash' for an even number of
+# 'num_blocks' consecutive 64-byte blocks.
+#
+# extern "C" void sha1_transform_avx2(
+#	int *hash, const char *input, size_t num_blocks );
+#
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+#define	CTX	%rdi	/* arg1 */
+#define	BUF	%rsi	/* arg2 */
+#define	CNT	%rdx	/* arg3 */
+
+#define	REG_A	%ecx
+#define	REG_B	%esi
+#define	REG_C	%edi
+#define	REG_D	%eax
+#define	REG_E	%edx
+#define	REG_TB	%ebx
+#define	REG_TA	%r12d
+#define	REG_RA	%rcx
+#define	REG_RB	%rsi
+#define	REG_RC	%rdi
+#define	REG_RD	%rax
+#define	REG_RE	%rdx
+#define	REG_RTA	%r12
+#define	REG_RTB	%rbx
+#define	REG_T1	%ebp
+#define	xmm_mov		vmovups
+#define	avx2_zeroupper	vzeroupper
+#define	RND_F1	1
+#define	RND_F2	2
+#define	RND_F3	3
+
+.macro REGALLOC
+	.set A, REG_A
+	.set B, REG_B
+	.set C, REG_C
+	.set D, REG_D
+	.set E, REG_E
+	.set TB, REG_TB
+	.set TA, REG_TA
+
+	.set RA, REG_RA
+	.set RB, REG_RB
+	.set RC, REG_RC
+	.set RD, REG_RD
+	.set RE, REG_RE
+
+	.set RTA, REG_RTA
+	.set RTB, REG_RTB
+
+	.set T1, REG_T1
+.endm
+
+#define K_BASE		%r8
+#define HASH_PTR	%r9
+#define BUFFER_PTR	%r10
+#define BUFFER_PTR2	%r13
+#define BUFFER_END	%r11
+
+#define PRECALC_BUF	%r14
+#define WK_BUF		%r15
+
+#define W_TMP		%xmm0
+#define WY_TMP		%ymm0
+#define WY_TMP2		%ymm9
+
+# AVX2 variables
+#define WY0		%ymm3
+#define WY4		%ymm5
+#define WY08		%ymm7
+#define WY12		%ymm8
+#define WY16		%ymm12
+#define WY20		%ymm13
+#define WY24		%ymm14
+#define WY28		%ymm15
+
+#define YMM_SHUFB_BSWAP	%ymm10
+
+# Keep 2 iterations precalculated at a time:
+#   - 80 DWORDs per iteration * 2
+#define W_SIZE		(80*2*2 +16)
+
+#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
+
+#-----------------------------------------------------------------------------
+#
+# macro implements SHA-1 function's body for several 64-byte blocks
+# param: function's name
+#
+.macro SHA1_VECTOR_ASM	name
+	ENTRY(\name)
+.align 4096
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	#FIXME: Save rsp
+
+	RESERVE_STACK = (W_SIZE*4 + 8+24)
+
+	# Align stack
+	mov	%rsp, %rbx
+	and	$(0x1000-1), %rbx
+	sub	$(8+32), %rbx
+	sub	%rbx, %rsp
+	push	%rbx
+	sub	$RESERVE_STACK, %rsp
+
+	avx2_zeroupper
+
+	lea	K_XMM_AR(%rip), K_BASE
+
+	mov	CTX, HASH_PTR
+	mov	BUF, BUFFER_PTR
+	lea	64(BUF), BUFFER_PTR2
+
+	shl	$6, CNT			# mul by 64
+	add	BUF, CNT
+	add	$64, CNT
+	mov	CNT, BUFFER_END
+
+	cmp	BUFFER_END, BUFFER_PTR2
+	cmovae	K_BASE, BUFFER_PTR2
+
+	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+	SHA1_PIPELINED_MAIN_BODY
+
+	avx2_zeroupper
+
+	add	$RESERVE_STACK, %rsp
+	pop	%rbx
+	add	%rbx, %rsp
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+
+	ret
+
+	ENDPROC(\name)
+.endm
+
+#--------------------------------------------
+# macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
+#
+.macro SHA1_PIPELINED_MAIN_BODY
+
+	REGALLOC
+
+	mov	(HASH_PTR), A
+	mov	4(HASH_PTR), B
+	mov	8(HASH_PTR), C
+	mov	12(HASH_PTR), D
+	mov	16(HASH_PTR), E
+
+	mov	%rsp, PRECALC_BUF
+	lea	(2*4*80+32)(%rsp), WK_BUF
+
+	# Precalc WK for first 2 blocks
+	PRECALC_OFFSET = 0
+	.set i, 0
+	.rept 160
+		PRECALC i
+		.set i, i + 1
+	.endr
+	PRECALC_OFFSET = 128
+	xchg	WK_BUF, PRECALC_BUF
+
+	.align 32
+_loop:
+	# code loops through more than one block
+	# we use K_BASE value as a signal of the last block,
+	# it is set below by: cmovae K_BASE, BUFFER_PTR
+	cmp	K_BASE, BUFFER_PTR
+	jne	_begin
+	.align 32
+	jmp	_end
+	.align 32
+_begin:
+
+	# Do first block
+	RR 0
+	RR 2
+	RR 4
+	RR 6
+	RR 8
+
+	jmp _loop0
+_loop0:
+
+	RR 10
+	RR 12
+	RR 14
+	RR 16
+	RR 18
+
+	RR 20
+	RR 22
+	RR 24
+	RR 26
+	RR 28
+
+	RR 30
+	RR 32
+	RR 34
+	RR 36
+	RR 38
+
+	RR 40
+	RR 42
+	RR 44
+	RR 46
+	RR 48
+
+	RR 50
+	RR 52
+	RR 54
+	RR 56
+	RR 58
+
+	add	$(2*64), BUFFER_PTR	# move to next odd-64-byte block
+	cmp	BUFFER_END, BUFFER_PTR	# is current block the last one?
+	cmovae	K_BASE, BUFFER_PTR	# signal the last iteration smartly
+
+	RR 60
+	RR 62
+	RR 64
+	RR 66
+	RR 68
+
+	RR 70
+	RR 72
+	RR 74
+	RR 76
+	RR 78
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	cmp	K_BASE, BUFFER_PTR	# is current block the last one?
+	je	_loop
+
+	mov	TB, B
+
+	# Process second block
+
+	RR 0+80
+	RR 2+80
+	RR 4+80
+	RR 6+80
+	RR 8+80
+
+	RR 10+80
+	RR 12+80
+	RR 14+80
+	RR 16+80
+	RR 18+80
+
+	jmp _loop1
+_loop1:
+
+	RR 20+80
+	RR 22+80
+	RR 24+80
+	RR 26+80
+	RR 28+80
+
+	RR 30+80
+	RR 32+80
+	RR 34+80
+	RR 36+80
+	RR 38+80
+
+	jmp _loop2
+_loop2:
+
+	RR 40+80
+	RR 42+80
+	RR 44+80
+	RR 46+80
+	RR 48+80
+
+	RR 50+80
+	RR 52+80
+	RR 54+80
+	RR 56+80
+	RR 58+80
+
+
+	add	$(2*64), BUFFER_PTR2	# move to next even-64-byte block
+
+	cmp	BUFFER_END, BUFFER_PTR2	# is current block the last one?
+	cmovae	K_BASE, BUFFER_PTR	# signal the last iteration smartly
+
+	jmp _loop3
+_loop3:
+
+	RR 60+80
+	RR 62+80
+	RR 64+80
+	RR 66+80
+	RR 68+80
+
+	RR 70+80
+	RR 72+80
+	RR 74+80
+	RR 76+80
+	RR 78+80
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	# Reset state for AVX2 reg permutation
+	mov	A, TA
+	mov	TB, A
+	mov	C, TB
+	mov	E, C
+	mov	D, B
+	mov	TA, D
+
+	REGALLOC
+
+	xchg	WK_BUF, PRECALC_BUF
+
+	jmp _loop
+
+	.align 32
+_end:
+
+.endm
+
+.macro UPDATE_HASH  hash, val
+	add	\hash, \val
+	mov	\val, \hash
+.endm
+
+.macro PRECALC r, s
+	.set i, \r
+
+	.if (i < 40)
+		.set K_XMM, 32*0
+	.elseif (i < 80)
+		.set K_XMM, 32*1
+	.elseif (i < 120)
+		.set K_XMM, 32*2
+	.else
+		.set K_XMM, 32*3
+	.endif
+
+	.if (i<32)
+		PRECALC_00_15	\s
+	.elseif (i<64)
+		PRECALC_16_31	\s
+	.elseif (i < 160)
+		PRECALC_32_79	\s
+	.endif
+.endm
+
+.macro PRECALC_RESET_WY
+	.set WY_00, WY0
+	.set WY_04, WY4
+	.set WY_08, WY08
+	.set WY_12, WY12
+	.set WY_16, WY16
+	.set WY_20, WY20
+	.set WY_24, WY24
+	.set WY_28, WY28
+	.set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+	# Rotate macros
+	.set WY_32, WY_28
+	.set WY_28, WY_24
+	.set WY_24, WY_20
+	.set WY_20, WY_16
+	.set WY_16, WY_12
+	.set WY_12, WY_08
+	.set WY_08, WY_04
+	.set WY_04, WY_00
+	.set WY_00, WY_32
+
+	# Define register aliases
+	.set WY, WY_00
+	.set WY_minus_04, WY_04
+	.set WY_minus_08, WY_08
+	.set WY_minus_12, WY_12
+	.set WY_minus_16, WY_16
+	.set WY_minus_20, WY_20
+	.set WY_minus_24, WY_24
+	.set WY_minus_28, WY_28
+	.set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+
+	.if (i == 0)	# Initialize and rotate registers
+		PRECALC_RESET_WY
+		PRECALC_ROTATE_WY
+	.endif
+
+	# message scheduling pre-compute for rounds 0-15
+	.if ((i & 7) == 0)
+		# blended AVX2 and ALU instruction scheduling
+		# 1 vector iteration per 8 rounds
+		vmovdqu	((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+	.elseif ((i & 7) == 1)
+		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2), WY_TMP, WY_TMP
+	.elseif ((i & 7) == 2)
+		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
+	.elseif ((i & 7) == 4)
+		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+	.elseif ((i & 7) == 7)
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+
+.endm
+
+.macro PRECALC_16_31
+	# message scheduling pre-compute for rounds 16-31
+	# calculating last 32 w[i] values in 8 XMM registers
+	# pre-calculate K+w[i] values and store to mem
+	# for later load by ALU add instruction
+	#
+	# "brute force" vectorization for rounds 16-31 only
+	# due to the w[i]->w[i-3] dependency
+	#
+	.if ((i & 7) == 0)
+		# blended AVX2 and ALU instruction scheduling
+		# 1 vector iteration per 8 rounds
+		vpalignr $8, WY_minus_16, WY_minus_12, WY	# w[i-14]
+		vpsrldq	$4, WY_minus_04, WY_TMP			# w[i-3]
+	.elseif ((i & 7) == 1)
+		vpxor	WY_minus_08, WY, WY
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 2)
+		vpxor	WY_TMP, WY, WY
+		vpslldq	$12, WY, WY_TMP2
+	.elseif ((i & 7) == 3)
+		vpslld	$1, WY, WY_TMP
+		vpsrld	$31, WY, WY
+	.elseif ((i & 7) == 4)
+		vpor	WY, WY_TMP, WY_TMP
+		vpslld	$2, WY_TMP2, WY
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY_TMP2, WY_TMP2
+		vpxor	WY, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 7)
+		vpxor	WY_TMP2, WY_TMP, WY
+		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC_32_79
+	# in SHA-1 specification:
+	# w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+	# instead we use the equivalent:
+	# w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+	# which allows more efficient vectorization
+	# since the w[i]=>w[i-3] dependency is broken
+	#
+	.if ((i & 7) == 0)
+		# blended AVX2 and ALU instruction scheduling
+		# 1 vector iteration per 8 rounds
+		vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
+	.elseif ((i & 7) == 1)
+		vpxor	WY_minus_28, WY, WY	# W is W_minus_32 before xor
+	.elseif ((i & 7) == 2)
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 3)
+		vpxor	WY_TMP, WY, WY
+	.elseif ((i & 7) == 4)
+		vpslld	$2, WY, WY_TMP
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY, WY
+		vpor	WY, WY_TMP, WY
+	.elseif ((i & 7) == 7)
+		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro ROTATE_STATE
+	.set T_REG, E
+	.set E, D
+	.set D, C
+	.set C, B
+	.set B, TB
+	.set TB, A
+	.set A, T_REG
+
+	.set T_REG, RE
+	.set RE, RD
+	.set RD, RC
+	.set RC, RB
+	.set RB, RTB
+	.set RTB, RA
+	.set RA, T_REG
+.endm
+
+# Macro relies on saved ROUND_Fx
+
+.macro RND_FUN f, r
+	.if (\f == RND_F1)
+		ROUND_F1	\r
+	.elseif (\f == RND_F2)
+		ROUND_F2	\r
+	.elseif (\f == RND_F3)
+		ROUND_F3	\r
+	.endif
+.endm
+
+.macro RR r
+	.set round_id, (\r % 80)
+
+	.if (round_id == 0)	# Precalculate F for the first round
+		.set ROUND_FUNC, RND_F1
+		mov	B, TB
+
+		rorx	$(32-30), B, B	# b>>>2
+		andn	D, TB, T1
+		and	C, TB
+		xor	T1, TB
+	.endif
+
+	RND_FUN ROUND_FUNC, \r
+	ROTATE_STATE
+
+	.if (round_id == 18)
+		.set ROUND_FUNC, RND_F2
+	.elseif (round_id == 38)
+		.set ROUND_FUNC, RND_F3
+	.elseif (round_id == 58)
+		.set ROUND_FUNC, RND_F2
+	.endif
+
+	.set round_id, ( (\r+1) % 80)
+
+	RND_FUN ROUND_FUNC, (\r+1)
+	ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+	add	WK(\r), E
+
+	andn	C, A, T1			# ~b&d
+	lea	(RE,RTB), E			# Add F from the previous round
+
+	rorx	$(32-5), A, TA			# T2 = A >>> 5
+	rorx	$(32-30), A, TB			# b>>>2 for next round
+
+	PRECALC	(\r)				# message scheduling for next 2 blocks
+
+	# Calculate F for the next round
+	# (b & c) ^ andn[b, d]
+	and	B, A				# b&c
+	xor	T1, A				# F1 = (b&c) ^ (~b&d)
+
+	lea	(RE,RTA), E			# E += A >>> 5
+.endm
+
+.macro ROUND_F2 r
+	add	WK(\r), E
+	lea	(RE,RTB), E			# Add F from the previous round
+
+	# Calculate F for the next round
+	rorx	$(32-5), A, TA			# T2 = A >>> 5
+	.if ((round_id) < 79)
+		rorx	$(32-30), A, TB		# b>>>2 for next round
+	.endif
+	PRECALC	(\r)				# message scheduling for next 2 blocks
+
+	.if ((round_id) < 79)
+		xor	B, A
+	.endif
+
+	add	TA, E				# E += A >>> 5
+
+	.if ((round_id) < 79)
+		xor	C, A
+	.endif
+.endm
+
+.macro ROUND_F3 r
+	add	WK(\r), E
+	PRECALC	(\r)				# message scheduling for next 2 blocks
+
+	lea	(RE,RTB), E			# Add F from the previous round
+
+	mov	B, T1
+	or	A, T1
+
+	rorx	$(32-5), A, TA			# T2 = A >>> 5
+	rorx	$(32-30), A, TB			# b>>>2 for next round
+
+	# Calculate F for the next round
+	# (b and c) or (d and (b or c))
+	and	C, T1
+	and	B, A
+	or	T1, A
+
+	add	TA, E				# E += A >>> 5
+
+.endm
+
+#----------------------
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+	.long K1, K1, K1, K1
+	.long K1, K1, K1, K1
+	.long K2, K2, K2, K2
+	.long K2, K2, K2, K2
+	.long K3, K3, K3, K3
+	.long K3, K3, K3, K3
+	.long K4, K4, K4, K4
+	.long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+
+##----------------------
+.text
+
+/* AVX2 optimized implementation:
+ *  extern "C" void sha1_transform_avx2(
+ *	int *hash, const char *input, size_t num_blocks );
+ */
+SHA1_VECTOR_ASM	sha1_transform_avx2
+#endif	/* CONFIG_AS_AVX2 */
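
For reference only (not part of the patch, and not the kernel glue code): the
F1/F2/F3 round functions, the K1..K4 constants and the message-schedule
identity quoted in PRECALC_32_79 above all correspond to the standard FIPS 180
SHA-1 definition. The scalar C sketch below, with a made-up helper name
sha1_block_ref, processes one 64-byte block and asserts the equivalent
recurrence w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 for i >= 32,
which is what lets the assembly compute the schedule eight dwords at a time in
YMM registers instead of chasing the w[i-3] dependency.

/*
 * Minimal scalar reference sketch, illustration only. It mirrors the
 * arithmetic of the assembly above (same round functions and constants),
 * not its two-block pipelining or the WK/PRECALC buffers.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t rol32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

static void sha1_block_ref(uint32_t state[5], const unsigned char block[64])
{
	uint32_t w[80], a, b, c, d, e, f, k, t;
	int i;

	/* Message schedule: big-endian loads, then the standard recurrence */
	for (i = 0; i < 16; i++)
		w[i] = ((uint32_t)block[4 * i] << 24) |
		       ((uint32_t)block[4 * i + 1] << 16) |
		       ((uint32_t)block[4 * i + 2] << 8) |
		       (uint32_t)block[4 * i + 3];
	for (i = 16; i < 80; i++)
		w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

	/* The w[i-3]-free form used by PRECALC_32_79 gives the same values */
	for (i = 32; i < 80; i++)
		assert(w[i] == rol32(w[i - 6] ^ w[i - 16] ^
				     w[i - 28] ^ w[i - 32], 2));

	a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4];

	for (i = 0; i < 80; i++) {
		if (i < 20) {				/* ROUND_F1 */
			f = (b & c) ^ (~b & d);
			k = 0x5a827999;			/* K1 */
		} else if (i < 40) {			/* ROUND_F2 */
			f = b ^ c ^ d;
			k = 0x6ed9eba1;			/* K2 */
		} else if (i < 60) {			/* ROUND_F3 */
			f = (b & c) | (d & (b | c));
			k = 0x8f1bbcdc;			/* K3 */
		} else {				/* ROUND_F2 again */
			f = b ^ c ^ d;
			k = 0xca62c1d6;			/* K4 */
		}

		t = rol32(a, 5) + f + e + k + w[i];
		e = d;
		d = c;
		c = rol32(b, 30);
		b = a;
		a = t;
	}

	/* Same effect as the UPDATE_HASH macro */
	state[0] += a; state[1] += b; state[2] += c;
	state[3] += d; state[4] += e;
}

Feeding the standard padded "abc" block through such a reference yields the
FIPS 180 example digest a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d; the
assembly additionally interleaves two blocks (BUFFER_PTR and BUFFER_PTR2) and
precalculates the K+w values one iteration ahead, which the sketch does not
attempt to model.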