2014-03-12 18:47:15

by chandramouli narayanan

Subject: [PATCH 1/2] SHA1 transform: x86_64 AVX2 optimization - assembly code-v2

This patch adds an x86_64 AVX2 optimization of the SHA1 transform
to the kernel's crypto support. The patch has been tested with the
3.14.0-rc1 kernel.

On a Haswell desktop, with turbo disabled and all CPUs running
at maximum frequency, tcrypt shows an AVX2 performance improvement
over the AVX implementation, ranging from 3% for 256-byte updates
to 16% for 1024-byte updates.

Signed-off-by: Chandramouli Narayanan <[email protected]>

diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
new file mode 100644
index 0000000..2f71294
--- /dev/null
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -0,0 +1,732 @@
+/*
+ Implement fast SHA-1 with AVX2 instructions. (x86_64)
+
+ This file is provided under a dual BSD/GPLv2 license. When using or
+ redistributing this file, you may do so under either license.
+
+ GPL LICENSE SUMMARY
+
+ Copyright(c) 2014 Intel Corporation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of version 2 of the GNU General Public License as
+ published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ Contact Information:
+ Ilya Albrekht <[email protected]>
+ Maxim Locktyukhin <[email protected]>
+ Ronen Zohar <[email protected]>
+ Chandramouli Narayanan <[email protected]>
+
+ BSD LICENSE
+
+ Copyright(c) 2014 Intel Corporation.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#---------------------
+#
+#SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+#
+#This implementation is based on the previous SSSE3 release:
+#Visit http://software.intel.com/en-us/articles/
+#and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
+#
+#Updates 20-byte SHA-1 record in 'hash' for even number of
+#'num_blocks' consecutive 64-byte blocks
+#
+#extern "C" void sha1_transform_avx2(
+# int *hash, const char* input, size_t num_blocks );
+#
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+#define CTX %rdi /* arg1 */
+#define BUF %rsi /* arg2 */
+#define CNT %rdx /* arg3 */
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %eax
+#define REG_E %edx
+#define REG_TB %ebx
+#define REG_TA %r12d
+#define REG_RA %rcx
+#define REG_RB %rsi
+#define REG_RC %rdi
+#define REG_RD %rax
+#define REG_RE %rdx
+#define REG_RTA %r12
+#define REG_RTB %rbx
+#define REG_T1 %ebp
+#define xmm_mov vmovups
+#define avx2_zeroupper vzeroupper
+#define RND_F1 1
+#define RND_F2 2
+#define RND_F3 3
+
+.macro REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set TB, REG_TB
+ .set TA, REG_TA
+
+ .set RA, REG_RA
+ .set RB, REG_RB
+ .set RC, REG_RC
+ .set RD, REG_RD
+ .set RE, REG_RE
+
+ .set RTA, REG_RTA
+ .set RTB, REG_RTB
+
+ .set T1, REG_T1
+.endm
+
+#define K_BASE %r8
+#define HASH_PTR %r9
+#define BUFFER_PTR %r10
+#define BUFFER_PTR2 %r13
+#define BUFFER_END %r11
+
+#define PRECALC_BUF %r14
+#define WK_BUF %r15
+
+#define W_TMP %xmm0
+#define WY_TMP %ymm0
+#define WY_TMP2 %ymm9
+
+# AVX2 variables
+#define WY0 %ymm3
+#define WY4 %ymm5
+#define WY08 %ymm7
+#define WY12 %ymm8
+#define WY16 %ymm12
+#define WY20 %ymm13
+#define WY24 %ymm14
+#define WY28 %ymm15
+
+#define YMM_SHUFB_BSWAP %ymm10
+
+# Keep 2 iterations precalculated at a time:
+# - 80 DWORDs per iteration * 2
+#define W_SIZE (80*2*2 +16)
+
+#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
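+# Layout note: each 32-byte precalc store holds W[t]+K for four
+# consecutive rounds, with the block read via BUFFER_PTR in the low
+# 128 bits and the block read via BUFFER_PTR2 in the high 128 bits;
+# the ((t)/80)*16 term of WK(t) selects between those two lanes.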
+
+#-----------------------------------------------------------------------------
+#
+# macro implements SHA-1 function's body for several 64-byte blocks
+# param: function's name
+#
+.macro SHA1_VECTOR_ASM name
+ ENTRY(\name)
+.align 4096
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ #FIXME: Save rsp
+
+ RESERVE_STACK = (W_SIZE*4 + 8+24)
+
+ # Align stack
+ mov %rsp, %rbx
+ and $(0x1000-1), %rbx
+ sub $(8+32), %rbx
+ sub %rbx, %rsp
+ push %rbx
+ sub $RESERVE_STACK, %rsp
+
+ avx2_zeroupper
+
+ lea K_XMM_AR(%rip), K_BASE
+
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+ lea 64(BUF), BUFFER_PTR2
+
+ shl $6, CNT # mul by 64
+ add BUF, CNT
+ add $64, CNT
+ mov CNT, BUFFER_END
+
+ cmp BUFFER_END, BUFFER_PTR2
+ cmovae K_BASE, BUFFER_PTR2
+
+ xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ avx2_zeroupper
+
+ add $RESERVE_STACK, %rsp
+ pop %rbx
+ add %rbx, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+
+ ENDPROC(\name)
+.endm
+
+#--------------------------------------------
+# macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
+#
+.macro SHA1_PIPELINED_MAIN_BODY
+
+ REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ mov %rsp, PRECALC_BUF
+ lea (2*4*80+32)(%rsp), WK_BUF
+
+ # Precalc WK for first 2 blocks
+ PRECALC_OFFSET = 0
+ .set i, 0
+ .rept 160
+ PRECALC i
+ .set i, i + 1
+ .endr
+ PRECALC_OFFSET = 128
+ xchg WK_BUF, PRECALC_BUF
+
+ .align 32
+ _loop:
+ # code loops through more than one block
+ # we use K_BASE value as a signal of a last block,
+ # it is set below by: cmovae K_BASE, BUFFER_PTR
+ cmp K_BASE, BUFFER_PTR
+ jne _begin
+ .align 32
+ jmp _end
+ .align 32
+ _begin:
+
+ # Do first block
+ RR 0
+ RR 2
+ RR 4
+ RR 6
+ RR 8
+
+ jmp _loop0
+_loop0:
+
+ RR 10
+ RR 12
+ RR 14
+ RR 16
+ RR 18
+
+ RR 20
+ RR 22
+ RR 24
+ RR 26
+ RR 28
+
+ RR 30
+ RR 32
+ RR 34
+ RR 36
+ RR 38
+
+ RR 40
+ RR 42
+ RR 44
+ RR 46
+ RR 48
+
+ RR 50
+ RR 52
+ RR 54
+ RR 56
+ RR 58
+
+ add $(2*64), BUFFER_PTR # move to next odd-64-byte block
+ cmp BUFFER_END, BUFFER_PTR # is current block the last one?
+ cmovae K_BASE, BUFFER_PTR # signal the last iteration smartly
+
+ RR 60
+ RR 62
+ RR 64
+ RR 66
+ RR 68
+
+ RR 70
+ RR 72
+ RR 74
+ RR 76
+ RR 78
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ cmp K_BASE, BUFFER_PTR # is current block the last one?
+ je _loop
+
+ mov TB, B
+
+ # Process second block
+
+ RR 0+80
+ RR 2+80
+ RR 4+80
+ RR 6+80
+ RR 8+80
+
+ RR 10+80
+ RR 12+80
+ RR 14+80
+ RR 16+80
+ RR 18+80
+
+ jmp _loop1
+_loop1:
+
+ RR 20+80
+ RR 22+80
+ RR 24+80
+ RR 26+80
+ RR 28+80
+
+ RR 30+80
+ RR 32+80
+ RR 34+80
+ RR 36+80
+ RR 38+80
+
+ jmp _loop2
+_loop2:
+
+ RR 40+80
+ RR 42+80
+ RR 44+80
+ RR 46+80
+ RR 48+80
+
+ RR 50+80
+ RR 52+80
+ RR 54+80
+ RR 56+80
+ RR 58+80
+
+
+ add $(2*64), BUFFER_PTR2 # move to next even-64-byte block
+
+ cmp BUFFER_END, BUFFER_PTR2 # is current block the last one
+ cmovae K_BASE, BUFFER_PTR # signal the last iteration smartly
+
+ jmp _loop3
+_loop3:
+
+ RR 60+80
+ RR 62+80
+ RR 64+80
+ RR 66+80
+ RR 68+80
+
+ RR 70+80
+ RR 72+80
+ RR 74+80
+ RR 76+80
+ RR 78+80
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ # Reset state for AVX2 reg permutation
+ mov A, TA
+ mov TB, A
+ mov C, TB
+ mov E, C
+ mov D, B
+ mov TA, D
+
+ REGALLOC
+
+ xchg WK_BUF, PRECALC_BUF
+
+ jmp _loop
+
+ .align 32
+ _end:
+
+.endm
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+.macro PRECALC r, s
+ .set i, \r
+
+ .if (i < 40)
+ .set K_XMM, 32*0
+ .elseif (i < 80)
+ .set K_XMM, 32*1
+ .elseif (i < 120)
+ .set K_XMM, 32*2
+ .else
+ .set K_XMM, 32*3
+ .endif
+
+ .if (i<32)
+ PRECALC_00_15 \s
+ .elseif (i<64)
+ PRECALC_16_31 \s
+ .elseif (i < 160)
+ PRECALC_32_79 \s
+ .endif
+.endm
+
+.macro PRECALC_RESET_WY
+ .set WY_00, WY0
+ .set WY_04, WY4
+ .set WY_08, WY08
+ .set WY_12, WY12
+ .set WY_16, WY16
+ .set WY_20, WY20
+ .set WY_24, WY24
+ .set WY_28, WY28
+ .set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+ # Rotate macros
+ .set WY_32, WY_28
+ .set WY_28, WY_24
+ .set WY_24, WY_20
+ .set WY_20, WY_16
+ .set WY_16, WY_12
+ .set WY_12, WY_08
+ .set WY_08, WY_04
+ .set WY_04, WY_00
+ .set WY_00, WY_32
+
+ # Define register aliases
+ .set WY, WY_00
+ .set WY_minus_04, WY_04
+ .set WY_minus_08, WY_08
+ .set WY_minus_12, WY_12
+ .set WY_minus_16, WY_16
+ .set WY_minus_20, WY_20
+ .set WY_minus_24, WY_24
+ .set WY_minus_28, WY_28
+ .set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+
+ .if (i == 0) # Initialize and rotate registers
+ PRECALC_RESET_WY
+ PRECALC_ROTATE_WY
+ .endif
+
+ # message scheduling pre-compute for rounds 0-15
+ .if ((i & 7) == 0)
+ #blended AVX2 and ALU instruction scheduling
+ #1 vector iteration per 8 rounds
+ vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+ .elseif ((i & 7) == 1)
+ vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2), WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
+ .elseif ((i & 7) == 4)
+ vpaddd K_XMM(K_BASE), WY, WY_TMP
+ .elseif ((i & 7) == 7)
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+
+.endm
+
+.macro PRECALC_16_31
+ # message scheduling pre-compute for rounds 16-31
+ # calculating last 32 w[i] values in 8 XMM registers
+ # pre-calculate K+w[i] values and store to mem
+ # for later load by ALU add instruction
+ #
+ # "brute force" vectorization for rounds 16-31 only
+ # due to w[i]->w[i-3] dependency
+ #
+ .if ((i & 7) == 0)
+ #blended AVX2 and ALU instruction scheduling
+ #1 vector iteration per 8 rounds
+ vpalignr $8, WY_minus_16, WY_minus_12, WY # w[i-14]
+ vpsrldq $4, WY_minus_04, WY_TMP # w[i-3]
+ .elseif ((i & 7) == 1)
+ vpxor WY_minus_08, WY, WY
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpxor WY_TMP, WY, WY
+ vpslldq $12, WY, WY_TMP2
+ .elseif ((i & 7) == 3)
+ vpslld $1, WY, WY_TMP
+ vpsrld $31, WY, WY
+ .elseif ((i & 7) == 4)
+ vpor WY, WY_TMP, WY_TMP
+ vpslld $2, WY_TMP2, WY
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY_TMP2, WY_TMP2
+ vpxor WY, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 7)
+ vpxor WY_TMP2, WY_TMP, WY
+ vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_32_79
+ # in SHA-1 specification:
+ # w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ # instead we do equal:
+ # w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ # allows more efficient vectorization
+ # since w[i]=>w[i-3] dependency is broken
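+ # (why this is equivalent, for i >= 32: rotate the original
+ #  recurrence right by one more bit and expand each term with
+ #  the same recurrence:
+ #  w[i] ror 2 = (w[i-3]^w[i-8]^w[i-14]^w[i-16]) ror 1
+ #             = (w[i-6]^w[i-11]^w[i-17]^w[i-19])
+ #             ^ (w[i-11]^w[i-16]^w[i-22]^w[i-24])
+ #             ^ (w[i-17]^w[i-22]^w[i-28]^w[i-30])
+ #             ^ (w[i-19]^w[i-24]^w[i-30]^w[i-32])
+ #             = w[i-6]^w[i-16]^w[i-28]^w[i-32], duplicates cancel)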
+ #
+ .if ((i & 7) == 0)
+ #blended AVX2 and ALU instruction scheduling
+ #1 vector iteration per 8 rounds
+ vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
+ .elseif ((i & 7) == 1)
+ vpxor WY_minus_28, WY, WY # W is W_minus_32 before xor
+ .elseif ((i & 7) == 2)
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 3)
+ vpxor WY_TMP, WY, WY
+ .elseif ((i & 7) == 4)
+ vpslld $2, WY, WY_TMP
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY, WY
+ vpor WY, WY_TMP, WY
+ .elseif ((i & 7) == 7)
+ vpaddd K_XMM(K_BASE), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro ROTATE_STATE
+ .set T_REG, E
+ .set E, D
+ .set D, C
+ .set C, B
+ .set B, TB
+ .set TB, A
+ .set A, T_REG
+
+ .set T_REG, RE
+ .set RE, RD
+ .set RD, RC
+ .set RC, RB
+ .set RB, RTB
+ .set RTB, RA
+ .set RA, T_REG
+.endm
+
+# Macro relies on saved ROUND_Fx
+
+.macro RND_FUN f, r
+ .if (\f == RND_F1)
+ ROUND_F1 \r
+ .elseif (\f == RND_F2)
+ ROUND_F2 \r
+ .elseif (\f == RND_F3)
+ ROUND_F3 \r
+ .endif
+.endm
+
+.macro RR r
+ .set round_id, (\r % 80)
+
+ .if (round_id == 0) # Precalculate F for first round
+ .set ROUND_FUNC, RND_F1
+ mov B, TB
+
+ rorx $(32-30), B, B # b>>>2
+ andn D, TB, T1
+ and C, TB
+ xor T1, TB
+ .endif
+
+ RND_FUN ROUND_FUNC, \r
+ ROTATE_STATE
+
+ .if (round_id == 18)
+ .set ROUND_FUNC, RND_F2
+ .elseif (round_id == 38)
+ .set ROUND_FUNC, RND_F3
+ .elseif (round_id == 58)
+ .set ROUND_FUNC, RND_F2
+ .endif
+
+ .set round_id, ( (\r+1) % 80)
+
+ RND_FUN ROUND_FUNC, (\r+1)
+ ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+ add WK(\r), E
+
+ andn C, A, T1 # ~b&d
+ lea (RE,RTB), E # Add F from the previous round
+
+ rorx $(32-5), A, TA # T2 = A >>> 5
+ rorx $(32-30),A, TB # b>>>2 for next round
+
+ PRECALC (\r) # message scheduling for next 2 blocks
+
+ # Calculate F for the next round
+ # (b & c) ^ andn[b, d]
+ and B, A # b&c
+ xor T1, A # F1 = (b&c) ^ (~b&d)
+
+ lea (RE,RTA), E # E += A >>> 5
+.endm
+
+.macro ROUND_F2 r
+ add WK(\r), E
+ lea (RE,RTB), E # Add F from the previous round
+
+ # Calculate F for the next round
+ rorx $(32-5), A, TA # T2 = A >>> 5
+ .if ((round_id) < 79)
+ rorx $(32-30), A, TB # b>>>2 for next round
+ .endif
+ PRECALC (\r) # message scheduling for next 2 blocks
+
+ .if ((round_id) < 79)
+ xor B, A
+ .endif
+
+ add TA, E # E += A >>> 5
+
+ .if ((round_id) < 79)
+ xor C, A
+ .endif
+.endm
+
+.macro ROUND_F3 r
+ add WK(\r), E
+ PRECALC (\r) # message scheduling for next 2 blocks
+
+ lea (RE,RTB), E # Add F from the previous round
+
+ mov B, T1
+ or A, T1
+
+ rorx $(32-5), A, TA # T2 = A >>> 5
+ rorx $(32-30), A, TB # b>>>2 for next round
+
+ # Calculate F for the next round
+ # (b and c) or (d and (b or c))
+ and C, T1
+ and B, A
+ or T1, A
+
+ add TA, E # E += A >>> 5
+
+.endm
+
+#----------------------
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+
+##----------------------
+.text
+
+/* AVX2 optimized implementation:
+ * extern "C" void sha1_transform_avx2(
+ * int *hash, const char* input, size_t num_blocks );
+ */
+SHA1_VECTOR_ASM sha1_transform_avx2
+#endif /* CONFIG_AS_AVX2 */


2014-03-14 05:40:35

by Marek Vasut

Subject: Re: [PATCH 1/2] SHA1 transform: x86_64 AVX2 optimization - assembly code-v2

On Wednesday, March 12, 2014 at 07:47:43 PM, chandramouli narayanan wrote:
> This patch adds an x86_64 AVX2 optimization of the SHA1 transform
> to the kernel's crypto support. The patch has been tested with the
> 3.14.0-rc1 kernel.
>
> On a Haswell desktop, with turbo disabled and all CPUs running
> at maximum frequency, tcrypt shows an AVX2 performance improvement
> over the AVX implementation, ranging from 3% for 256-byte updates
> to 16% for 1024-byte updates.
>
> Signed-off-by: Chandramouli Narayanan <[email protected]>
>
> diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
> b/arch/x86/crypto/sha1_avx2_x86_64_asm.S new file mode 100644
> index 0000000..2f71294
> --- /dev/null
> +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
> @@ -0,0 +1,732 @@
> +/*
> + Implement fast SHA-1 with AVX2 instructions. (x86_64)
> +
> + This file is provided under a dual BSD/GPLv2 license. When using or
> + redistributing this file, you may do so under either license.
> +
> + GPL LICENSE SUMMARY

Please see Documentation/CodingStyle chapter 8 for the preferred comment style.
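For reference, the multi-line comment style that chapter asks for looks roughly
like this (quoting from memory, so please double-check the document itself):

	/*
	 * This is the preferred style for multi-line
	 * comments in the Linux kernel source code.
	 * Please use it consistently.
	 */

Since .S files go through the C preprocessor, C-style block comments work fine
in assembly sources too; the license header at the top of your file already
uses them.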

[...]

> +*/
> +
> +#---------------------
> +#
> +#SHA-1 implementation with Intel(R) AVX2 instruction set extensions.

Ditto here.

> +#This implementation is based on the previous SSSE3 release:
> +#Visit http://software.intel.com/en-us/articles/
> +#and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
> +#
> +#Updates 20-byte SHA-1 record in 'hash' for even number of
> +#'num_blocks' consecutive 64-byte blocks
> +#
> +#extern "C" void sha1_transform_avx2(
> +# int *hash, const char* input, size_t num_blocks );
> +#
> +
> +#ifdef CONFIG_AS_AVX2

I wonder, is this large #ifdef around the entire file needed here? Can you not
just handle not compiling this file in the Makefile?

[...]

> + push %rbx
> + push %rbp
> + push %r12
> + push %r13
> + push %r14
> + push %r15
> + #FIXME: Save rsp
> +
> + RESERVE_STACK = (W_SIZE*4 + 8+24)
> +
> + # Align stack
> + mov %rsp, %rbx
> + and $(0x1000-1), %rbx
> + sub $(8+32), %rbx
> + sub %rbx, %rsp
> + push %rbx
> + sub $RESERVE_STACK, %rsp
> +
> + avx2_zeroupper
> +
> + lea K_XMM_AR(%rip), K_BASE

Can you please use tabs for indentation consistently (see the CodingStyle again)?

[...]

> + .align 32
> + _loop:
> + # code loops through more than one block
> + # we use K_BASE value as a signal of a last block,
> + # it is set below by: cmovae K_BASE, BUFFER_PTR
> + cmp K_BASE, BUFFER_PTR
> + jne _begin
> + .align 32
> + jmp _end
> + .align 32
> + _begin:
> +
> + # Do first block
> + RR 0
> + RR 2
> + RR 4
> + RR 6
> + RR 8
> +
> + jmp _loop0
> +_loop0:
> +
> + RR 10
> + RR 12
> + RR 14
> + RR 16
> + RR 18
> +
> + RR 20
> + RR 22
> + RR 24
> + RR 26
> + RR 28

Can you not generate these repeated sequences with some of the assembler's macro
voodoo, like .rept or some such?
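Something like this minimal sketch (untested, just to illustrate the idea) would
collapse each run of RR invocations:

	.set j, 10
	.rept 25
		RR j
		.set j, j + 2
	.endr

The PRECALC pre-computation earlier in the file already uses the same
.rept/.set pattern, so this would also keep the two loops consistent.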

[...]

> +.macro UPDATE_HASH hash, val
> + add \hash, \val
> + mov \val, \hash
> +.endm

This macro is defined below the point where it's used, which is a little
counter-intuitive.
[...]

> +
> +/* AVX2 optimized implementation:
> + * extern "C" void sha1_transform_avx2(
> + * int *hash, const char* input, size_t num_blocks );

What does this comment tell me?

Btw, you might want to squash 1/2 and 2/2, since they are not two logically
separate blocks, I think.

2014-03-17 15:58:52

by chandramouli narayanan

Subject: Re: [PATCH 1/2] SHA1 transform: x86_64 AVX2 optimization - assembly code-v2

On Fri, 2014-03-14 at 06:34 +0100, Marek Vasut wrote:
> On Wednesday, March 12, 2014 at 07:47:43 PM, chandramouli narayanan wrote:
> > This patch adds an x86_64 AVX2 optimization of the SHA1 transform
> > to the kernel's crypto support. The patch has been tested with the
> > 3.14.0-rc1 kernel.
> >
> > On a Haswell desktop, with turbo disabled and all CPUs running
> > at maximum frequency, tcrypt shows an AVX2 performance improvement
> > over the AVX implementation, ranging from 3% for 256-byte updates
> > to 16% for 1024-byte updates.
> >
> > Signed-off-by: Chandramouli Narayanan <[email protected]>
> >
> > diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
> > b/arch/x86/crypto/sha1_avx2_x86_64_asm.S new file mode 100644
> > index 0000000..2f71294
> > --- /dev/null
> > +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
> > @@ -0,0 +1,732 @@
> > +/*
> > + Implement fast SHA-1 with AVX2 instructions. (x86_64)
> > +
> > + This file is provided under a dual BSD/GPLv2 license. When using or
> > + redistributing this file, you may do so under either license.
> > +
> > + GPL LICENSE SUMMARY
>
> Please see Documentation/CodingStyle chapter 8 for the preferred comment style.
>
I will fix the coding style issues.

> [...]
>
> > +*/
> > +
> > +#---------------------
> > +#
> > +#SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
>
> Ditto here.
>
I will fix style issues.

> > +#This implementation is based on the previous SSSE3 release:
> > +#Visit http://software.intel.com/en-us/articles/
> > +#and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
> > +#
> > +#Updates 20-byte SHA-1 record in 'hash' for even number of
> > +#'num_blocks' consecutive 64-byte blocks
> > +#
> > +#extern "C" void sha1_transform_avx2(
> > +# int *hash, const char* input, size_t num_blocks );
> > +#
> > +
> > +#ifdef CONFIG_AS_AVX2
>
> I wonder, is this large #ifdef around the entire file needed here? Can you not
> just handle not compiling this file in the Makefile?
>
> [...]
>
> > + push %rbx
> > + push %rbp
> > + push %r12
> > + push %r13
> > + push %r14
> > + push %r15
> > + #FIXME: Save rsp
> > +
> > + RESERVE_STACK = (W_SIZE*4 + 8+24)
> > +
> > + # Align stack
> > + mov %rsp, %rbx
> > + and $(0x1000-1), %rbx
> > + sub $(8+32), %rbx
> > + sub %rbx, %rsp
> > + push %rbx
> > + sub $RESERVE_STACK, %rsp
> > +
> > + avx2_zeroupper
> > +
> > + lea K_XMM_AR(%rip), K_BASE
>
> Can you please use tabs for indentation consistently (see the CodingStyle again)?
>
Agreed.

> [...]
>
> > + .align 32
> > + _loop:
> > + # code loops through more than one block
> > + # we use K_BASE value as a signal of a last block,
> > + # it is set below by: cmovae K_BASE, BUFFER_PTR
> > + cmp K_BASE, BUFFER_PTR
> > + jne _begin
> > + .align 32
> > + jmp _end
> > + .align 32
> > + _begin:
> > +
> > + # Do first block
> > + RR 0
> > + RR 2
> > + RR 4
> > + RR 6
> > + RR 8
> > +
> > + jmp _loop0
> > +_loop0:
> > +
> > + RR 10
> > + RR 12
> > + RR 14
> > + RR 16
> > + RR 18
> > +
> > + RR 20
> > + RR 22
> > + RR 24
> > + RR 26
> > + RR 28
>
> Can you not generate these repeated sequences with some of the assembler's macro
> voodoo, like .rept or some such?
>
Will incorporate a .rept.

> [...]
>
> > +.macro UPDATE_HASH hash, val
> > + add \hash, \val
> > + mov \val, \hash
> > +.endm
>
> This macro is defined below the point where it's used, which is a little
> counter-intuitive.
> [...]
>
Will reorganize.

> > +
> > +/* AVX2 optimized implementation:
> > + * extern "C" void sha1_transform_avx2(
> > + * int *hash, const char* input, size_t num_blocks );
>
> What does this comment tell me?
>
> Btw, you might want to squash 1/2 and 2/2, since they are not two logically
> separate blocks, I think.
In summary, I will fix these issues, retest and post the next version.

thanks,
- mouli