From: Jussi Kivilinna Subject: Re: [PATCH 1/3] crypto: twofish-avx - tune assembler code for ~10% more performance Date: Thu, 16 Aug 2012 17:30:49 +0300 Message-ID: <20120816173049.142814pli51b0qas@www.81.fi> References: <20120730113620.23527.64087.stgit@localhost6.localdomain6> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; DelSp="Yes"; format="flowed" Content-Transfer-Encoding: 7bit Cc: linux-crypto@vger.kernel.org, Johannes Goetzfried , "David S. Miller" To: Herbert Xu Return-path: Received: from sd-mail-sa-01.sanoma.fi ([158.127.18.161]:43928 "EHLO sd-mail-sa-01.sanoma.fi" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753442Ab2HPOaw (ORCPT ); Thu, 16 Aug 2012 10:30:52 -0400 In-Reply-To: <20120730113620.23527.64087.stgit@localhost6.localdomain6> Content-Disposition: inline Sender: linux-crypto-owner@vger.kernel.org List-ID: Please ignore this patchset, as it causes a performance regression on Bulldozer. I'll make a new patchset with this issue fixed. -Jussi Quoting Jussi Kivilinna : > Patch replaces 'movb' instructions with 'movzbl' to break false register > dependencies and interleaves instructions better for out-of-order scheduling. > > Also move common round code to separate function to reduce object size. > > Tested on Core i5-2450M. 
> > Cc: Johannes Goetzfried > Signed-off-by: Jussi Kivilinna > --- > arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 144 > +++++++++++++++++---------- > 1 file changed, 92 insertions(+), 52 deletions(-) > > diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S > b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S > index 35f4557..42b27b7 100644 > --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S > +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S > @@ -47,15 +47,22 @@ > #define RC2 %xmm6 > #define RD2 %xmm7 > > -#define RX %xmm8 > -#define RY %xmm9 > +#define RX0 %xmm8 > +#define RY0 %xmm9 > > -#define RK1 %xmm10 > -#define RK2 %xmm11 > +#define RX1 %xmm10 > +#define RY1 %xmm11 > + > +#define RK1 %xmm12 > +#define RK2 %xmm13 > + > +#define RT %xmm14 > > #define RID1 %rax > +#define RID1d %eax > #define RID1b %al > #define RID2 %rbx > +#define RID2d %ebx > #define RID2b %bl > > #define RGI1 %rdx > @@ -73,40 +80,45 @@ > #define RGS3d %r10d > > > -#define lookup_32bit(t0, t1, t2, t3, src, dst) \ > - movb src ## bl, RID1b; \ > - movb src ## bh, RID2b; \ > +#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \ > + movzbl src ## bl, RID1d; \ > + movzbl src ## bh, RID2d; \ > + shrq $16, src; \ > movl t0(CTX, RID1, 4), dst ## d; \ > xorl t1(CTX, RID2, 4), dst ## d; \ > - shrq $16, src; \ > - movb src ## bl, RID1b; \ > - movb src ## bh, RID2b; \ > + movzbl src ## bl, RID1d; \ > + movzbl src ## bh, RID2d; \ > + interleave_op(il_reg); \ > xorl t2(CTX, RID1, 4), dst ## d; \ > xorl t3(CTX, RID2, 4), dst ## d; > > +#define dummy(d) /* do nothing */ > + > +#define shr_next(reg) \ > + shrq $16, reg; > + > #define G(a, x, t0, t1, t2, t3) \ > vmovq a, RGI1; \ > - vpsrldq $8, a, x; \ > - vmovq x, RGI2; \ > + vpextrq $1, a, RGI2; \ > \ > - lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \ > - shrq $16, RGI1; \ > - lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \ > - shlq $32, RGS2; \ > - orq RGS1, RGS2; \ > + lookup_32bit(t0, t1, t2, t3, RGI1, RGS1, shr_next, RGI1); \ > + vmovd 
RGS1d, x; \ > + lookup_32bit(t0, t1, t2, t3, RGI1, RGS2, dummy, none); \ > + vpinsrd $1, RGS2d, x, x; \ > \ > - lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \ > - shrq $16, RGI2; \ > - lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \ > - shlq $32, RGS3; \ > - orq RGS1, RGS3; \ > - \ > - vmovq RGS2, x; \ > - vpinsrq $1, RGS3, x, x; > + lookup_32bit(t0, t1, t2, t3, RGI2, RGS1, shr_next, RGI2); \ > + vpinsrd $2, RGS1d, x, x; \ > + lookup_32bit(t0, t1, t2, t3, RGI2, RGS3, dummy, none); \ > + vpinsrd $3, RGS3d, x, x; > + > +#define encround_g1g2(a, b, c, d, x, y) \ > + G(a, x, s0, s1, s2, s3); \ > + G(b, y, s1, s2, s3, s0); > > -#define encround(a, b, c, d, x, y) \ > - G(a, x, s0, s1, s2, s3); \ > - G(b, y, s1, s2, s3, s0); \ > +#define encround_end(a, b, c, d, x, y) \ > + vpslld $1, d, RT; \ > + vpsrld $(32 - 1), d, d; \ > + vpor d, RT, d; \ > vpaddd x, y, x; \ > vpaddd y, x, y; \ > vpaddd x, RK1, x; \ > @@ -115,14 +127,16 @@ > vpsrld $1, c, x; \ > vpslld $(32 - 1), c, c; \ > vpor c, x, c; \ > - vpslld $1, d, x; \ > - vpsrld $(32 - 1), d, d; \ > - vpor d, x, d; \ > vpxor d, y, d; > > -#define decround(a, b, c, d, x, y) \ > - G(a, x, s0, s1, s2, s3); \ > - G(b, y, s1, s2, s3, s0); \ > +#define decround_g1g2(a, b, c, d, x, y) \ > + G(a, x, s0, s1, s2, s3); \ > + G(b, y, s1, s2, s3, s0); > + > +#define decround_end(a, b, c, d, x, y) \ > + vpslld $1, c, RT; \ > + vpsrld $(32 - 1), c, c; \ > + vpor c, RT, c; \ > vpaddd x, y, x; \ > vpaddd y, x, y; \ > vpaddd y, RK2, y; \ > @@ -130,23 +144,50 @@ > vpsrld $1, d, y; \ > vpslld $(32 - 1), d, d; \ > vpor d, y, d; \ > - vpslld $1, c, y; \ > - vpsrld $(32 - 1), c, c; \ > - vpor c, y, c; \ > vpaddd x, RK1, x; \ > vpxor x, c, c; > > +.align 4 > +encround_RARBRCRD: > + encround_g1g2(RA1, RB1, RC1, RD1, RX0, RY0); > + encround_g1g2(RA2, RB2, RC2, RD2, RX1, RY1); > + encround_end(RA1, RB1, RC1, RD1, RX0, RY0); > + encround_end(RA2, RB2, RC2, RD2, RX1, RY1); > + ret; > + > +.align 4 > +encround_RCRDRARB: > + encround_g1g2(RC1, RD1, RA1, RB1, 
RX0, RY0); > + encround_g1g2(RC2, RD2, RA2, RB2, RX1, RY1); > + encround_end(RC1, RD1, RA1, RB1, RX0, RY0); > + encround_end(RC2, RD2, RA2, RB2, RX1, RY1); > + ret; > + > #define encrypt_round(n, a, b, c, d) \ > vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ > vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ > - encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ > - encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY); > + call encround_ ## a ## b ## c ## d; > + > +.align 4 > +decround_RARBRCRD: > + decround_g1g2(RA1, RB1, RC1, RD1, RX0, RY0); > + decround_g1g2(RA2, RB2, RC2, RD2, RX1, RY1); > + decround_end(RA1, RB1, RC1, RD1, RX0, RY0); > + decround_end(RA2, RB2, RC2, RD2, RX1, RY1); > + ret; > + > +.align 4 > +decround_RCRDRARB: > + decround_g1g2(RC1, RD1, RA1, RB1, RX0, RY0); > + decround_g1g2(RC2, RD2, RA2, RB2, RX1, RY1); > + decround_end(RC1, RD1, RA1, RB1, RX0, RY0); > + decround_end(RC2, RD2, RA2, RB2, RX1, RY1); > + ret; > > #define decrypt_round(n, a, b, c, d) \ > vbroadcastss (k+4*(2*(n)))(CTX), RK1; \ > vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \ > - decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \ > - decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY); > + call decround_ ## a ## b ## c ## d; > > #define encrypt_cycle(n) \ > encrypt_round((2*n), RA, RB, RC, RD); \ > @@ -156,7 +197,6 @@ > decrypt_round(((2*n) + 1), RC, RD, RA, RB); \ > decrypt_round((2*n), RA, RB, RC, RD); > > - > #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ > vpunpckldq x1, x0, t0; \ > vpunpckhdq x1, x0, t2; \ > @@ -222,8 +262,8 @@ __twofish_enc_blk_8way: > vmovdqu w(CTX), RK1; > > leaq (4*4*4)(%rdx), %rax; > - inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); > - inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); > + inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); > + inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); > > xorq RID1, RID1; > xorq RID2, RID2; > @@ -247,14 +287,14 @@ __twofish_enc_blk_8way: > testb %cl, %cl; > jnz __enc_xor8; > > - 
outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); > - outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); > + outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); > + outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); > > ret; > > __enc_xor8: > - outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); > - outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); > + outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); > + outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); > > ret; > > @@ -274,8 +314,8 @@ twofish_dec_blk_8way: > vmovdqu (w+4*4)(CTX), RK1; > > leaq (4*4*4)(%rdx), %rax; > - inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2); > - inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2); > + inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); > + inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); > > xorq RID1, RID1; > xorq RID2, RID2; > @@ -294,7 +334,7 @@ twofish_dec_blk_8way: > popq %rbx; > > leaq (4*4*4)(%rsi), %rax; > - outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2); > - outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2); > + outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); > + outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); > > ret; > > -- > To unsubscribe from this list: send the line "unsubscribe linux-crypto" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > >