2012-07-30 11:36:24

by Jussi Kivilinna

Subject: [PATCH 1/3] crypto: twofish-avx - tune assembler code for ~10% more performance

This patch replaces 'movb' instructions with 'movzbl' to break false register
dependencies and interleaves instructions better for out-of-order scheduling.
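
For reference, a minimal standalone sketch of the partial-register issue (not
taken from the patch; 'lookup_demo' and 'demo_table' are made-up names): a
'movb' into %al writes only the low byte, so the indexed load would carry a
false dependency on whatever last wrote %rax, whereas 'movzbl' writes the full
register and breaks that dependency.

	.text
	.globl	lookup_demo
	/* u32 lookup_demo(u64 src): index a 256-entry u32 table with the
	   low byte of src, in the style of the lookup_32bit macro. */
lookup_demo:
	/* 'movb %dil, %al' would merge into the old %rax and create a
	   false dependency; 'movzbl' does a full-register write instead. */
	movzbl	%dil, %eax
	movl	demo_table(, %rax, 4), %eax
	ret

	.section .rodata
	.align	4
demo_table:
	.fill	256, 4, 0	/* dummy table for the sketch */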

It also moves the common round code to a separate function to reduce object size.

Tested on Core i5-2450M.

Cc: Johannes Goetzfried <[email protected]>
Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 144 +++++++++++++++++----------
1 file changed, 92 insertions(+), 52 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..42b27b7 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -47,15 +47,22 @@
#define RC2 %xmm6
#define RD2 %xmm7

-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9

-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
+
+#define RT %xmm14

#define RID1 %rax
+#define RID1d %eax
#define RID1b %al
#define RID2 %rbx
+#define RID2d %ebx
#define RID2b %bl

#define RGI1 %rdx
@@ -73,40 +80,45 @@
#define RGS3d %r10d


-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
- movb src ## bl, RID1b; \
- movb src ## bh, RID2b; \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+ movzbl src ## bl, RID1d; \
+ movzbl src ## bh, RID2d; \
+ shrq $16, src; \
movl t0(CTX, RID1, 4), dst ## d; \
xorl t1(CTX, RID2, 4), dst ## d; \
- shrq $16, src; \
- movb src ## bl, RID1b; \
- movb src ## bh, RID2b; \
+ movzbl src ## bl, RID1d; \
+ movzbl src ## bh, RID2d; \
+ interleave_op(il_reg); \
xorl t2(CTX, RID1, 4), dst ## d; \
xorl t3(CTX, RID2, 4), dst ## d;

+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
#define G(a, x, t0, t1, t2, t3) \
vmovq a, RGI1; \
- vpsrldq $8, a, x; \
- vmovq x, RGI2; \
+ vpextrq $1, a, RGI2; \
\
- lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
- shrq $16, RGI1; \
- lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
- shlq $32, RGS2; \
- orq RGS1, RGS2; \
+ lookup_32bit(t0, t1, t2, t3, RGI1, RGS1, shr_next, RGI1); \
+ vmovd RGS1d, x; \
+ lookup_32bit(t0, t1, t2, t3, RGI1, RGS2, dummy, none); \
+ vpinsrd $1, RGS2d, x, x; \
\
- lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
- shrq $16, RGI2; \
- lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
- shlq $32, RGS3; \
- orq RGS1, RGS3; \
- \
- vmovq RGS2, x; \
- vpinsrq $1, RGS3, x, x;
+ lookup_32bit(t0, t1, t2, t3, RGI2, RGS1, shr_next, RGI2); \
+ vpinsrd $2, RGS1d, x, x; \
+ lookup_32bit(t0, t1, t2, t3, RGI2, RGS3, dummy, none); \
+ vpinsrd $3, RGS3d, x, x;
+
+#define encround_g1g2(a, b, c, d, x, y) \
+ G(a, x, s0, s1, s2, s3); \
+ G(b, y, s1, s2, s3, s0);

-#define encround(a, b, c, d, x, y) \
- G(a, x, s0, s1, s2, s3); \
- G(b, y, s1, s2, s3, s0); \
+#define encround_end(a, b, c, d, x, y) \
+ vpslld $1, d, RT; \
+ vpsrld $(32 - 1), d, d; \
+ vpor d, RT, d; \
vpaddd x, y, x; \
vpaddd y, x, y; \
vpaddd x, RK1, x; \
@@ -115,14 +127,16 @@
vpsrld $1, c, x; \
vpslld $(32 - 1), c, c; \
vpor c, x, c; \
- vpslld $1, d, x; \
- vpsrld $(32 - 1), d, d; \
- vpor d, x, d; \
vpxor d, y, d;

-#define decround(a, b, c, d, x, y) \
- G(a, x, s0, s1, s2, s3); \
- G(b, y, s1, s2, s3, s0); \
+#define decround_g1g2(a, b, c, d, x, y) \
+ G(a, x, s0, s1, s2, s3); \
+ G(b, y, s1, s2, s3, s0);
+
+#define decround_end(a, b, c, d, x, y) \
+ vpslld $1, c, RT; \
+ vpsrld $(32 - 1), c, c; \
+ vpor c, RT, c; \
vpaddd x, y, x; \
vpaddd y, x, y; \
vpaddd y, RK2, y; \
@@ -130,23 +144,50 @@
vpsrld $1, d, y; \
vpslld $(32 - 1), d, d; \
vpor d, y, d; \
- vpslld $1, c, y; \
- vpsrld $(32 - 1), c, c; \
- vpor c, y, c; \
vpaddd x, RK1, x; \
vpxor x, c, c;

+.align 4
+encround_RARBRCRD:
+ encround_g1g2(RA1, RB1, RC1, RD1, RX0, RY0);
+ encround_g1g2(RA2, RB2, RC2, RD2, RX1, RY1);
+ encround_end(RA1, RB1, RC1, RD1, RX0, RY0);
+ encround_end(RA2, RB2, RC2, RD2, RX1, RY1);
+ ret;
+
+.align 4
+encround_RCRDRARB:
+ encround_g1g2(RC1, RD1, RA1, RB1, RX0, RY0);
+ encround_g1g2(RC2, RD2, RA2, RB2, RX1, RY1);
+ encround_end(RC1, RD1, RA1, RB1, RX0, RY0);
+ encround_end(RC2, RD2, RA2, RB2, RX1, RY1);
+ ret;
+
#define encrypt_round(n, a, b, c, d) \
vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
- encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
- encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
+ call encround_ ## a ## b ## c ## d;
+
+.align 4
+decround_RARBRCRD:
+ decround_g1g2(RA1, RB1, RC1, RD1, RX0, RY0);
+ decround_g1g2(RA2, RB2, RC2, RD2, RX1, RY1);
+ decround_end(RA1, RB1, RC1, RD1, RX0, RY0);
+ decround_end(RA2, RB2, RC2, RD2, RX1, RY1);
+ ret;
+
+.align 4
+decround_RCRDRARB:
+ decround_g1g2(RC1, RD1, RA1, RB1, RX0, RY0);
+ decround_g1g2(RC2, RD2, RA2, RB2, RX1, RY1);
+ decround_end(RC1, RD1, RA1, RB1, RX0, RY0);
+ decround_end(RC2, RD2, RA2, RB2, RX1, RY1);
+ ret;

#define decrypt_round(n, a, b, c, d) \
vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
- decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
- decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
+ call decround_ ## a ## b ## c ## d;

#define encrypt_cycle(n) \
encrypt_round((2*n), RA, RB, RC, RD); \
@@ -156,7 +197,6 @@
decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
decrypt_round((2*n), RA, RB, RC, RD);

-
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
vpunpckldq x1, x0, t0; \
vpunpckhdq x1, x0, t2; \
@@ -222,8 +262,8 @@ __twofish_enc_blk_8way:
vmovdqu w(CTX), RK1;

leaq (4*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
- inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
+ inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

xorq RID1, RID1;
xorq RID2, RID2;
@@ -247,14 +287,14 @@ __twofish_enc_blk_8way:
testb %cl, %cl;
jnz __enc_xor8;

- outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
- outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+ outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

ret;

__enc_xor8:
- outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
- outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+ outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

ret;

@@ -274,8 +314,8 @@ twofish_dec_blk_8way:
vmovdqu (w+4*4)(CTX), RK1;

leaq (4*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
- inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+ inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

xorq RID1, RID1;
xorq RID2, RID2;
@@ -294,7 +334,7 @@ twofish_dec_blk_8way:
popq %rbx;

leaq (4*4*4)(%rsi), %rax;
- outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
- outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
+ outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

ret;


2012-07-30 11:36:29

by Jussi Kivilinna

Subject: [PATCH 2/3] crypto: cast5-avx - tune assembler code for ~11% more performance

This patch replaces 'movb' instructions with 'movzbl' to break false register
dependencies, interleaves instructions better for out-of-order scheduling, and
merges the constant 16-bit rotation with the variable round-key rotation.
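
For reference, a minimal standalone sketch of the rotate merge (not taken from
the patch; 'rot_merge_demo' is a made-up name): rotating left by kr and then by
the constant 16 equals one rotate by (kr + 16) mod 32, and for a 5-bit kr that
is simply kr ^ 16, which matches the 'xorl $16' in load_round_key below.

	.text
	.globl	rot_merge_demo
	/* u32 rot_merge_demo(u32 x, u32 kr): apply rol(x, kr) and rol(x, 16)
	   as a single rotate. */
rot_merge_demo:
	movl	%edi, %eax		/* x */
	movl	%esi, %ecx		/* kr, 0..31 */
	xorl	$16, %ecx		/* merged amount: (kr + 16) mod 32 */
	roll	%cl, %eax		/* one rotate instead of two */
	ret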

Tested on Core i5-2450M.

Cc: Johannes Goetzfried <[email protected]>
Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 273 ++++++++++++++++-------------
1 file changed, 151 insertions(+), 122 deletions(-)

diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 94693c8..6d064d0 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -56,18 +56,20 @@

#define RX %xmm8

-#define RKM %xmm9
-#define RKRF %xmm10
-#define RKRR %xmm11
+#define RKM0 %xmm9
+#define RKRL0 %xmm10
+#define RKRR0 %xmm11

-#define RTMP %xmm12
-#define RMASK %xmm13
-#define R32 %xmm14
+#define RKM1 %xmm12
+#define RKRL1 %xmm13
+#define RKRR1 %xmm14
+
+#define RTMP %xmm15

#define RID1 %rax
-#define RID1b %al
+#define RID1d %eax
#define RID2 %rbx
-#define RID2b %bl
+#define RID2d %ebx

#define RGI1 %rdx
#define RGI1bl %dl
@@ -84,60 +86,76 @@
#define RFS3d %r10d


-#define lookup_32bit(src, dst, op1, op2, op3) \
- movb src ## bl, RID1b; \
- movb src ## bh, RID2b; \
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ shrq $16, src; \
movl s1(, RID1, 4), dst ## d; \
op1 s2(, RID2, 4), dst ## d; \
- shrq $16, src; \
- movb src ## bl, RID1b; \
- movb src ## bh, RID2b; \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ interleave_op(il_reg); \
op2 s3(, RID1, 4), dst ## d; \
op3 s4(, RID2, 4), dst ## d;

-#define F(a, x, op0, op1, op2, op3) \
- op0 a, RKM, x; \
- vpslld RKRF, x, RTMP; \
- vpsrld RKRR, x, x; \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define F(a, x, op0, op1, op2, op3, rkm, rkrl, rkrr) \
+ op0 a, rkm, x; \
+ vpslld rkrl, x, RTMP; \
+ vpsrld rkrr, x, x; \
vpor RTMP, x, x; \
\
- vpshufb RMASK, x, x; \
vmovq x, RGI1; \
- vpsrldq $8, x, x; \
- vmovq x, RGI2; \
- \
- lookup_32bit(RGI1, RFS1, op1, op2, op3); \
- shrq $16, RGI1; \
- lookup_32bit(RGI1, RFS2, op1, op2, op3); \
- shlq $32, RFS2; \
- orq RFS1, RFS2; \
+ vpextrq $1, x, RGI2; \
\
- lookup_32bit(RGI2, RFS1, op1, op2, op3); \
- shrq $16, RGI2; \
- lookup_32bit(RGI2, RFS3, op1, op2, op3); \
- shlq $32, RFS3; \
- orq RFS1, RFS3; \
+ lookup_32bit(RGI1, RFS1, op1, op2, op3, shr_next, RGI1); \
+ vmovd RFS1d, x; \
+ lookup_32bit(RGI1, RFS2, op1, op2, op3, dummy, none); \
+ vpinsrd $1, RFS2d, x, x; \
\
- vmovq RFS2, x; \
- vpinsrq $1, RFS3, x, x;
-
-#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
-#define F2(b, x) F(b, x, vpxor, subl, addl, xorl)
-#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
-
-#define subround(a, b, x, n, f) \
- F ## f(b, x); \
+ lookup_32bit(RGI2, RFS1, op1, op2, op3, shr_next, RGI2); \
+ vpinsrd $2, RFS1d, x, x; \
+ lookup_32bit(RGI2, RFS3, op1, op2, op3, dummy, none); \
+ vpinsrd $3, RFS3d, x, x;
+
+#define F1(b, x, rkm, rkrl, rkrr) \
+ F(b, x, vpaddd, xorl, subl, addl, rkm, rkrl, rkrr)
+#define F2(b, x, rkm, rkrl, rkrr) \
+ F(b, x, vpxor, subl, addl, xorl, rkm, rkrl, rkrr)
+#define F3(b, x, rkm, rkrl, rkrr) \
+ F(b, x, vpsubd, addl, xorl, subl, rkm, rkrl, rkrr)
+
+#define subround(a, b, x, f, rkm, rkrl, rkrr) \
+ F ## f(b, x, rkm, rkrl, rkrr); \
vpxor a, x, a;

-#define round(l, r, n, f) \
- vbroadcastss (km+(4*n))(CTX), RKM; \
- vpinsrb $0, (kr+n)(CTX), RKRF, RKRF; \
- vpsubq RKRF, R32, RKRR; \
- subround(l ## 1, r ## 1, RX, n, f); \
- subround(l ## 2, r ## 2, RX, n, f); \
- subround(l ## 3, r ## 3, RX, n, f); \
- subround(l ## 4, r ## 4, RX, n, f);
-
+#define load_round_key(n, rkm, rkrl, rkrr) \
+ vbroadcastss (km+(4*n))(CTX), rkm; \
+ movzbl (kr+n)(CTX), RID1d; \
+ movl $32, RID2d; \
+ /* merge (kr)-bit and 16-bit rotates */ \
+ xorl $16, RID1d; \
+ vmovd RID1d, rkrl; \
+ subl RID1d, RID2d; \
+ vmovd RID2d, rkrr;
+
+#define enc_load_keys(n) \
+ load_round_key((n + 0), RKM0, RKRL0, RKRR0); \
+ load_round_key((n + 1), RKM1, RKRL1, RKRR1);
+
+#define dec_load_keys(n) \
+ load_round_key((n - 0), RKM0, RKRL0, RKRR0); \
+ load_round_key((n - 1), RKM1, RKRL1, RKRR1);
+
+#define round(l, r, f, rkm, rkrl, rkrr) \
+ subround(l ## 1, r ## 1, RX, f, rkm, rkrl, rkrr); \
+ subround(l ## 2, r ## 2, RX, f, rkm, rkrl, rkrr); \
+ subround(l ## 3, r ## 3, RX, f, rkm, rkrl, rkrr); \
+ subround(l ## 4, r ## 4, RX, f, rkm, rkrl, rkrr);

#define transpose_2x4(x0, x1, t0, t1) \
vpunpckldq x1, x0, t0; \
@@ -146,27 +164,27 @@
vpunpcklqdq t1, t0, x0; \
vpunpckhqdq t1, t0, x1;

-#define inpack_blocks(in, x0, x1, t0, t1) \
+#define inpack_blocks(in, x0, x1, t0, t1, rmask) \
vmovdqu (0*4*4)(in), x0; \
vmovdqu (1*4*4)(in), x1; \
- vpshufb RMASK, x0, x0; \
- vpshufb RMASK, x1, x1; \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
\
transpose_2x4(x0, x1, t0, t1)

-#define outunpack_blocks(out, x0, x1, t0, t1) \
+#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \
transpose_2x4(x0, x1, t0, t1) \
\
- vpshufb RMASK, x0, x0; \
- vpshufb RMASK, x1, x1; \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
vmovdqu x0, (0*4*4)(out); \
vmovdqu x1, (1*4*4)(out);

-#define outunpack_xor_blocks(out, x0, x1, t0, t1) \
+#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
transpose_2x4(x0, x1, t0, t1) \
\
- vpshufb RMASK, x0, x0; \
- vpshufb RMASK, x1, x1; \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor (1*4*4)(out), x1, x1; \
@@ -175,8 +193,6 @@
.align 16
.Lbswap_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
-.L32_mask:
- .byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0

.align 16
.global __cast5_enc_blk_16way
@@ -193,68 +209,75 @@ __cast5_enc_blk_16way:
pushq %rbx;
pushq %rcx;

- vmovdqu .Lbswap_mask, RMASK;
- vmovdqu .L32_mask, R32;
- vpxor RKRF, RKRF, RKRF;
-
- inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
+ vmovdqa .Lbswap_mask, RKM0;
+ inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM0);
leaq (2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL2, RR2, RTMP, RX);
+ inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- inpack_blocks(%rax, RL3, RR3, RTMP, RX);
+ inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- inpack_blocks(%rax, RL4, RR4, RTMP, RX);
+ inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM0);

xorq RID1, RID1;
xorq RID2, RID2;

- round(RL, RR, 0, 1);
- round(RR, RL, 1, 2);
- round(RL, RR, 2, 3);
- round(RR, RL, 3, 1);
- round(RL, RR, 4, 2);
- round(RR, RL, 5, 3);
- round(RL, RR, 6, 1);
- round(RR, RL, 7, 2);
- round(RL, RR, 8, 3);
- round(RR, RL, 9, 1);
- round(RL, RR, 10, 2);
- round(RR, RL, 11, 3);
+ enc_load_keys(0);
+ round(RL, RR, 1, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 2, RKM1, RKRL1, RKRR1);
+ enc_load_keys(2);
+ round(RL, RR, 3, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 1, RKM1, RKRL1, RKRR1);
+ enc_load_keys(4);
+ round(RL, RR, 2, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 3, RKM1, RKRL1, RKRR1);
+ enc_load_keys(6);
+ round(RL, RR, 1, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 2, RKM1, RKRL1, RKRR1);
+ enc_load_keys(8);
+ round(RL, RR, 3, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 1, RKM1, RKRL1, RKRR1);
+ enc_load_keys(10);
+ round(RL, RR, 2, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 3, RKM1, RKRL1, RKRR1);

movb rr(CTX), %al;
testb %al, %al;
jnz __skip_enc;

- round(RL, RR, 12, 1);
- round(RR, RL, 13, 2);
- round(RL, RR, 14, 3);
- round(RR, RL, 15, 1);
+ enc_load_keys(12);
+ round(RL, RR, 1, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 2, RKM1, RKRL1, RKRR1);
+ enc_load_keys(14);
+ round(RL, RR, 3, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 1, RKM1, RKRL1, RKRR1);

__skip_enc:
popq %rcx;
popq %rbx;

+ vmovdqa .Lbswap_mask, RKM0;
+
testb %cl, %cl;
jnz __enc_xor16;

- outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
+ outunpack_blocks(%rsi, RR1, RL1, RTMP, RX, RKM0);
leaq (2*4*4)(%rsi), %rax;
- outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
+ outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
+ outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- outunpack_blocks(%rax, RR4, RL4, RTMP, RX);
+ outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM0);

ret;

__enc_xor16:
- outunpack_xor_blocks(%rsi, RR1, RL1, RTMP, RX);
+ outunpack_xor_blocks(%rsi, RR1, RL1, RTMP, RX, RKM0);
leaq (2*4*4)(%rsi), %rax;
- outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX);
+ outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX);
+ outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX);
+ outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM0);

ret;

@@ -271,17 +294,14 @@ cast5_dec_blk_16way:

pushq %rbx;

- vmovdqu .Lbswap_mask, RMASK;
- vmovdqu .L32_mask, R32;
- vpxor RKRF, RKRF, RKRF;
-
- inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
+ vmovdqa .Lbswap_mask, RKM0;
+ inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM0);
leaq (2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL2, RR2, RTMP, RX);
+ inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- inpack_blocks(%rax, RL3, RR3, RTMP, RX);
+ inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- inpack_blocks(%rax, RL4, RR4, RTMP, RX);
+ inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM0);

xorq RID1, RID1;
xorq RID2, RID2;
@@ -290,33 +310,42 @@ cast5_dec_blk_16way:
testb %al, %al;
jnz __skip_dec;

- round(RL, RR, 15, 1);
- round(RR, RL, 14, 3);
- round(RL, RR, 13, 2);
- round(RR, RL, 12, 1);
+ dec_load_keys(15);
+ round(RL, RR, 1, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 3, RKM1, RKRL1, RKRR1);
+ dec_load_keys(13);
+ round(RL, RR, 2, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 1, RKM1, RKRL1, RKRR1);

__skip_dec:
- round(RL, RR, 11, 3);
- round(RR, RL, 10, 2);
- round(RL, RR, 9, 1);
- round(RR, RL, 8, 3);
- round(RL, RR, 7, 2);
- round(RR, RL, 6, 1);
- round(RL, RR, 5, 3);
- round(RR, RL, 4, 2);
- round(RL, RR, 3, 1);
- round(RR, RL, 2, 3);
- round(RL, RR, 1, 2);
- round(RR, RL, 0, 1);
-
+ dec_load_keys(11);
+ round(RL, RR, 3, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 2, RKM1, RKRL1, RKRR1);
+ dec_load_keys(9);
+ round(RL, RR, 1, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 3, RKM1, RKRL1, RKRR1);
+ dec_load_keys(7);
+ round(RL, RR, 2, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 1, RKM1, RKRL1, RKRR1);
+ dec_load_keys(5);
+ round(RL, RR, 3, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 2, RKM1, RKRL1, RKRR1);
+ dec_load_keys(3);
+ round(RL, RR, 1, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 3, RKM1, RKRL1, RKRR1);
+ dec_load_keys(1);
+ round(RL, RR, 2, RKM0, RKRL0, RKRR0);
+ round(RR, RL, 1, RKM1, RKRL1, RKRR1);
+
+ vmovdqa .Lbswap_mask, RKM0;
popq %rbx;

- outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
+ outunpack_blocks(%rsi, RR1, RL1, RTMP, RX, RKM0);
leaq (2*4*4)(%rsi), %rax;
- outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
+ outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
+ outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM0);
leaq (2*4*4)(%rax), %rax;
- outunpack_blocks(%rax, RR4, RL4, RTMP, RX);
+ outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM0);

ret;

2012-07-30 11:36:34

by Jussi Kivilinna

Subject: [PATCH 3/3] crypto: cast6-avx - tune assembler code for ~11% more performance

This patch replaces 'movb' instructions with 'movzbl' to break false register
dependencies, interleaves instructions better for out-of-order scheduling, and
merges the constant 16-bit rotation with the variable round-key rotation.

It also moves the common round code to separate functions to reduce object size.
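
As a rough sketch of the code-sharing idea (not taken from the patch; the
'demo_*' labels are invented): the round body is emitted once and reached with
a short 'call', so each additional round costs a call/ret pair instead of
another full macro expansion.

	.text
	.align	4
demo_round_body:			/* body emitted once, shared by all rounds */
	/* placeholder for the real round computation on fixed registers */
	ret

	.globl	demo_two_rounds
demo_two_rounds:
	call	demo_round_body		/* round 1 */
	call	demo_round_body		/* round 2 */
	ret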

Tested on Core i5-2450M.

Cc: Johannes Goetzfried <[email protected]>
Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 235 +++++++++++++++--------------
1 file changed, 121 insertions(+), 114 deletions(-)

diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index d258ce0..3d65def 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -56,18 +56,20 @@

#define RX %xmm8

-#define RKM %xmm9
-#define RKRF %xmm10
-#define RKRR %xmm11
+#define RKM0 %xmm9
+#define RKRL0 %xmm10
+#define RKRR0 %xmm11

-#define RTMP %xmm12
-#define RMASK %xmm13
-#define R32 %xmm14
+#define RKM1 %xmm12
+#define RKRL1 %xmm13
+#define RKRR1 %xmm14
+
+#define RTMP %xmm15

#define RID1 %rax
-#define RID1b %al
+#define RID1d %eax
#define RID2 %rbx
-#define RID2b %bl
+#define RID2d %ebx

#define RGI1 %rdx
#define RGI1bl %dl
@@ -84,95 +86,106 @@
#define RFS3d %r10d


-#define lookup_32bit(src, dst, op1, op2, op3) \
- movb src ## bl, RID1b; \
- movb src ## bh, RID2b; \
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ shrq $16, src; \
movl s1(, RID1, 4), dst ## d; \
op1 s2(, RID2, 4), dst ## d; \
- shrq $16, src; \
- movb src ## bl, RID1b; \
- movb src ## bh, RID2b; \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ interleave_op(il_reg); \
op2 s3(, RID1, 4), dst ## d; \
op3 s4(, RID2, 4), dst ## d;

-#define F(a, x, op0, op1, op2, op3) \
- op0 a, RKM, x; \
- vpslld RKRF, x, RTMP; \
- vpsrld RKRR, x, x; \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define F(a, x, op0, op1, op2, op3, rkm, rkrl, rkrr) \
+ op0 a, rkm, x; \
+ vpslld rkrl, x, RTMP; \
+ vpsrld rkrr, x, x; \
vpor RTMP, x, x; \
\
- vpshufb RMASK, x, x; \
vmovq x, RGI1; \
- vpsrldq $8, x, x; \
- vmovq x, RGI2; \
+ vpextrq $1, x, RGI2; \
\
- lookup_32bit(RGI1, RFS1, op1, op2, op3); \
- shrq $16, RGI1; \
- lookup_32bit(RGI1, RFS2, op1, op2, op3); \
- shlq $32, RFS2; \
- orq RFS1, RFS2; \
+ lookup_32bit(RGI1, RFS1, op1, op2, op3, shr_next, RGI1); \
+ vmovd RFS1d, x; \
+ lookup_32bit(RGI1, RFS2, op1, op2, op3, dummy, none); \
+ vpinsrd $1, RFS2d, x, x; \
\
- lookup_32bit(RGI2, RFS1, op1, op2, op3); \
- shrq $16, RGI2; \
- lookup_32bit(RGI2, RFS3, op1, op2, op3); \
- shlq $32, RFS3; \
- orq RFS1, RFS3; \
- \
- vmovq RFS2, x; \
- vpinsrq $1, RFS3, x, x;
+ lookup_32bit(RGI2, RFS1, op1, op2, op3, shr_next, RGI2); \
+ vpinsrd $2, RFS1d, x, x; \
+ lookup_32bit(RGI2, RFS3, op1, op2, op3, dummy, none); \
+ vpinsrd $3, RFS3d, x, x;
+
+#define F1(b, x, rkm, rkrl, rkrr) \
+ F(b, x, vpaddd, xorl, subl, addl, rkm, rkrl, rkrr)
+#define F2(b, x, rkm, rkrl, rkrr) \
+ F(b, x, vpxor, subl, addl, xorl, rkm, rkrl, rkrr)
+#define F3(b, x, rkm, rkrl, rkrr) \
+ F(b, x, vpsubd, addl, xorl, subl, rkm, rkrl, rkrr)
+
+#define fn_qop(in, out, x, f, rkm, rkrl, rkrr) \
+ F ## f(in ## 1, x, rkm, rkrl, rkrr); \
+ vpxor out ## 1, x, out ## 1; \
+ F ## f(in ## 2, x, rkm, rkrl, rkrr); \
+ vpxor out ## 2, x, out ## 2;
+
+.align 4
+__qop_RD_RC_RX1_RKM0__qop_RC_RB_RX2_RKM1:
+ fn_qop(RD, RC, RX, 1, RKM0, RKRL0, RKRR0);
+ fn_qop(RC, RB, RX, 2, RKM1, RKRL1, RKRR1);
+ ret;

-#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
-#define F2(b, x) F(b, x, vpxor, subl, addl, xorl)
-#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
+.align 4
+__qop_RB_RA_RX_3_RKM0__qop_RA_RD_RX1_RKM1:
+ fn_qop(RB, RA, RX, 3, RKM0, RKRL0, RKRR0);
+ fn_qop(RA, RD, RX, 1, RKM1, RKRL1, RKRR1);
+ ret;

-#define qop(in, out, x, f) \
- F ## f(in ## 1, x); \
- vpxor out ## 1, x, out ## 1; \
- F ## f(in ## 2, x); \
- vpxor out ## 2, x, out ## 2; \
+.align 4
+__qop_RA_RD_RX1_RKM1__qop_RB_RA_RX3_RKM0:
+ fn_qop(RA, RD, RX, 1, RKM1, RKRL1, RKRR1);
+ fn_qop(RB, RA, RX, 3, RKM0, RKRL0, RKRR0);
+ ret;
+
+.align 4
+__qop_RC_RB_RX2_RKM1__qop_RD_RC_RX1_RKM0:
+ fn_qop(RC, RB, RX, 2, RKM1, RKRL1, RKRR1);
+ fn_qop(RD, RC, RX, 1, RKM0, RKRL0, RKRR0);
+ ret;
+
+#define load_round_key(x, rkm, rkrl, rkrr) \
+ movzbl (kr+(x))(CTX), RID1d; \
+ movl $32, RID2d; \
+ /* merge (kr)-bit and 16-bit rotates */ \
+ xorl $16, RID1d; \
+ vbroadcastss (km+(4*(x)))(CTX), rkm; \
+ vmovd RID1d, rkrl; \
+ subl RID1d, RID2d; \
+ vmovd RID2d, rkrr;

#define Q(n) \
- vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
- vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
- vpsubq RKRF, R32, RKRR; \
- qop(RD, RC, RX, 1); \
- \
- vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
- vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
- vpsubq RKRF, R32, RKRR; \
- qop(RC, RB, RX, 2); \
+ load_round_key((4*n+0), RKM0, RKRL0, RKRR0); \
+ load_round_key((4*n+1), RKM1, RKRL1, RKRR1); \
+ call __qop_RD_RC_RX1_RKM0__qop_RC_RB_RX2_RKM1; \
\
- vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
- vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
- vpsubq RKRF, R32, RKRR; \
- qop(RB, RA, RX, 3); \
- \
- vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
- vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
- vpsubq RKRF, R32, RKRR; \
- qop(RA, RD, RX, 1);
+ load_round_key((4*n+2), RKM0, RKRL0, RKRR0); \
+ load_round_key((4*n+3), RKM1, RKRL1, RKRR1); \
+ call __qop_RB_RA_RX_3_RKM0__qop_RA_RD_RX1_RKM1;

#define QBAR(n) \
- vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
- vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
- vpsubq RKRF, R32, RKRR; \
- qop(RA, RD, RX, 1); \
- \
- vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
- vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
- vpsubq RKRF, R32, RKRR; \
- qop(RB, RA, RX, 3); \
+ load_round_key((4*n+3), RKM1, RKRL1, RKRR1); \
+ load_round_key((4*n+2), RKM0, RKRL0, RKRR0); \
+ call __qop_RA_RD_RX1_RKM1__qop_RB_RA_RX3_RKM0; \
\
- vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
- vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
- vpsubq RKRF, R32, RKRR; \
- qop(RC, RB, RX, 2); \
- \
- vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
- vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
- vpsubq RKRF, R32, RKRR; \
- qop(RD, RC, RX, 1);
-
+ load_round_key((4*n+1), RKM1, RKRL1, RKRR1); \
+ load_round_key((4*n+0), RKM0, RKRL0, RKRR0); \
+ call __qop_RC_RB_RX2_RKM1__qop_RD_RC_RX1_RKM0;

#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
vpunpckldq x1, x0, t0; \
@@ -185,37 +198,37 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;

-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
vmovdqu (0*4*4)(in), x0; \
vmovdqu (1*4*4)(in), x1; \
vmovdqu (2*4*4)(in), x2; \
vmovdqu (3*4*4)(in), x3; \
- vpshufb RMASK, x0, x0; \
- vpshufb RMASK, x1, x1; \
- vpshufb RMASK, x2, x2; \
- vpshufb RMASK, x3, x3; \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ vpshufb rmask, x2, x2; \
+ vpshufb rmask, x3, x3; \
\
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
- vpshufb RMASK, x0, x0; \
- vpshufb RMASK, x1, x1; \
- vpshufb RMASK, x2, x2; \
- vpshufb RMASK, x3, x3; \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ vpshufb rmask, x2, x2; \
+ vpshufb rmask, x3, x3; \
vmovdqu x0, (0*4*4)(out); \
vmovdqu x1, (1*4*4)(out); \
vmovdqu x2, (2*4*4)(out); \
vmovdqu x3, (3*4*4)(out);

-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
- vpshufb RMASK, x0, x0; \
- vpshufb RMASK, x1, x1; \
- vpshufb RMASK, x2, x2; \
- vpshufb RMASK, x3, x3; \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ vpshufb rmask, x2, x2; \
+ vpshufb rmask, x3, x3; \
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor (1*4*4)(out), x1, x1; \
@@ -228,8 +241,6 @@
.align 16
.Lbswap_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
-.L32_mask:
- .byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0

.align 16
.global __cast6_enc_blk_8way
@@ -246,13 +257,10 @@ __cast6_enc_blk_8way:
pushq %rbx;
pushq %rcx;

- vmovdqu .Lbswap_mask, RMASK;
- vmovdqu .L32_mask, R32;
- vpxor RKRF, RKRF, RKRF;
-
+ vmovdqa .Lbswap_mask, RKM1;
leaq (4*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
- inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+ inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM0, RKM1);
+ inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM0, RKM1);

xorq RID1, RID1;
xorq RID2, RID2;
@@ -273,19 +281,20 @@ __cast6_enc_blk_8way:
popq %rcx;
popq %rbx;

+ vmovdqa .Lbswap_mask, RKM1;
leaq (4*4*4)(%rsi), %rax;

testb %cl, %cl;
jnz __enc_xor8;

- outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
- outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+ outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM0, RKM1);
+ outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM0, RKM1);

ret;

__enc_xor8:
- outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
- outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+ outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM0, RKM1);
+ outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM0, RKM1);

ret;

@@ -302,13 +311,10 @@ cast6_dec_blk_8way:

pushq %rbx;

- vmovdqu .Lbswap_mask, RMASK;
- vmovdqu .L32_mask, R32;
- vpxor RKRF, RKRF, RKRF;
-
+ vmovdqa .Lbswap_mask, RKM1;
leaq (4*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
- inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+ inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM0, RKM1);
+ inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM0, RKM1);

xorq RID1, RID1;
xorq RID2, RID2;
@@ -328,8 +334,9 @@ cast6_dec_blk_8way:

popq %rbx;

+ vmovdqa .Lbswap_mask, RKM1;
leaq (4*4*4)(%rsi), %rax;
- outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
- outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+ outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM0, RKM1);
+ outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM0, RKM1);

ret;

2012-08-16 14:30:52

by Jussi Kivilinna

Subject: Re: [PATCH 1/3] crypto: twofish-avx - tune assembler code for ~10% more performance

Please ignore this patchset, as it causes a performance regression on
Bulldozer. I'll make a new patchset with this issue fixed.

-Jussi

Quoting Jussi Kivilinna <[email protected]>:

> This patch replaces 'movb' instructions with 'movzbl' to break false register
> dependencies and interleaves instructions better for out-of-order scheduling.
>
> It also moves the common round code to a separate function to reduce object size.
>
> Tested on Core i5-2450M.
>
> Cc: Johannes Goetzfried <[email protected]>
> Signed-off-by: Jussi Kivilinna <[email protected]>

2012-08-16 14:41:01

by Herbert Xu

Subject: Re: [PATCH 1/3] crypto: twofish-avx - tune assembler code for ~10% more performance

On Thu, Aug 16, 2012 at 05:30:49PM +0300, Jussi Kivilinna wrote:
> Please ignore this patchset, as it causes a performance regression on
> Bulldozer. I'll make a new patchset with this issue fixed.

OK.
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt