2011-09-23 16:50:57

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 1/2] crypto: blowfish-x86_64: improve x86_64 blowfish 4-way performance

This patch adds an improved F-macro for the 4-way parallel functions. With
the new F-macro, blowfish sees a ~15% improvement in speed tests on
AMD Phenom II (~5% on Intel Xeon E7330).

However, when used in the 1-way blowfish function, the new macro would be
~10% slower than the original, so the old F-macro is kept for the 1-way
functions. The patch cleans up the old F-macro (it no longer takes register
arguments), as that flexibility is no longer needed by the 4-way part.

The patch also renames the register macros to reduce stack usage.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/blowfish-x86_64-asm_64.S | 198 +++++++++++++++---------------
1 files changed, 98 insertions(+), 100 deletions(-)

diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 44eb23a..391d245 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -56,38 +56,32 @@

#define RT0 %rbp
#define RT1 %rsi
+#define RT2 %r8
+#define RT3 %r9

#define RT0d %ebp
#define RT1d %esi
+#define RT2d %r8d
+#define RT3d %r9d

-#define RK0 %r8
-#define RK1 %r9
-#define RK2 %r10
-#define RK3 %r11
-
-#define RK0d %r8d
-#define RK1d %r9d
-#define RK2d %r10d
-#define RK3d %r11d
-
-#define RKEY %r12
+#define RKEY %r10

/***********************************************************************
* 1-way blowfish
***********************************************************************/
-#define F(x, k) \
- rorq $16, x; \
- movzbl x ## bh, RT0d; \
- movzbl x ## bl, RT1d; \
- rolq $16, x; \
- movl s0(CTX,RT0,4), k ## d; \
- addl s1(CTX,RT1,4), k ## d; \
- movzbl x ## bh, RT0d; \
- movzbl x ## bl, RT1d; \
- rolq $32, x; \
- xorl s2(CTX,RT0,4), k ## d; \
- addl s3(CTX,RT1,4), k ## d; \
- xorq k, x;
+#define F() \
+ rorq $16, RX0; \
+ movzbl RX0bh, RT0d; \
+ movzbl RX0bl, RT1d; \
+ rolq $16, RX0; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT1,4), RT0d; \
+ movzbl RX0bh, RT1d; \
+ movzbl RX0bl, RT2d; \
+ rolq $32, RX0; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT2,4), RT0d; \
+ xorq RT0, RX0;

#define add_roundkey_enc(n) \
xorq p+4*(n)(CTX), RX0;
@@ -95,11 +89,8 @@
#define round_enc(n) \
add_roundkey_enc(n); \
\
- F(RX0, RK0); \
- F(RX0, RK0);
-
-#define round_final_enc(n) \
- xorq p+4*(n)(CTX), RX0;
+ F(); \
+ F();

#define add_roundkey_dec(n) \
movq p+4*(n-1)(CTX), RT0; \
@@ -109,8 +100,8 @@
#define round_dec(n) \
add_roundkey_dec(n); \
\
- F(RX0, RK0); \
- F(RX0, RK0); \
+ F(); \
+ F(); \

#define read_block() \
movq (RIO), RX0; \
@@ -130,16 +121,15 @@
.type __blowfish_enc_blk,@function;

__blowfish_enc_blk:
- // input:
- // %rdi: ctx, CTX
- // %rsi: dst
- // %rdx: src
- // %rcx: bool xor
- pushq %rbp;
- pushq %rbx;
-
- pushq %rsi;
- pushq %rcx;
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+ movq %rbp, %r11;
+
+ movq %rsi, %r10;
movq %rdx, RIO;

read_block();
@@ -154,38 +144,31 @@ __blowfish_enc_blk:
round_enc(14);
add_roundkey_enc(16);

- popq %rbp;
- popq RIO;
+ movq %r11, %rbp;

- test %bpl, %bpl;
+ movq %r10, RIO;
+ test %cl, %cl;
jnz __enc_xor;

write_block();
-
-__enc_ret:
- popq %rbx;
- popq %rbp;
-
ret;
-
__enc_xor:
xor_block();
-
- jmp __enc_ret;
+ ret;

.align 8
.global blowfish_dec_blk
.type blowfish_dec_blk,@function;

blowfish_dec_blk:
- // input:
- // %rdi: ctx, CTX
- // %rsi: dst
- // %rdx: src
- pushq %rbp;
- pushq %rbx;
-
- pushq %rsi;
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ movq %rbp, %r11;
+
+ movq %rsi, %r10;
movq %rdx, RIO;

read_block();
@@ -200,17 +183,33 @@ blowfish_dec_blk:
round_dec(3);
add_roundkey_dec(1);

- popq RIO;
+ movq %r10, RIO;
write_block();

- popq %rbx;
- popq %rbp;
+ movq %r11, %rbp;

ret;

/**********************************************************************
4-way blowfish, four blocks parallel
**********************************************************************/
+
+/* F() for 4-way. Slower when used alone/1-way, but faster when used
+ * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
+ */
+#define F4(x) \
+ movzbl x ## bh, RT1d; \
+ movzbl x ## bl, RT3d; \
+ rorq $16, x; \
+ movzbl x ## bh, RT0d; \
+ movzbl x ## bl, RT2d; \
+ rorq $16, x; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT2,4), RT0d; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT3,4), RT0d; \
+ xorq RT0, x;
+
#define add_preloaded_roundkey4() \
xorq RKEY, RX0; \
xorq RKEY, RX1; \
@@ -227,15 +226,15 @@ blowfish_dec_blk:
#define round_enc4(n) \
add_roundkey_enc4(n); \
\
- F(RX0, RK0); \
- F(RX1, RK1); \
- F(RX2, RK2); \
- F(RX3, RK3); \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
\
- F(RX0, RK0); \
- F(RX1, RK1); \
- F(RX2, RK2); \
- F(RX3, RK3);
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);

#define preload_roundkey_dec(n) \
movq p+4*((n)-1)(CTX), RKEY; \
@@ -248,15 +247,15 @@ blowfish_dec_blk:
#define round_dec4(n) \
add_roundkey_dec4(n); \
\
- F(RX0, RK0); \
- F(RX1, RK1); \
- F(RX2, RK2); \
- F(RX3, RK3); \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
\
- F(RX0, RK0); \
- F(RX1, RK1); \
- F(RX2, RK2); \
- F(RX3, RK3);
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);

#define read_block4() \
movq (RIO), RX0; \
@@ -306,18 +305,19 @@ blowfish_dec_blk:
.type __blowfish_enc_blk_4way,@function;

__blowfish_enc_blk_4way:
- // input:
- // %rdi: ctx, CTX
- // %rsi: dst
- // %rdx: src
- // %rcx: bool xor
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
pushq %rbp;
pushq %rbx;
- pushq RKEY;
+ pushq %rcx;
+
preload_roundkey_enc(0);

- pushq %rsi;
- pushq %rcx;
+ movq %rsi, %r11;
movq %rdx, RIO;

read_block4();
@@ -333,40 +333,39 @@ __blowfish_enc_blk_4way:
add_preloaded_roundkey4();

popq %rbp;
- popq RIO;
+ movq %r11, RIO;

test %bpl, %bpl;
jnz __enc_xor4;

write_block4();

-__enc_ret4:
- popq RKEY;
popq %rbx;
popq %rbp;
-
ret;

__enc_xor4:
xor_block4();

- jmp __enc_ret4;
+ popq %rbx;
+ popq %rbp;
+ ret;

.align 8
.global blowfish_dec_blk_4way
.type blowfish_dec_blk_4way,@function;

blowfish_dec_blk_4way:
- // input:
- // %rdi: ctx, CTX
- // %rsi: dst
- // %rdx: src
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
pushq %rbp;
pushq %rbx;
- pushq RKEY;
preload_roundkey_dec(17);

- pushq %rsi;
+ movq %rsi, %r11;
movq %rdx, RIO;

read_block4();
@@ -381,10 +380,9 @@ blowfish_dec_blk_4way:
round_dec4(3);
add_preloaded_roundkey4();

- popq RIO;
+ movq %r11, RIO;
write_block4();

- popq RKEY;
popq %rbx;
popq %rbp;



2011-09-23 16:51:03

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 2/2] crypto: blowfish-x86_64: add credits

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/blowfish_glue.c | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 40911ab..2568a7b 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -3,6 +3,11 @@
*
* Copyright (c) 2011 Jussi Kivilinna <[email protected]>
*
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <[email protected]>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <[email protected]>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or

2011-10-21 12:37:37

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH 2/2] crypto: blowfish-x86_64: add credits

On Fri, Sep 23, 2011 at 07:51:00PM +0300, Jussi Kivilinna wrote:
> Signed-off-by: Jussi Kivilinna <[email protected]>

Both patches applied. Thanks!
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt