2012-10-20 12:06:34

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 0/5] Avoid stack buffers in glue code of AVX implementations

Patches modify AVX implementations so that they avoid using extra
stack memory. Stack is not needed since implementation store
blocks in registers.

---

Jussi Kivilinna (5):
crypto: x86/glue_helper - use le128 instead of u128 for CTR mode
crypto: cast6/avx - avoid using temporary stack buffers
crypto: twofish/avx - avoid using temporary stack buffers
crypto: serpent/avx - avoid using temporary stack buffers
crypto: cast5/avx - avoid using temporary stack buffers


arch/x86/crypto/camellia_glue.c | 16 +
arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 332 +++++++++++++++++++++------
arch/x86/crypto/cast5_avx_glue.c | 79 ++----
arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 190 ++++++++++-----
arch/x86/crypto/cast6_avx_glue.c | 77 +-----
arch/x86/crypto/glue_helper-asm-avx.S | 91 +++++++
arch/x86/crypto/glue_helper.c | 12 -
arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 166 +++++++++-----
arch/x86/crypto/serpent_avx_glue.c | 49 +---
arch/x86/crypto/serpent_sse2_glue.c | 12 -
arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 208 +++++++++++------
arch/x86/crypto/twofish_avx_glue.c | 73 +-----
arch/x86/crypto/twofish_glue_3way.c | 20 +-
arch/x86/include/asm/crypto/glue_helper.h | 28 +-
arch/x86/include/asm/crypto/serpent-avx.h | 27 +-
arch/x86/include/asm/crypto/twofish.h | 4
16 files changed, 835 insertions(+), 549 deletions(-)
create mode 100644 arch/x86/crypto/glue_helper-asm-avx.S


2012-10-20 12:06:40

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 1/5] crypto: x86/glue_helper - use le128 instead of u128 for CTR mode

'u128' currently used for CTR mode is on little-endian 'long long' swapped
and would require extra swap operations by SSE/AVX code. Use of le128
instead of u128 allows IV calculations to be done with vector registers
easier.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/camellia_glue.c | 16 ++++++++--------
arch/x86/crypto/cast6_avx_glue.c | 12 ++++++------
arch/x86/crypto/glue_helper.c | 12 ++++++------
arch/x86/crypto/serpent_avx_glue.c | 12 ++++++------
arch/x86/crypto/serpent_sse2_glue.c | 12 ++++++------
arch/x86/crypto/twofish_avx_glue.c | 6 +++---
arch/x86/crypto/twofish_glue_3way.c | 20 ++++++++++----------
arch/x86/include/asm/crypto/glue_helper.h | 28 +++++++++++++++++-----------
arch/x86/include/asm/crypto/twofish.h | 4 ++--
9 files changed, 64 insertions(+), 58 deletions(-)

diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 42ffd2b..021a008 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1317,21 +1317,21 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
u128_xor(&dst[1], &dst[1], &iv);
}

-static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;

if (dst != src)
*dst = *src;

- u128_to_be128(&ctrblk, iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);

camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
}

static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
- u128 *iv)
+ le128 *iv)
{
be128 ctrblks[2];

@@ -1340,10 +1340,10 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
dst[1] = src[1];
}

- u128_to_be128(&ctrblks[0], iv);
- u128_inc(iv);
- u128_to_be128(&ctrblks[1], iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblks[0], iv);
+ le128_inc(iv);
+ le128_to_be128(&ctrblks[1], iv);
+ le128_inc(iv);

camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
}
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 15e5f85..1dfd33b 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -78,19 +78,19 @@ static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}

-static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;

- u128_to_be128(&ctrblk, iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);

__cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk);
}

static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
- u128 *iv)
+ le128 *iv)
{
be128 ctrblks[CAST6_PARALLEL_BLOCKS];
unsigned int i;
@@ -99,8 +99,8 @@ static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
if (dst != src)
dst[i] = src[i];

- u128_to_be128(&ctrblks[i], iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblks[i], iv);
+ le128_inc(iv);
}

cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 30b3927..22ce4f6 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
u8 *src = (u8 *)walk->src.virt.addr;
u8 *dst = (u8 *)walk->dst.virt.addr;
unsigned int nbytes = walk->nbytes;
- u128 ctrblk;
+ le128 ctrblk;
u128 tmp;

- be128_to_u128(&ctrblk, (be128 *)walk->iv);
+ be128_to_le128(&ctrblk, (be128 *)walk->iv);

memcpy(&tmp, src, nbytes);
fn_ctr(ctx, &tmp, &tmp, &ctrblk);
memcpy(dst, &tmp, nbytes);

- u128_to_be128((be128 *)walk->iv, &ctrblk);
+ le128_to_be128((be128 *)walk->iv, &ctrblk);
}
EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);

@@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
unsigned int nbytes = walk->nbytes;
u128 *src = (u128 *)walk->src.virt.addr;
u128 *dst = (u128 *)walk->dst.virt.addr;
- u128 ctrblk;
+ le128 ctrblk;
unsigned int num_blocks, func_bytes;
unsigned int i;

- be128_to_u128(&ctrblk, (be128 *)walk->iv);
+ be128_to_le128(&ctrblk, (be128 *)walk->iv);

/* Process multi-block batch */
for (i = 0; i < gctx->num_funcs; i++) {
@@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
}

done:
- u128_to_be128((be128 *)walk->iv, &ctrblk);
+ le128_to_be128((be128 *)walk->iv, &ctrblk);
return nbytes;
}

diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 3f543a0..2aa31ad 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -56,19 +56,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}

-static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;

- u128_to_be128(&ctrblk, iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);

__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk);
}

static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
- u128 *iv)
+ le128 *iv)
{
be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
unsigned int i;
@@ -77,8 +77,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
if (dst != src)
dst[i] = src[i];

- u128_to_be128(&ctrblks[i], iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblks[i], iv);
+ le128_inc(iv);
}

serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 9107a99..97a356e 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}

-static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;

- u128_to_be128(&ctrblk, iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);

__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk);
}

static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
- u128 *iv)
+ le128 *iv)
{
be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
unsigned int i;
@@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
if (dst != src)
dst[i] = src[i];

- u128_to_be128(&ctrblks[i], iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblks[i], iv);
+ le128_inc(iv);
}

serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index e7708b5..810e45d 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -90,7 +90,7 @@ static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
}

static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
- u128 *iv)
+ le128 *iv)
{
be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
unsigned int i;
@@ -99,8 +99,8 @@ static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
if (dst != src)
dst[i] = src[i];

- u128_to_be128(&ctrblks[i], iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblks[i], iv);
+ le128_inc(iv);
}

twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index aa3eb35..13e63b3 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
}
EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);

-void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;

if (dst != src)
*dst = *src;

- u128_to_be128(&ctrblk, iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);

twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, dst, (u128 *)&ctrblk);
@@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);

void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
- u128 *iv)
+ le128 *iv)
{
be128 ctrblks[3];

@@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
dst[2] = src[2];
}

- u128_to_be128(&ctrblks[0], iv);
- u128_inc(iv);
- u128_to_be128(&ctrblks[1], iv);
- u128_inc(iv);
- u128_to_be128(&ctrblks[2], iv);
- u128_inc(iv);
+ le128_to_be128(&ctrblks[0], iv);
+ le128_inc(iv);
+ le128_to_be128(&ctrblks[1], iv);
+ le128_inc(iv);
+ le128_to_be128(&ctrblks[2], iv);
+ le128_inc(iv);

twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
}
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 3e408bd..e2d65b0 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -13,7 +13,7 @@
typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
- u128 *iv);
+ le128 *iv);

#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
@@ -71,23 +71,29 @@ static inline void glue_fpu_end(bool fpu_enabled)
kernel_fpu_end();
}

-static inline void u128_to_be128(be128 *dst, const u128 *src)
+static inline void le128_to_be128(be128 *dst, const le128 *src)
{
- dst->a = cpu_to_be64(src->a);
- dst->b = cpu_to_be64(src->b);
+ dst->a = cpu_to_be64(le64_to_cpu(src->a));
+ dst->b = cpu_to_be64(le64_to_cpu(src->b));
}

-static inline void be128_to_u128(u128 *dst, const be128 *src)
+static inline void be128_to_le128(le128 *dst, const be128 *src)
{
- dst->a = be64_to_cpu(src->a);
- dst->b = be64_to_cpu(src->b);
+ dst->a = cpu_to_le64(be64_to_cpu(src->a));
+ dst->b = cpu_to_le64(be64_to_cpu(src->b));
}

-static inline void u128_inc(u128 *i)
+static inline void le128_inc(le128 *i)
{
- i->b++;
- if (!i->b)
- i->a++;
+ u64 a = le64_to_cpu(i->a);
+ u64 b = le64_to_cpu(i->b);
+
+ b++;
+ if (!b)
+ a++;
+
+ i->a = cpu_to_le64(a);
+ i->b = cpu_to_le64(b);
}

extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
index 9d2c514..878c51c 100644
--- a/arch/x86/include/asm/crypto/twofish.h
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -31,9 +31,9 @@ asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
/* helpers from twofish_x86_64-3way module */
extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
- u128 *iv);
+ le128 *iv);
extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
- u128 *iv);
+ le128 *iv);

extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen);

2012-10-20 12:06:44

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 2/5] crypto: cast6/avx - avoid using temporary stack buffers

Introduce new assembler functions to avoid use temporary stack buffers in
glue code. This also allows use of vector instructions for xoring output
in CTR and CBC modes and construction of IVs for CTR mode.

ECB mode sees ~0.5% decrease in speed because added one extra function
call. CBC mode decryption and CTR mode benefit from vector operations
and gain ~2%.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 190 +++++++++++++++++++----------
arch/x86/crypto/cast6_avx_glue.c | 71 ++---------
arch/x86/crypto/glue_helper-asm-avx.S | 91 ++++++++++++++
3 files changed, 227 insertions(+), 125 deletions(-)
create mode 100644 arch/x86/crypto/glue_helper-asm-avx.S

diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 218d283..83a5381 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,6 +23,8 @@
*
*/

+#include "glue_helper-asm-avx.S"
+
.file "cast6-avx-x86_64-asm_64.S"

.extern cast6_s1
@@ -205,11 +207,7 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;

-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
- vmovdqu (0*4*4)(in), x0; \
- vmovdqu (1*4*4)(in), x1; \
- vmovdqu (2*4*4)(in), x2; \
- vmovdqu (3*4*4)(in), x3; \
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \
@@ -217,39 +215,21 @@
\
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \
- vpshufb rmask, x3, x3; \
- vmovdqu x0, (0*4*4)(out); \
- vmovdqu x1, (1*4*4)(out); \
- vmovdqu x2, (2*4*4)(out); \
- vmovdqu x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
- transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
- \
- vpshufb rmask, x0, x0; \
- vpshufb rmask, x1, x1; \
- vpshufb rmask, x2, x2; \
- vpshufb rmask, x3, x3; \
- vpxor (0*4*4)(out), x0, x0; \
- vmovdqu x0, (0*4*4)(out); \
- vpxor (1*4*4)(out), x1, x1; \
- vmovdqu x1, (1*4*4)(out); \
- vpxor (2*4*4)(out), x2, x2; \
- vmovdqu x2, (2*4*4)(out); \
- vpxor (3*4*4)(out), x3, x3; \
- vmovdqu x3, (3*4*4)(out);
+ vpshufb rmask, x3, x3;

.data

.align 16
.Lbswap_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@

.text

-.align 16
-.global __cast6_enc_blk_8way
-.type __cast6_enc_blk_8way,@function;
+.align 8
+.type __cast6_enc_blk8,@function;

-__cast6_enc_blk_8way:
+__cast6_enc_blk8:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: bool, if true: xor output
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/

pushq %rbp;
pushq %rbx;
- pushq %rcx;

vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;

- leaq (4*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
- inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
- movq %rsi, %r11;
+ inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

preload_rkr(0, dummy, none);
Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
QBAR(10);
QBAR(11);

- popq %rcx;
popq %rbx;
popq %rbp;

vmovdqa .Lbswap_mask, RKM;
- leaq (4*4*4)(%r11), %rax;
-
- testb %cl, %cl;
- jnz __enc_xor8;
-
- outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
- outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
- ret;

-__enc_xor8:
- outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
- outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

ret;

-.align 16
-.global cast6_dec_blk_8way
-.type cast6_dec_blk_8way,@function;
+.align 8
+.type __cast6_dec_blk8,@function;

-cast6_dec_blk_8way:
+__cast6_dec_blk8:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
*/

pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;

- leaq (4*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
- inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
- movq %rsi, %r11;
+ inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
popq %rbp;

vmovdqa .Lbswap_mask, RKM;
- leaq (4*4*4)(%r11), %rax;
- outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
- outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ ret;
+
+.align 8
+.global cast6_ecb_enc_8way
+.type cast6_ecb_enc_8way,@function;
+
+cast6_ecb_enc_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_enc_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ ret;
+
+.align 8
+.global cast6_ecb_dec_8way
+.type cast6_ecb_dec_8way,@function;
+
+cast6_ecb_dec_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_dec_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ ret;
+
+.align 8
+.global cast6_cbc_dec_8way
+.type cast6_cbc_dec_8way,@function;
+
+cast6_cbc_dec_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_dec_blk8;
+
+ store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r12;
+
+ ret;
+
+.align 8
+.global cast6_ctr_8way
+.type cast6_ctr_8way,@function;
+
+cast6_ctr_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (little endian, 128bit)
+ */
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RX, RKR, RKM);
+
+ call __cast6_enc_blk8;
+
+ store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r12;

ret;
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 1dfd33b..92f7ca2 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -40,43 +40,15 @@

#define CAST6_PARALLEL_BLOCKS 8

-asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
+asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src);

-static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __cast6_enc_blk_8way(ctx, dst, src, false);
-}
-
-static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __cast6_enc_blk_8way(ctx, dst, src, true);
-}
-
-static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- cast6_dec_blk_8way(ctx, dst, src);
-}
-
-
-static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
- u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
- unsigned int j;
-
- for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
- ivs[j] = src[j];
-
- cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
- for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
- u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
+asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
+ le128 *iv);

static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
@@ -89,30 +61,13 @@ static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
u128_xor(dst, src, (u128 *)&ctrblk);
}

-static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
- le128 *iv)
-{
- be128 ctrblks[CAST6_PARALLEL_BLOCKS];
- unsigned int i;
-
- for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
- if (dst != src)
- dst[i] = src[i];
-
- le128_to_be128(&ctrblks[i], iv);
- le128_inc(iv);
- }
-
- cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-
static const struct common_glue_ctx cast6_enc = {
.num_funcs = 2,
.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) }
+ .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) }
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) }
+ .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) }
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
}, {
.num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
- cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+ cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
return;
}

@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
- cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+ cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
return;
}

diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
new file mode 100644
index 0000000..f7b6ea2
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -0,0 +1,91 @@
+/*
+ * Shared glue code for 128bit block ciphers, AVX assembler macros
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu (0*16)(src), x0; \
+ vmovdqu (1*16)(src), x1; \
+ vmovdqu (2*16)(src), x2; \
+ vmovdqu (3*16)(src), x3; \
+ vmovdqu (4*16)(src), x4; \
+ vmovdqu (5*16)(src), x5; \
+ vmovdqu (6*16)(src), x6; \
+ vmovdqu (7*16)(src), x7;
+
+#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu x0, (0*16)(dst); \
+ vmovdqu x1, (1*16)(dst); \
+ vmovdqu x2, (2*16)(dst); \
+ vmovdqu x3, (3*16)(dst); \
+ vmovdqu x4, (4*16)(dst); \
+ vmovdqu x5, (5*16)(dst); \
+ vmovdqu x6, (6*16)(dst); \
+ vmovdqu x7, (7*16)(dst);
+
+#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(src), x1, x1; \
+ vpxor (1*16)(src), x2, x2; \
+ vpxor (2*16)(src), x3, x3; \
+ vpxor (3*16)(src), x4, x4; \
+ vpxor (4*16)(src), x5, x5; \
+ vpxor (5*16)(src), x6, x6; \
+ vpxor (6*16)(src), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
+ vpcmpeqd t0, t0, t0; \
+ vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
+ vmovdqa bswap, t1; \
+ \
+ /* load IV and byteswap */ \
+ vmovdqu (iv), x7; \
+ vpshufb t1, x7, x0; \
+ \
+ /* construct IVs */ \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x1; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x2; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x3; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x4; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x5; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x6; \
+ inc_le128(x7, t0, t2); \
+ vmovdqa x7, t2; \
+ vpshufb t1, x7, x7; \
+ inc_le128(t2, t0, t1); \
+ vmovdqu t2, (iv);
+
+#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(src), x0, x0; \
+ vpxor (1*16)(src), x1, x1; \
+ vpxor (2*16)(src), x2, x2; \
+ vpxor (3*16)(src), x3, x3; \
+ vpxor (4*16)(src), x4, x4; \
+ vpxor (5*16)(src), x5, x5; \
+ vpxor (6*16)(src), x6, x6; \
+ vpxor (7*16)(src), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

2012-10-20 12:06:50

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 3/5] crypto: twofish/avx - avoid using temporary stack buffers

Introduce new assembler functions to avoid use temporary stack buffers in glue
code. This also allows use of vector instructions for xoring output in CTR and
CBC modes and construction of IVs for CTR mode.

ECB mode sees ~0.2% decrease in speed because added one extra function
call. CBC mode decryption and CTR mode benefit from vector operations
and gain ~3%.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 208 ++++++++++++++++++---------
arch/x86/crypto/twofish_avx_glue.c | 73 ++-------
2 files changed, 152 insertions(+), 129 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 1585abb..ebac16b 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -23,7 +23,16 @@
*
*/

+#include "glue_helper-asm-avx.S"
+
.file "twofish-avx-x86_64-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
.text

/* structure of crypto context */
@@ -217,69 +226,45 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;

-#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
- vpxor (0*4*4)(in), wkey, x0; \
- vpxor (1*4*4)(in), wkey, x1; \
- vpxor (2*4*4)(in), wkey, x2; \
- vpxor (3*4*4)(in), wkey, x3; \
+#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+ vpxor x0, wkey, x0; \
+ vpxor x1, wkey, x1; \
+ vpxor x2, wkey, x2; \
+ vpxor x3, wkey, x3; \
\
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

-#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
- transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
- \
- vpxor x0, wkey, x0; \
- vmovdqu x0, (0*4*4)(out); \
- vpxor x1, wkey, x1; \
- vmovdqu x1, (1*4*4)(out); \
- vpxor x2, wkey, x2; \
- vmovdqu x2, (2*4*4)(out); \
- vpxor x3, wkey, x3; \
- vmovdqu x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
+#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
- vpxor x0, wkey, x0; \
- vpxor (0*4*4)(out), x0, x0; \
- vmovdqu x0, (0*4*4)(out); \
- vpxor x1, wkey, x1; \
- vpxor (1*4*4)(out), x1, x1; \
- vmovdqu x1, (1*4*4)(out); \
- vpxor x2, wkey, x2; \
- vpxor (2*4*4)(out), x2, x2; \
- vmovdqu x2, (2*4*4)(out); \
- vpxor x3, wkey, x3; \
- vpxor (3*4*4)(out), x3, x3; \
- vmovdqu x3, (3*4*4)(out);
+ vpxor x0, wkey, x0; \
+ vpxor x1, wkey, x1; \
+ vpxor x2, wkey, x2; \
+ vpxor x3, wkey, x3;

.align 8
-.global __twofish_enc_blk_8way
-.type __twofish_enc_blk_8way,@function;
+.type __twofish_enc_blk8,@function;

-__twofish_enc_blk_8way:
+__twofish_enc_blk8:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: bool, if true: xor output
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
*/

+ vmovdqu w(CTX), RK1;
+
pushq %rbp;
pushq %rbx;
pushq %rcx;

- vmovdqu w(CTX), RK1;
-
- leaq (4*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
preload_rgi(RA1);
rotate_1l(RD1);
- inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+ inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
rotate_1l(RD2);

- movq %rsi, %r11;
-
encrypt_cycle(0);
encrypt_cycle(1);
encrypt_cycle(2);
@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
popq %rbx;
popq %rbp;

- leaq (4*4*4)(%r11), %rax;
-
- testb %cl, %cl;
- jnz __enc_xor8;
-
- outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
- outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
-
- ret;
-
-__enc_xor8:
- outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
- outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

ret;

.align 8
-.global twofish_dec_blk_8way
-.type twofish_dec_blk_8way,@function;
+.type __twofish_dec_blk8,@function;

-twofish_dec_blk_8way:
+__twofish_dec_blk8:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
+ * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
*/

+ vmovdqu (w+4*4)(CTX), RK1;
+
pushq %rbp;
pushq %rbx;

- vmovdqu (w+4*4)(CTX), RK1;
-
- leaq (4*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
preload_rgi(RC1);
rotate_1l(RA1);
- inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+ inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
rotate_1l(RA2);

- movq %rsi, %r11;
-
decrypt_cycle(7);
decrypt_cycle(6);
decrypt_cycle(5);
@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
popq %rbx;
popq %rbp;

- leaq (4*4*4)(%r11), %rax;
- outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
- outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+
+ ret;
+
+.align 8
+.global twofish_ecb_enc_8way
+.type twofish_ecb_enc_8way,@function;
+
+twofish_ecb_enc_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __twofish_enc_blk8;
+
+ store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ ret;
+
+.align 8
+.global twofish_ecb_dec_8way
+.type twofish_ecb_dec_8way,@function;
+
+twofish_ecb_dec_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ call __twofish_dec_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ ret;
+
+.align 8
+.global twofish_cbc_dec_8way
+.type twofish_cbc_dec_8way,@function;
+
+twofish_cbc_dec_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ call __twofish_dec_blk8;
+
+ store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r12;
+
+ ret;
+
+.align 8
+.global twofish_ctr_8way
+.type twofish_ctr_8way,@function;
+
+twofish_ctr_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (little endian, 128bit)
+ */
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RX0, RX1, RY0);
+
+ call __twofish_enc_blk8;
+
+ store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ popq %r12;

ret;
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 810e45d..94ac91d 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -45,66 +45,23 @@

#define TWOFISH_PARALLEL_BLOCKS 8

-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __twofish_enc_blk_3way(ctx, dst, src, false);
-}
-
/* 8-way parallel cipher functions */
-asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
+asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);

-static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __twofish_enc_blk_8way(ctx, dst, src, false);
-}
-
-static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __twofish_enc_blk_8way(ctx, dst, src, true);
-}
+asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);

-static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
- twofish_dec_blk_8way(ctx, dst, src);
-}
-
-static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
- u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
- unsigned int j;
-
- for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
- ivs[j] = src[j];
-
- twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
- for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
- u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+ __twofish_enc_blk_3way(ctx, dst, src, false);
}

-static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
- le128 *iv)
-{
- be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
- unsigned int i;
-
- for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
- if (dst != src)
- dst[i] = src[i];
-
- le128_to_be128(&ctrblks[i], iv);
- le128_inc(iv);
- }
-
- twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}

static const struct common_glue_ctx twofish_enc = {
.num_funcs = 3,
@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {

.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {

.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {

.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {

.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
- twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+ twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
return;
}

@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
- twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+ twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
return;
}


2012-10-20 12:06:56

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 4/5] crypto: serpent/avx - avoid using temporary stack buffers

Introduce new assembler functions to avoid use temporary stack buffers in glue
code. This also allows use of vector instructions for xoring output in CTR and
CBC modes and construction of IVs for CTR mode.

ECB mode sees ~0.5% decrease in speed because added one extra function
call. CBC mode decryption and CTR mode benefit from vector operations
and gain ~3%.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 166 ++++++++++++++++++---------
arch/x86/crypto/serpent_avx_glue.c | 43 +------
arch/x86/include/asm/crypto/serpent-avx.h | 27 +---
3 files changed, 121 insertions(+), 115 deletions(-)

diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 504106b..02b0e9f 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -24,7 +24,16 @@
*
*/

+#include "glue_helper-asm-avx.S"
+
.file "serpent-avx-x86_64-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
.text

#define CTX %rdi
@@ -550,51 +559,27 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;

-#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
- vmovdqu (0*4*4)(in), x0; \
- vmovdqu (1*4*4)(in), x1; \
- vmovdqu (2*4*4)(in), x2; \
- vmovdqu (3*4*4)(in), x3; \
- \
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

-#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
- transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
- \
- vmovdqu x0, (0*4*4)(out); \
- vmovdqu x1, (1*4*4)(out); \
- vmovdqu x2, (2*4*4)(out); \
- vmovdqu x3, (3*4*4)(out);
-
-#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
- transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
- \
- vpxor (0*4*4)(out), x0, x0; \
- vmovdqu x0, (0*4*4)(out); \
- vpxor (1*4*4)(out), x1, x1; \
- vmovdqu x1, (1*4*4)(out); \
- vpxor (2*4*4)(out), x2, x2; \
- vmovdqu x2, (2*4*4)(out); \
- vpxor (3*4*4)(out), x3, x3; \
- vmovdqu x3, (3*4*4)(out);
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
-.global __serpent_enc_blk_8way_avx
-.type __serpent_enc_blk_8way_avx,@function;
+.type __serpent_enc_blk8_avx,@function;

-__serpent_enc_blk_8way_avx:
+__serpent_enc_blk8_avx:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: bool, if true: xor output
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/

vpcmpeqd RNOT, RNOT, RNOT;

- leaq (4*4*4)(%rdx), %rax;
- read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
- read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

K2(RA, RB, RC, RD, RE, 0);
S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:
S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);

- leaq (4*4*4)(%rsi), %rax;
-
- testb %cl, %cl;
- jnz __enc_xor8;
-
- write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
- write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
-
- ret;
-
-__enc_xor8:
- xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
- xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+ write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

ret;

.align 8
-.global serpent_dec_blk_8way_avx
-.type serpent_dec_blk_8way_avx,@function;
+.type __serpent_dec_blk8_avx,@function;

-serpent_dec_blk_8way_avx:
+__serpent_dec_blk8_avx:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ * output:
+ * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
*/

vpcmpeqd RNOT, RNOT, RNOT;

- leaq (4*4*4)(%rdx), %rax;
- read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
- read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

K2(RA, RB, RC, RD, RE, 32);
SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:
SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);

- leaq (4*4*4)(%rsi), %rax;
- write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
- write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+ write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
+
+.align 8
+.global serpent_ecb_enc_8way_avx
+.type serpent_ecb_enc_8way_avx,@function;
+
+serpent_ecb_enc_8way_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_enc_blk8_avx;
+
+ store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ ret;
+
+.align 8
+.global serpent_ecb_dec_8way_avx
+.type serpent_ecb_dec_8way_avx,@function;
+
+serpent_ecb_dec_8way_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk8_avx;
+
+ store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ ret;
+
+.align 8
+.global serpent_cbc_dec_8way_avx
+.type serpent_cbc_dec_8way_avx,@function;
+
+serpent_cbc_dec_8way_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk8_avx;
+
+ store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ ret;
+
+.align 8
+.global serpent_ctr_8way_avx
+.type serpent_ctr_8way_avx,@function;
+
+serpent_ctr_8way_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (little endian, 128bit)
+ */
+
+ load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RK0, RK1, RK2);
+
+ call __serpent_enc_blk8_avx;
+
+ store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ret;
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 2aa31ad..52abaaf 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -42,20 +42,6 @@
#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>

-static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
- u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
- unsigned int j;
-
- for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
- ivs[j] = src[j];
-
- serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
- for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
- u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
-
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;
@@ -67,30 +53,13 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
u128_xor(dst, src, (u128 *)&ctrblk);
}

-static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
- le128 *iv)
-{
- be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
- unsigned int i;
-
- for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
- if (dst != src)
- dst[i] = src[i];
-
- le128_to_be128(&ctrblks[i], iv);
- le128_inc(iv);
- }
-
- serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-
static const struct common_glue_ctx serpent_enc = {
.num_funcs = 2,
.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,

.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = {

.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = {

.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = {

.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
- serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+ serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
return;
}

@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
- serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+ serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
return;
}

diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
index 432deed..0da1d3e 100644
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -6,27 +6,14 @@

#define SERPENT_PARALLEL_BLOCKS 8

-asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
const u8 *src);

-static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __serpent_enc_blk_8way_avx(ctx, dst, src, false);
-}
-
-static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __serpent_enc_blk_8way_avx(ctx, dst, src, true);
-}
-
-static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- serpent_dec_blk_8way_avx(ctx, dst, src);
-}
+asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);

#endif

2012-10-20 12:07:00

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 5/5] crypto: cast5/avx - avoid using temporary stack buffers

Introduce new assembler functions to avoid use temporary stack buffers in glue
code. This also allows use of vector instructions for xoring output in CTR and
CBC modes and construction of IVs for CTR mode.

ECB mode sees ~0.5% decrease in speed because added one extra function
call. CBC mode decryption and CTR mode benefit from vector operations
and gain ~5%.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 332 ++++++++++++++++++++++-------
arch/x86/crypto/cast5_avx_glue.c | 79 ++-----
2 files changed, 280 insertions(+), 131 deletions(-)

diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index a41a3aa..12478e4 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -180,31 +180,17 @@
vpunpcklqdq t1, t0, x0; \
vpunpckhqdq t1, t0, x1;

-#define inpack_blocks(in, x0, x1, t0, t1, rmask) \
- vmovdqu (0*4*4)(in), x0; \
- vmovdqu (1*4*4)(in), x1; \
+#define inpack_blocks(x0, x1, t0, t1, rmask) \
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
\
transpose_2x4(x0, x1, t0, t1)

-#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \
+#define outunpack_blocks(x0, x1, t0, t1, rmask) \
transpose_2x4(x0, x1, t0, t1) \
\
vpshufb rmask, x0, x0; \
- vpshufb rmask, x1, x1; \
- vmovdqu x0, (0*4*4)(out); \
- vmovdqu x1, (1*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
- transpose_2x4(x0, x1, t0, t1) \
- \
- vpshufb rmask, x0, x0; \
- vpshufb rmask, x1, x1; \
- vpxor (0*4*4)(out), x0, x0; \
- vmovdqu x0, (0*4*4)(out); \
- vpxor (1*4*4)(out), x1, x1; \
- vmovdqu x1, (1*4*4)(out);
+ vpshufb rmask, x1, x1;

.data

@@ -213,6 +199,8 @@
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap_iv_mask:
+ .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
.L16_mask:
.byte 16, 16, 16, 16
.L32_mask:
@@ -223,35 +211,42 @@
.text

.align 16
-.global __cast5_enc_blk_16way
-.type __cast5_enc_blk_16way,@function;
+.type __cast5_enc_blk16,@function;

-__cast5_enc_blk_16way:
+__cast5_enc_blk16:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: bool, if true: xor output
+ * RL1: blocks 1 and 2
+ * RR1: blocks 3 and 4
+ * RL2: blocks 5 and 6
+ * RR2: blocks 7 and 8
+ * RL3: blocks 9 and 10
+ * RR3: blocks 11 and 12
+ * RL4: blocks 13 and 14
+ * RR4: blocks 15 and 16
+ * output:
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
*/

pushq %rbp;
pushq %rbx;
- pushq %rcx;

vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
enc_preload_rkr();

- leaq 1*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
- inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
- movq %rsi, %r11;
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);

round(RL, RR, 0, 1);
round(RR, RL, 1, 2);
@@ -276,44 +271,41 @@ __cast5_enc_blk_16way:
round(RR, RL, 15, 1);

__skip_enc:
- popq %rcx;
popq %rbx;
popq %rbp;

vmovdqa .Lbswap_mask, RKM;
- leaq 1*(2*4*4)(%r11), %rax;

- testb %cl, %cl;
- jnz __enc_xor16;
-
- outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
- outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%r11), %rax;
- outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%r11), %rax;
- outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
-
- ret;
-
-__enc_xor16:
- outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
- outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%r11), %rax;
- outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%r11), %rax;
- outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

ret;

.align 16
-.global cast5_dec_blk_16way
-.type cast5_dec_blk_16way,@function;
+.type __cast5_dec_blk16,@function;

-cast5_dec_blk_16way:
+__cast5_dec_blk16:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
+ * output:
+ * RL1: decrypted blocks 1 and 2
+ * RR1: decrypted blocks 3 and 4
+ * RL2: decrypted blocks 5 and 6
+ * RR2: decrypted blocks 7 and 8
+ * RL3: decrypted blocks 9 and 10
+ * RR3: decrypted blocks 11 and 12
+ * RL4: decrypted blocks 13 and 14
+ * RR4: decrypted blocks 15 and 16
*/

pushq %rbp;
@@ -324,15 +316,10 @@ cast5_dec_blk_16way:
vmovd .L32_mask, R32;
dec_preload_rkr();

- leaq 1*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
- inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%rdx), %rax;
- inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
- movq %rsi, %r11;
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);

movzbl rr(CTX), %eax;
testl %eax, %eax;
@@ -361,16 +348,211 @@ __dec_tail:
popq %rbx;
popq %rbp;

- leaq 1*(2*4*4)(%r11), %rax;
- outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
- outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
- leaq 2*(2*4*4)(%r11), %rax;
- outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
- leaq 3*(2*4*4)(%r11), %rax;
- outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

ret;

__skip_dec:
vpsrldq $4, RKR, RKR;
jmp __dec_tail;
+
+.align 16
+.global cast5_ecb_enc_16way
+.type cast5_ecb_enc_16way,@function;
+
+cast5_ecb_enc_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_enc_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ ret;
+
+.align 16
+.global cast5_ecb_dec_16way
+.type cast5_ecb_dec_16way,@function;
+
+cast5_ecb_dec_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ ret;
+
+.align 16
+.global cast5_cbc_dec_16way
+.type cast5_cbc_dec_16way,@function;
+
+cast5_cbc_dec_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ vmovdqu (0*16)(%rdx), RL1;
+ vmovdqu (1*16)(%rdx), RR1;
+ vmovdqu (2*16)(%rdx), RL2;
+ vmovdqu (3*16)(%rdx), RR2;
+ vmovdqu (4*16)(%rdx), RL3;
+ vmovdqu (5*16)(%rdx), RR3;
+ vmovdqu (6*16)(%rdx), RL4;
+ vmovdqu (7*16)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ /* xor with src */
+ vmovq (%r12), RX;
+ vpshufd $0x4f, RX, RX;
+ vpxor RX, RR1, RR1;
+ vpxor 0*16+8(%r12), RL1, RL1;
+ vpxor 1*16+8(%r12), RR2, RR2;
+ vpxor 2*16+8(%r12), RL2, RL2;
+ vpxor 3*16+8(%r12), RR3, RR3;
+ vpxor 4*16+8(%r12), RL3, RL3;
+ vpxor 5*16+8(%r12), RR4, RR4;
+ vpxor 6*16+8(%r12), RL4, RL4;
+
+ vmovdqu RR1, (0*16)(%r11);
+ vmovdqu RL1, (1*16)(%r11);
+ vmovdqu RR2, (2*16)(%r11);
+ vmovdqu RL2, (3*16)(%r11);
+ vmovdqu RR3, (4*16)(%r11);
+ vmovdqu RL3, (5*16)(%r11);
+ vmovdqu RR4, (6*16)(%r11);
+ vmovdqu RL4, (7*16)(%r11);
+
+ popq %r12;
+
+ ret;
+
+.align 16
+.global cast5_ctr_16way
+.type cast5_ctr_16way,@function;
+
+cast5_ctr_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (big endian, 64bit)
+ */
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ vpcmpeqd RTMP, RTMP, RTMP;
+ vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
+
+ vpcmpeqd RKR, RKR, RKR;
+ vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
+ vmovdqa .Lbswap_iv_mask, R1ST;
+ vmovdqa .Lbswap128_mask, RKM;
+
+ /* load IV and byteswap */
+ vmovq (%rcx), RX;
+ vpshufb R1ST, RX, RX;
+
+ /* construct IVs */
+ vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
+ vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
+
+ /* store last IV */
+ vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
+ vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
+ vmovq RX, (%rcx);
+
+ call __cast5_enc_blk16;
+
+ /* dst = src ^ iv */
+ vpxor (0*16)(%r12), RR1, RR1;
+ vpxor (1*16)(%r12), RL1, RL1;
+ vpxor (2*16)(%r12), RR2, RR2;
+ vpxor (3*16)(%r12), RL2, RL2;
+ vpxor (4*16)(%r12), RR3, RR3;
+ vpxor (5*16)(%r12), RL3, RL3;
+ vpxor (6*16)(%r12), RR4, RR4;
+ vpxor (7*16)(%r12), RL4, RL4;
+ vmovdqu RR1, (0*16)(%r11);
+ vmovdqu RL1, (1*16)(%r11);
+ vmovdqu RR2, (2*16)(%r11);
+ vmovdqu RL2, (3*16)(%r11);
+ vmovdqu RR3, (4*16)(%r11);
+ vmovdqu RL3, (5*16)(%r11);
+ vmovdqu RR4, (6*16)(%r11);
+ vmovdqu RL4, (7*16)(%r11);
+
+ popq %r12;
+
+ ret;
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index e0ea14f..c663181 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -37,29 +37,14 @@

#define CAST5_PARALLEL_BLOCKS 16

-asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
+asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
-
-static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __cast5_enc_blk_16way(ctx, dst, src, false);
-}
-
-static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- __cast5_enc_blk_16way(ctx, dst, src, true);
-}
-
-static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
- const u8 *src)
-{
- cast5_dec_blk_16way(ctx, dst, src);
-}
-
+asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
+ __be64 *iv);

static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{
@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = CAST5_BLOCK_SIZE;
unsigned int nbytes;
+ void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
int err;

+ fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
+
err = blkcipher_walk_virt(desc, walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;

@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do {
- if (enc)
- cast5_enc_blk_xway(ctx, wdst, wsrc);
- else
- cast5_dec_blk_xway(ctx, wdst, wsrc);
+ fn(ctx, wdst, wsrc);

wsrc += bsize * CAST5_PARALLEL_BLOCKS;
wdst += bsize * CAST5_PARALLEL_BLOCKS;
@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
goto done;
}

+ fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
+
/* Handle leftovers */
do {
- if (enc)
- __cast5_encrypt(ctx, wdst, wsrc);
- else
- __cast5_decrypt(ctx, wdst, wsrc);
+ fn(ctx, wdst, wsrc);

wsrc += bsize;
wdst += bsize;
@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
u64 last_iv;
- int i;

/* Start of the last block. */
src += nbytes / bsize - 1;
@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
src -= CAST5_PARALLEL_BLOCKS - 1;
dst -= CAST5_PARALLEL_BLOCKS - 1;

- for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
- ivs[i] = src[i];
-
- cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
- for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
- *(dst + (i + 1)) ^= *(ivs + i);
+ cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);

nbytes -= bsize;
if (nbytes < bsize)
@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
- __be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
- int i;

/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do {
- /* create ctrblks for parallel encrypt */
- for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
- if (dst != src)
- dst[i] = src[i];
-
- ctrblocks[i] = cpu_to_be64(ctrblk++);
- }
-
- cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
- (u8 *)ctrblocks);
+ cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
+ (__be64 *)walk->iv);

src += CAST5_PARALLEL_BLOCKS;
dst += CAST5_PARALLEL_BLOCKS;
@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,

/* Handle leftovers */
do {
+ u64 ctrblk;
+
if (dst != src)
*dst = *src;

- ctrblocks[0] = cpu_to_be64(ctrblk++);
+ ctrblk = *(u64 *)walk->iv;
+ be64_add_cpu((__be64 *)walk->iv, 1);

- __cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
- *dst ^= ctrblocks[0];
+ __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ *dst ^= ctrblk;

src += 1;
dst += 1;
@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
} while (nbytes >= bsize);

done:
- *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
return nbytes;
}


2012-10-24 13:11:55

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH 0/5] Avoid stack buffers in glue code of AVX implementations

On Sat, Oct 20, 2012 at 03:06:31PM +0300, Jussi Kivilinna wrote:
> Patches modify AVX implementations so that they avoid using extra
> stack memory. Stack is not needed since implementation store
> blocks in registers.

All applied to cryptodev.
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt