2013-04-08 18:51:02

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 1/5] crypto: x86 - add more optimized XTS-mode for serpent-avx

This patch adds AVX optimized XTS-mode helper functions/macros and converts
serpent-avx to use the new facilities. Benefits are slightly improved speed
and reduced stack usage as use of temporary IV-array is avoided.

tcrypt results, with Intel i5-2450M:
enc dec
16B 1.00x 1.00x
64B 1.00x 1.00x
256B 1.04x 1.06x
1024B 1.09x 1.09x
8192B 1.10x 1.09x

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/glue_helper-asm-avx.S | 61 +++++++++++++++++
arch/x86/crypto/glue_helper.c | 97 +++++++++++++++++++++++++++
arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 45 ++++++++++++-
arch/x86/crypto/serpent_avx_glue.c | 87 +++++++++++++-----------
arch/x86/include/asm/crypto/glue_helper.h | 24 +++++++
arch/x86/include/asm/crypto/serpent-avx.h | 5 +
6 files changed, 273 insertions(+), 46 deletions(-)

diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
index f7b6ea2..02ee230 100644
--- a/arch/x86/crypto/glue_helper-asm-avx.S
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -1,7 +1,7 @@
/*
* Shared glue code for 128bit block ciphers, AVX assembler macros
*
- * Copyright (c) 2012 Jussi Kivilinna <[email protected]>
+ * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -89,3 +89,62 @@
vpxor (6*16)(src), x6, x6; \
vpxor (7*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
+ t1, xts_gf128mul_and_shl1_mask) \
+ vmovdqa xts_gf128mul_and_shl1_mask, t0; \
+ \
+ /* load IV */ \
+ vmovdqu (iv), tiv; \
+ vpxor (0*16)(src), tiv, x0; \
+ vmovdqu tiv, (0*16)(dst); \
+ \
+ /* construct and store IVs, also xor with source */ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (1*16)(src), tiv, x1; \
+ vmovdqu tiv, (1*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (2*16)(src), tiv, x2; \
+ vmovdqu tiv, (2*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (3*16)(src), tiv, x3; \
+ vmovdqu tiv, (3*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (4*16)(src), tiv, x4; \
+ vmovdqu tiv, (4*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (5*16)(src), tiv, x5; \
+ vmovdqu tiv, (5*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (6*16)(src), tiv, x6; \
+ vmovdqu tiv, (6*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (7*16)(src), tiv, x7; \
+ vmovdqu tiv, (7*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vmovdqu tiv, (iv);
+
+#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(dst), x0, x0; \
+ vpxor (1*16)(dst), x1, x1; \
+ vpxor (2*16)(dst), x2, x2; \
+ vpxor (3*16)(dst), x3, x3; \
+ vpxor (4*16)(dst), x4, x4; \
+ vpxor (5*16)(dst), x5, x5; \
+ vpxor (6*16)(dst), x6, x6; \
+ vpxor (7*16)(dst), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 22ce4f6..432f1d76 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -1,7 +1,7 @@
/*
* Shared glue code for 128bit block ciphers
*
- * Copyright (c) 2012 Jussi Kivilinna <[email protected]>
+ * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
*
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu <[email protected]>
@@ -304,4 +304,99 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
}
EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);

+static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+ void *ctx,
+ struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ const unsigned int bsize = 128 / 8;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ unsigned int num_blocks, func_bytes;
+ unsigned int i;
+
+ /* Process multi-block batch */
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes >= func_bytes) {
+ do {
+ gctx->funcs[i].fn_u.xts(ctx, dst, src,
+ (le128 *)walk->iv);
+
+ src += num_blocks;
+ dst += num_blocks;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+ }
+
+done:
+ return nbytes;
+}
+
+/* for implementations implementing faster XTS IV generator */
+int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+ struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes,
+ void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src),
+ void *tweak_ctx, void *crypt_ctx)
+{
+ const unsigned int bsize = 128 / 8;
+ bool fpu_enabled = false;
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+
+ err = blkcipher_walk_virt(desc, &walk);
+ nbytes = walk.nbytes;
+ if (!nbytes)
+ return err;
+
+ /* set minimum length to bsize, for tweak_fn */
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ desc, fpu_enabled,
+ nbytes < bsize ? bsize : nbytes);
+
+ /* calculate first value of T */
+ tweak_fn(tweak_ctx, walk.iv, walk.iv);
+
+ while (nbytes) {
+ nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
+
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ nbytes = walk.nbytes;
+ }
+
+ glue_fpu_end(fpu_enabled);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
+
+void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
+ common_glue_func_t fn)
+{
+ le128 ivblk = *iv;
+
+ /* generate next IV */
+ le128_gf128mul_x_ble(iv, &ivblk);
+
+ /* CC <- T xor C */
+ u128_xor(dst, src, (u128 *)&ivblk);
+
+ /* PP <- D(Key2,CC) */
+ fn(ctx, (u8 *)dst, (u8 *)dst);
+
+ /* P <- T xor PP */
+ u128_xor(dst, dst, (u128 *)&ivblk);
+}
+EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one);
+
MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 43c9386..2f202f4 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -4,8 +4,7 @@
* Copyright (C) 2012 Johannes Goetzfried
* <[email protected]>
*
- * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
- * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
+ * Copyright © 2011-2013 Jussi Kivilinna <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -34,6 +33,8 @@

.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

@@ -739,3 +740,43 @@ ENTRY(serpent_ctr_8way_avx)

ret;
ENDPROC(serpent_ctr_8way_avx)
+
+ENTRY(serpent_xts_enc_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+ call __serpent_enc_blk8_avx;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ ret;
+ENDPROC(serpent_xts_enc_8way_avx)
+
+ENTRY(serpent_xts_dec_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+ call __serpent_dec_blk8_avx;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ ret;
+ENDPROC(serpent_xts_dec_8way_avx)
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 52abaaf..0f8519c 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -4,8 +4,7 @@
* Copyright (C) 2012 Johannes Goetzfried
* <[email protected]>
*
- * Glue code based on serpent_sse2_glue.c by:
- * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
+ * Copyright © 2011-2013 Jussi Kivilinna <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -53,6 +52,18 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
u128_xor(dst, src, (u128 *)&ctrblk);
}

+static void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(__serpent_encrypt));
+}
+
+static void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(__serpent_decrypt));
+}
+
static const struct common_glue_ctx serpent_enc = {
.num_funcs = 2,
.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
@@ -79,6 +90,19 @@ static const struct common_glue_ctx serpent_ctr = {
} }
};

+static const struct common_glue_ctx serpent_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
+ } }
+};
+
static const struct common_glue_ctx serpent_dec = {
.num_funcs = 2,
.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
@@ -105,6 +129,19 @@ static const struct common_glue_ctx serpent_dec_cbc = {
} }
};

+static const struct common_glue_ctx serpent_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
+ } }
+};
+
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
@@ -299,54 +336,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[SERPENT_PARALLEL_BLOCKS];
- struct crypt_priv crypt_ctx = {
- .ctx = &ctx->crypt_ctx,
- .fpu_enabled = false,
- };
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .tweak_ctx = &ctx->tweak_ctx,
- .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
- .crypt_ctx = &crypt_ctx,
- .crypt_fn = encrypt_callback,
- };
- int ret;

- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- serpent_fpu_end(crypt_ctx.fpu_enabled);
-
- return ret;
+ return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(__serpent_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
}

static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[SERPENT_PARALLEL_BLOCKS];
- struct crypt_priv crypt_ctx = {
- .ctx = &ctx->crypt_ctx,
- .fpu_enabled = false,
- };
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .tweak_ctx = &ctx->tweak_ctx,
- .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
- .crypt_ctx = &crypt_ctx,
- .crypt_fn = decrypt_callback,
- };
- int ret;

- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- serpent_fpu_end(crypt_ctx.fpu_enabled);
-
- return ret;
+ return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(__serpent_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
}

static struct crypto_alg serpent_algs[10] = { {
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index e2d65b0..1eef555 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -14,10 +14,13 @@ typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
le128 *iv);
+typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv);

#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
+#define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn))

struct common_glue_func_entry {
unsigned int num_blocks; /* number of blocks that @fn will process */
@@ -25,6 +28,7 @@ struct common_glue_func_entry {
common_glue_func_t ecb;
common_glue_cbc_func_t cbc;
common_glue_ctr_func_t ctr;
+ common_glue_xts_func_t xts;
} fn_u;
};

@@ -96,6 +100,16 @@ static inline void le128_inc(le128 *i)
i->b = cpu_to_le64(b);
}

+static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src)
+{
+ u64 a = le64_to_cpu(src->a);
+ u64 b = le64_to_cpu(src->b);
+ u64 _tt = ((s64)a >> 63) & 0x87;
+
+ dst->a = cpu_to_le64((a << 1) ^ (b >> 63));
+ dst->b = cpu_to_le64((b << 1) ^ _tt);
+}
+
extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
struct blkcipher_desc *desc,
struct scatterlist *dst,
@@ -118,4 +132,14 @@ extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes);

+extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+ struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes,
+ common_glue_func_t tweak_fn, void *tweak_ctx,
+ void *crypt_ctx);
+
+extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv, common_glue_func_t fn);
+
#endif /* _CRYPTO_GLUE_HELPER_H */
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
index 0da1d3e..56e79cc 100644
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -16,4 +16,9 @@ asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);

+asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
#endif


2013-04-08 18:51:13

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 2/5] crypto: x86/twofish-avx - use optimized XTS code

Change twofish-avx to use the new XTS code, for smaller stack usage and small
boost to performance.

tcrypt results, with Intel i5-2450M:
enc dec
16B 1.03x 1.02x
64B 0.91x 0.91x
256B 1.10x 1.09x
1024B 1.12x 1.11x
8192B 1.12x 1.11x

Since XTS is practically always used with data blocks of size 512 bytes or
more, I chose to not make use of twofish-3way for block sized smaller than
128 bytes. This causes slower result in tcrypt for 64 bytes.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 48 ++++++++++++++
arch/x86/crypto/twofish_avx_glue.c | 91 +++++++++++++++------------
2 files changed, 98 insertions(+), 41 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 8d3e113..0505813 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -4,7 +4,7 @@
* Copyright (C) 2012 Johannes Goetzfried
* <[email protected]>
*
- * Copyright © 2012 Jussi Kivilinna <[email protected]>
+ * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -33,6 +33,8 @@

.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

@@ -408,3 +410,47 @@ ENTRY(twofish_ctr_8way)

ret;
ENDPROC(twofish_ctr_8way)
+
+ENTRY(twofish_xts_enc_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+ call __twofish_enc_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ ret;
+ENDPROC(twofish_xts_enc_8way)
+
+ENTRY(twofish_xts_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
+ RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+ call __twofish_dec_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ ret;
+ENDPROC(twofish_xts_dec_8way)
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 94ac91d..a62ba54 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -4,6 +4,8 @@
* Copyright (C) 2012 Johannes Goetzfried
* <[email protected]>
*
+ * Copyright © 2013 Jussi Kivilinna <[email protected]>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -56,12 +58,29 @@ asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);

+asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
__twofish_enc_blk_3way(ctx, dst, src, false);
}

+static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(twofish_enc_blk));
+}
+
+static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(twofish_dec_blk));
+}
+

static const struct common_glue_ctx twofish_enc = {
.num_funcs = 3,
@@ -95,6 +114,19 @@ static const struct common_glue_ctx twofish_ctr = {
} }
};

+static const struct common_glue_ctx twofish_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
+ } }
+};
+
static const struct common_glue_ctx twofish_dec = {
.num_funcs = 3,
.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
@@ -127,6 +159,19 @@ static const struct common_glue_ctx twofish_dec_cbc = {
} }
};

+static const struct common_glue_ctx twofish_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
+ } }
+};
+
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
@@ -275,54 +320,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[TWOFISH_PARALLEL_BLOCKS];
- struct crypt_priv crypt_ctx = {
- .ctx = &ctx->crypt_ctx,
- .fpu_enabled = false,
- };
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .tweak_ctx = &ctx->tweak_ctx,
- .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
- .crypt_ctx = &crypt_ctx,
- .crypt_fn = encrypt_callback,
- };
- int ret;

- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- twofish_fpu_end(crypt_ctx.fpu_enabled);
-
- return ret;
+ return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(twofish_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
}

static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[TWOFISH_PARALLEL_BLOCKS];
- struct crypt_priv crypt_ctx = {
- .ctx = &ctx->crypt_ctx,
- .fpu_enabled = false,
- };
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .tweak_ctx = &ctx->tweak_ctx,
- .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
- .crypt_ctx = &crypt_ctx,
- .crypt_fn = decrypt_callback,
- };
- int ret;

- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- twofish_fpu_end(crypt_ctx.fpu_enabled);
-
- return ret;
+ return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(twofish_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
}

static struct crypto_alg twofish_algs[10] = { {

2013-04-08 18:51:12

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 3/5] crypto: cast6-avx: use new optimized XTS code

Change cast6-avx to use the new XTS code, for smaller stack usage and small
boost to performance.

tcrypt results, with Intel i5-2450M:
enc dec
16B 1.01x 1.01x
64B 1.01x 1.00x
256B 1.09x 1.02x
1024B 1.08x 1.06x
8192B 1.08x 1.07x

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 48 +++++++++++++++
arch/x86/crypto/cast6_avx_glue.c | 91 ++++++++++++++++-------------
2 files changed, 98 insertions(+), 41 deletions(-)

diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index f93b610..e3531f8 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -4,7 +4,7 @@
* Copyright (C) 2012 Johannes Goetzfried
* <[email protected]>
*
- * Copyright © 2012 Jussi Kivilinna <[email protected]>
+ * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -227,6 +227,8 @@
.data

.align 16
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
@@ -424,3 +426,47 @@ ENTRY(cast6_ctr_8way)

ret;
ENDPROC(cast6_ctr_8way)
+
+ENTRY(cast6_xts_enc_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+ call __cast6_enc_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ ret;
+ENDPROC(cast6_xts_enc_8way)
+
+ENTRY(cast6_xts_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+ call __cast6_dec_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ ret;
+ENDPROC(cast6_xts_dec_8way)
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 92f7ca2..8d0dfb8 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -4,6 +4,8 @@
* Copyright (C) 2012 Johannes Goetzfried
* <[email protected]>
*
+ * Copyright © 2013 Jussi Kivilinna <[email protected]>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -50,6 +52,23 @@ asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
le128 *iv);

+asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(__cast6_encrypt));
+}
+
+static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(__cast6_decrypt));
+}
+
static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;
@@ -87,6 +106,19 @@ static const struct common_glue_ctx cast6_ctr = {
} }
};

+static const struct common_glue_ctx cast6_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) }
+ } }
+};
+
static const struct common_glue_ctx cast6_dec = {
.num_funcs = 2,
.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
@@ -113,6 +145,19 @@ static const struct common_glue_ctx cast6_dec_cbc = {
} }
};

+static const struct common_glue_ctx cast6_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) }
+ } }
+};
+
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
@@ -307,54 +352,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[CAST6_PARALLEL_BLOCKS];
- struct crypt_priv crypt_ctx = {
- .ctx = &ctx->crypt_ctx,
- .fpu_enabled = false,
- };
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),

- .tweak_ctx = &ctx->tweak_ctx,
- .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
- .crypt_ctx = &crypt_ctx,
- .crypt_fn = encrypt_callback,
- };
- int ret;
-
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- cast6_fpu_end(crypt_ctx.fpu_enabled);
-
- return ret;
+ return glue_xts_crypt_128bit(&cast6_enc_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(__cast6_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
}

static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[CAST6_PARALLEL_BLOCKS];
- struct crypt_priv crypt_ctx = {
- .ctx = &ctx->crypt_ctx,
- .fpu_enabled = false,
- };
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .tweak_ctx = &ctx->tweak_ctx,
- .tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
- .crypt_ctx = &crypt_ctx,
- .crypt_fn = decrypt_callback,
- };
- int ret;

- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- cast6_fpu_end(crypt_ctx.fpu_enabled);
-
- return ret;
+ return glue_xts_crypt_128bit(&cast6_dec_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(__cast6_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
}

static struct crypto_alg cast6_algs[10] = { {

2013-04-08 18:51:17

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 4/5] crypto: x86/camellia-aesni-avx - add more optimized XTS code

Add more optimized XTS code for camellia-aesni-avx, for smaller stack usage
and small boost for speed.

tcrypt results, with Intel i5-2450M:
enc dec
16B 1.10x 1.01x
64B 0.82x 0.77x
256B 1.14x 1.10x
1024B 1.17x 1.16x
8192B 1.10x 1.11x

Since XTS is practically always used with data blocks of size 512 bytes or
more, I chose to not make use of camellia-2way for block sized smaller than
256 bytes. This causes slower result in tcrypt for 64 bytes.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/camellia-aesni-avx-asm_64.S | 180 +++++++++++++++++++++++++++
arch/x86/crypto/camellia_aesni_avx_glue.c | 91 ++++++++------
2 files changed, 229 insertions(+), 42 deletions(-)

diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index cfc1634..ce71f92 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -1,7 +1,7 @@
/*
* x86_64/AVX/AES-NI assembler implementation of Camellia
*
- * Copyright © 2012 Jussi Kivilinna <[email protected]>
+ * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

+/* For XTS mode IV generation */
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
/*
* pre-SubByte transform
*
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way)

ret;
ENDPROC(camellia_ctr_16way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+.align 8
+camellia_xts_crypt_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ * %r8: index for input whitening key
+ * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
+ */
+
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
+
+ /* load IV */
+ vmovdqu (%rcx), %xmm0;
+ vpxor 0 * 16(%rdx), %xmm0, %xmm15;
+ vmovdqu %xmm15, 15 * 16(%rax);
+ vmovdqu %xmm0, 0 * 16(%rsi);
+
+ /* construct IVs */
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 1 * 16(%rdx), %xmm0, %xmm15;
+ vmovdqu %xmm15, 14 * 16(%rax);
+ vmovdqu %xmm0, 1 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 2 * 16(%rdx), %xmm0, %xmm13;
+ vmovdqu %xmm0, 2 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 3 * 16(%rdx), %xmm0, %xmm12;
+ vmovdqu %xmm0, 3 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 4 * 16(%rdx), %xmm0, %xmm11;
+ vmovdqu %xmm0, 4 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 5 * 16(%rdx), %xmm0, %xmm10;
+ vmovdqu %xmm0, 5 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 6 * 16(%rdx), %xmm0, %xmm9;
+ vmovdqu %xmm0, 6 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 7 * 16(%rdx), %xmm0, %xmm8;
+ vmovdqu %xmm0, 7 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 8 * 16(%rdx), %xmm0, %xmm7;
+ vmovdqu %xmm0, 8 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 9 * 16(%rdx), %xmm0, %xmm6;
+ vmovdqu %xmm0, 9 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 10 * 16(%rdx), %xmm0, %xmm5;
+ vmovdqu %xmm0, 10 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 11 * 16(%rdx), %xmm0, %xmm4;
+ vmovdqu %xmm0, 11 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 12 * 16(%rdx), %xmm0, %xmm3;
+ vmovdqu %xmm0, 12 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 13 * 16(%rdx), %xmm0, %xmm2;
+ vmovdqu %xmm0, 13 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 14 * 16(%rdx), %xmm0, %xmm1;
+ vmovdqu %xmm0, 14 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 15 * 16(%rdx), %xmm0, %xmm15;
+ vmovdqu %xmm15, 0 * 16(%rax);
+ vmovdqu %xmm0, 15 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vmovdqu %xmm0, (%rcx);
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX, %r8, 8), %xmm15;
+ vpshufb .Lpack_bswap, %xmm15, %xmm15;
+ vpxor 0 * 16(%rax), %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor %xmm13, %xmm15, %xmm13;
+ vpxor 14 * 16(%rax), %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call *%r9;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+ vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+ vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+ vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+ vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+ vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+ vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+ vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+ vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+ vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+ vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+ vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+ vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+ vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+ vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+ vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ ret;
+ENDPROC(camellia_xts_crypt_16way)
+
+ENTRY(camellia_xts_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+ leaq __camellia_enc_blk16, %r9;
+
+ jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_enc_16way)
+
+ENTRY(camellia_xts_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* input whitening key, last for dec */
+
+ leaq __camellia_dec_blk16, %r9;
+
+ jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_dec_16way)
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 96cbb60..4ff7ed4 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -1,7 +1,7 @@
/*
* Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
*
- * Copyright © 2012 Jussi Kivilinna <[email protected]>
+ * Copyright © 2012-2013 Jussi Kivilinna <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -37,6 +37,23 @@ asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);

+asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+static void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(camellia_enc_blk));
+}
+
+static void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(camellia_dec_blk));
+}
+
static const struct common_glue_ctx camellia_enc = {
.num_funcs = 3,
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
@@ -69,6 +86,19 @@ static const struct common_glue_ctx camellia_ctr = {
} }
};

+static const struct common_glue_ctx camellia_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+ } }
+};
+
static const struct common_glue_ctx camellia_dec = {
.num_funcs = 3,
.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
@@ -101,6 +131,19 @@ static const struct common_glue_ctx camellia_dec_cbc = {
} }
};

+static const struct common_glue_ctx camellia_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+ } }
+};
+
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
@@ -261,54 +304,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
- struct crypt_priv crypt_ctx = {
- .ctx = &ctx->crypt_ctx,
- .fpu_enabled = false,
- };
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .tweak_ctx = &ctx->tweak_ctx,
- .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
- .crypt_ctx = &crypt_ctx,
- .crypt_fn = encrypt_callback,
- };
- int ret;
-
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- camellia_fpu_end(crypt_ctx.fpu_enabled);

- return ret;
+ return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(camellia_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
}

static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
- struct crypt_priv crypt_ctx = {
- .ctx = &ctx->crypt_ctx,
- .fpu_enabled = false,
- };
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),

- .tweak_ctx = &ctx->tweak_ctx,
- .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
- .crypt_ctx = &crypt_ctx,
- .crypt_fn = decrypt_callback,
- };
- int ret;
-
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- camellia_fpu_end(crypt_ctx.fpu_enabled);
-
- return ret;
+ return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(camellia_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
}

static struct crypto_alg cmll_algs[10] = { {

2013-04-08 18:51:19

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 5/5] crypto: aesni_intel - add more optimized XTS mode for x86-64

Add more optimized XTS code for aesni_intel in 64-bit mode, for smaller stack
usage and boost for speed.

tcrypt results, with Intel i5-2450M:
256-bit key
enc dec
16B 0.98x 0.99x
64B 0.64x 0.63x
256B 1.29x 1.32x
1024B 1.54x 1.58x
8192B 1.57x 1.60x

512-bit key
enc dec
16B 0.98x 0.99x
64B 0.60x 0.59x
256B 1.24x 1.25x
1024B 1.39x 1.42x
8192B 1.38x 1.42x

I chose not to optimize smaller than block size of 256 bytes, since XTS is
practically always used with data blocks of size 512 bytes. This is why
performance is reduced in tcrypt for 64 byte long blocks.

Cc: Huang Ying <[email protected]>
Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/aesni-intel_asm.S | 117 ++++++++++++++++++++++++++++++++++++
arch/x86/crypto/aesni-intel_glue.c | 80 +++++++++++++++++++++++++
crypto/Kconfig | 1
3 files changed, 198 insertions(+)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 04b7977..62fe22c 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -34,6 +34,10 @@

#ifdef __x86_64__
.data
+.align 16
+.Lgf128mul_x_ble_mask:
+ .octa 0x00000000000000010000000000000087
+
POLY: .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001

@@ -105,6 +109,8 @@ enc: .octa 0x2
#define CTR %xmm11
#define INC %xmm12

+#define GF128MUL_MASK %xmm10
+
#ifdef __x86_64__
#define AREG %rax
#define KEYP %rdi
@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc)
.Lctr_enc_just_ret:
ret
ENDPROC(aesni_ctr_enc)
+
+/*
+ * _aesni_gf128mul_x_ble: internal ABI
+ * Multiply in GF(2^128) for XTS IVs
+ * input:
+ * IV: current IV
+ * GF128MUL_MASK == mask with 0x87 and 0x01
+ * output:
+ * IV: next IV
+ * changed:
+ * CTR: == temporary value
+ */
+#define _aesni_gf128mul_x_ble() \
+ pshufd $0x13, IV, CTR; \
+ paddq IV, IV; \
+ psrad $31, CTR; \
+ pand GF128MUL_MASK, CTR; \
+ pxor CTR, IV;
+
+/*
+ * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * bool enc, u8 *iv)
+ */
+ENTRY(aesni_xts_crypt8)
+ cmpb $0, %cl
+ movl $0, %ecx
+ movl $240, %r10d
+ leaq _aesni_enc4, %r11
+ leaq _aesni_dec4, %rax
+ cmovel %r10d, %ecx
+ cmoveq %rax, %r11
+
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+ addq %rcx, KEYP
+
+ movdqa IV, STATE1
+ pxor 0x00(INP), STATE1
+ movdqu IV, 0x00(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE2
+ pxor 0x10(INP), STATE2
+ movdqu IV, 0x10(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ pxor 0x20(INP), STATE3
+ movdqu IV, 0x20(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE4
+ pxor 0x30(INP), STATE4
+ movdqu IV, 0x30(OUTP)
+
+ call *%r11
+
+ pxor 0x00(OUTP), STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE1
+ pxor 0x40(INP), STATE1
+ movdqu IV, 0x40(OUTP)
+
+ pxor 0x10(OUTP), STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE2
+ pxor 0x50(INP), STATE2
+ movdqu IV, 0x50(OUTP)
+
+ pxor 0x20(OUTP), STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ pxor 0x60(INP), STATE3
+ movdqu IV, 0x60(OUTP)
+
+ pxor 0x30(OUTP), STATE4
+ movdqu STATE4, 0x30(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE4
+ pxor 0x70(INP), STATE4
+ movdqu IV, 0x70(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movups IV, (IVP)
+
+ call *%r11
+
+ pxor 0x40(OUTP), STATE1
+ movdqu STATE1, 0x40(OUTP)
+
+ pxor 0x50(OUTP), STATE2
+ movdqu STATE2, 0x50(OUTP)
+
+ pxor 0x60(OUTP), STATE3
+ movdqu STATE3, 0x60(OUTP)
+
+ pxor 0x70(OUTP), STATE4
+ movdqu STATE4, 0x70(OUTP)
+
+ ret
+ENDPROC(aesni_xts_crypt8)
+
#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index a0795da..f80e668 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -39,6 +39,9 @@
#include <crypto/internal/aead.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
+#ifdef CONFIG_X86_64
+#include <asm/crypto/glue_helper.h>
+#endif

#if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
#define HAS_PCBC
@@ -102,6 +105,9 @@ void crypto_fpu_exit(void);
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);

+asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, bool enc, u8 *iv);
+
/* asmlinkage void aesni_gcm_enc()
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
* u8 *out, Ciphertext output. Encrypt in-place is allowed.
@@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
aesni_enc(ctx, out, in);
}

+#ifdef CONFIG_X86_64
+
+static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
+}
+
+static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
+}
+
+static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
+}
+
+static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
+}
+
+static const struct common_glue_ctx aesni_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+ .num_blocks = 8,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
+ } }
+};
+
+static const struct common_glue_ctx aesni_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+ .num_blocks = 8,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
+ } }
+};
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+ return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(aesni_xts_tweak),
+ aes_ctx(ctx->raw_tweak_ctx),
+ aes_ctx(ctx->raw_crypt_ctx));
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+ return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes,
+ XTS_TWEAK_CAST(aesni_xts_tweak),
+ aes_ctx(ctx->raw_tweak_ctx),
+ aes_ctx(ctx->raw_crypt_ctx));
+}
+
+#else
+
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
{
@@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
return ret;
}

+#endif
+
#ifdef CONFIG_X86_64
static int rfc4106_init(struct crypto_tfm *tfm)
{
diff --git a/crypto/Kconfig b/crypto/Kconfig
index c1142f3..808ac37 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -678,6 +678,7 @@ config CRYPTO_AES_NI_INTEL
select CRYPTO_CRYPTD
select CRYPTO_ABLK_HELPER_X86
select CRYPTO_ALGAPI
+ select CRYPTO_GLUE_HELPER if 64BIT
select CRYPTO_LRW
select CRYPTO_XTS
help

2013-04-10 03:32:44

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH 1/5] crypto: x86 - add more optimized XTS-mode for serpent-avx

On Mon, Apr 08, 2013 at 09:50:55PM +0300, Jussi Kivilinna wrote:
> This patch adds AVX optimized XTS-mode helper functions/macros and converts
> serpent-avx to use the new facilities. Benefits are slightly improved speed
> and reduced stack usage as use of temporary IV-array is avoided.
>
> tcrypt results, with Intel i5-2450M:
> enc dec
> 16B 1.00x 1.00x
> 64B 1.00x 1.00x
> 256B 1.04x 1.06x
> 1024B 1.09x 1.09x
> 8192B 1.10x 1.09x
>
> Signed-off-by: Jussi Kivilinna <[email protected]>

All applied. Thanks!
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt