2011-10-17 21:02:52

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 0/7] crypto: add SSE2-x86_64/i586 implementation of Serpent cipher

This series adds SSE2 optimized version of Serpent cipher for x86_64 and i586
architectures. The i586 implementation processes four serpent blocks parallel
in SSE2 registers. The x86_64 implementation utilizes available extra SSE2
registers for higher performance on out-of-order CPUs, crypting 8 blocks
parallel.

Series depends on previous testmgr/tcrypt patches in twofish-asm-3way series
and also on following patches:
http://marc.info/?l=linux-crypto-vger&m=131827700228773&w=2
http://marc.info/?l=linux-crypto-vger&m=131827699228759&w=2

---

Jussi Kivilinna (7):
crypto: testmgr: add new serpent test vectors
crypto: tcrypt: add test_acipher_speed
crypto: tcrypt: add serpent speed tests
crypto: serpent: export common functions for x86_64/i386-sse2 assembler implementations
crypto: serpent: rename module from serpent to serpent_generic
crypto: serpent: add 8-way parallel x86_64/SSE2 assembler implementation
crypto: serpent: add 4-way parallel i586/SSE2 assembler implementation


arch/x86/crypto/Makefile | 4
arch/x86/crypto/serpent-sse2-i586-asm_32.S | 639 ++++++++++++++++++++++
arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 761 ++++++++++++++++++++++++++
arch/x86/crypto/serpent_sse2_glue.c | 719 +++++++++++++++++++++++++
arch/x86/include/asm/serpent.h | 64 ++
crypto/Kconfig | 34 +
crypto/Makefile | 4
crypto/serpent.c | 44 +-
crypto/tcrypt.c | 282 ++++++++++
crypto/testmgr.c | 90 +++
crypto/testmgr.h | 393 +++++++++++++
include/crypto/serpent.h | 25 +
12 files changed, 3037 insertions(+), 22 deletions(-)
create mode 100644 arch/x86/crypto/serpent-sse2-i586-asm_32.S
create mode 100644 arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
create mode 100644 arch/x86/crypto/serpent_sse2_glue.c
create mode 100644 arch/x86/include/asm/serpent.h
create mode 100644 include/crypto/serpent.h


2011-10-17 21:02:57

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 1/7] crypto: testmgr: add new serpent test vectors

Add new serpent tests for serpent_sse2 x86_64/i586 8-way/4-way code paths.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
crypto/tcrypt.c | 2
crypto/testmgr.c | 30 ++++
crypto/testmgr.h | 393 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 423 insertions(+), 2 deletions(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 0c4e80f..ac9e4d2 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -793,6 +793,8 @@ static int do_test(int m)

case 9:
ret += tcrypt_test("ecb(serpent)");
+ ret += tcrypt_test("cbc(serpent)");
+ ret += tcrypt_test("ctr(serpent)");
break;

case 10:
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index e91c1eb..49436b9 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1675,6 +1675,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+ .alg = "cbc(serpent)",
+ .test = alg_test_skcipher,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = serpent_cbc_enc_tv_template,
+ .count = SERPENT_CBC_ENC_TEST_VECTORS
+ },
+ .dec = {
+ .vecs = serpent_cbc_dec_tv_template,
+ .count = SERPENT_CBC_DEC_TEST_VECTORS
+ }
+ }
+ }
+ }, {
.alg = "cbc(twofish)",
.test = alg_test_skcipher,
.suite = {
@@ -1771,6 +1786,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+ .alg = "ctr(serpent)",
+ .test = alg_test_skcipher,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = serpent_ctr_enc_tv_template,
+ .count = SERPENT_CTR_ENC_TEST_VECTORS
+ },
+ .dec = {
+ .vecs = serpent_ctr_dec_tv_template,
+ .count = SERPENT_CTR_DEC_TEST_VECTORS
+ }
+ }
+ }
+ }, {
.alg = "ctr(twofish)",
.test = alg_test_skcipher,
.suite = {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 37b4d8f..ed4aec9 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -3096,12 +3096,18 @@ static struct cipher_testvec tf_ctr_dec_tv_template[] = {
* Serpent test vectors. These are backwards because Serpent writes
* octet sequences in right-to-left mode.
*/
-#define SERPENT_ENC_TEST_VECTORS 4
-#define SERPENT_DEC_TEST_VECTORS 4
+#define SERPENT_ENC_TEST_VECTORS 5
+#define SERPENT_DEC_TEST_VECTORS 5

#define TNEPRES_ENC_TEST_VECTORS 4
#define TNEPRES_DEC_TEST_VECTORS 4

+#define SERPENT_CBC_ENC_TEST_VECTORS 1
+#define SERPENT_CBC_DEC_TEST_VECTORS 1
+
+#define SERPENT_CTR_ENC_TEST_VECTORS 2
+#define SERPENT_CTR_DEC_TEST_VECTORS 2
+
static struct cipher_testvec serpent_enc_tv_template[] = {
{
.input = "\x00\x01\x02\x03\x04\x05\x06\x07"
@@ -3140,6 +3146,50 @@ static struct cipher_testvec serpent_enc_tv_template[] = {
.result = "\xdd\xd2\x6b\x98\xa5\xff\xd8\x2c"
"\x05\x34\x5a\x9d\xad\xbf\xaf\x49",
.rlen = 16,
+ }, { /* Generated with Crypto++ */
+ .key = "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
+ "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
+ "\x27\x04\xE1\x27\x04\xE1\xBE\x9B"
+ "\x78\xBE\x9B\x78\x55\x32\x0F\x55",
+ .klen = 32,
+ .input = "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
+ "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
+ "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
+ "\xAC\x20\xB7\x4E\xE5\x59\xF0\x87"
+ "\x1E\x92\x29\xC0\x34\xCB\x62\xF9"
+ "\x6D\x04\x9B\x0F\xA6\x3D\xD4\x48"
+ "\xDF\x76\x0D\x81\x18\xAF\x23\xBA"
+ "\x51\xE8\x5C\xF3\x8A\x21\x95\x2C"
+ "\xC3\x37\xCE\x65\xFC\x70\x07\x9E"
+ "\x12\xA9\x40\xD7\x4B\xE2\x79\x10"
+ "\x84\x1B\xB2\x26\xBD\x54\xEB\x5F"
+ "\xF6\x8D\x01\x98\x2F\xC6\x3A\xD1"
+ "\x68\xFF\x73\x0A\xA1\x15\xAC\x43"
+ "\xDA\x4E\xE5\x7C\x13\x87\x1E\xB5"
+ "\x29\xC0\x57\xEE\x62\xF9\x90\x04"
+ "\x9B\x32\xC9\x3D\xD4\x6B\x02\x76"
+ "\x0D\xA4\x18\xAF\x46\xDD\x51\xE8"
+ "\x7F\x16\x8A\x21\xB8\x2C\xC3\x5A",
+ .ilen = 144,
+ .result = "\xFB\xB0\x5D\xDE\xC0\xFE\xFC\xEB"
+ "\xB1\x80\x10\x43\xDE\x62\x70\xBD"
+ "\xFA\x8A\x93\xEA\x6B\xF7\xC5\xD7"
+ "\x0C\xD1\xBB\x29\x25\x14\x4C\x22"
+ "\x77\xA6\x38\x00\xDB\xB9\xE2\x07"
+ "\xD1\xAC\x82\xBA\xEA\x67\xAA\x39"
+ "\x99\x34\x89\x5B\x54\xE9\x12\x13"
+ "\x3B\x04\xE5\x12\x42\xC5\x79\xAB"
+ "\x0D\xC7\x3C\x58\x2D\xA3\x98\xF6"
+ "\xE4\x61\x9E\x17\x0B\xCE\xE8\xAA"
+ "\xB5\x6C\x1A\x3A\x67\x52\x81\x6A"
+ "\x04\xFF\x8A\x1B\x96\xFE\xE6\x87"
+ "\x3C\xD4\x39\x7D\x36\x9B\x03\xD5"
+ "\xB6\xA0\x75\x3C\x83\xE6\x1C\x73"
+ "\x9D\x74\x2B\x77\x53\x2D\xE5\xBD"
+ "\x69\xDA\x7A\x01\xF5\x6A\x70\x39"
+ "\x30\xD4\x2C\xF2\x8E\x06\x4B\x39"
+ "\xB3\x12\x1D\xB3\x17\x46\xE6\xD6",
+ .rlen = 144,
},
};

@@ -3231,6 +3281,50 @@ static struct cipher_testvec serpent_dec_tv_template[] = {
.ilen = 16,
.result = zeroed_string,
.rlen = 16,
+ }, { /* Generated with Crypto++ */
+ .key = "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
+ "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
+ "\x27\x04\xE1\x27\x04\xE1\xBE\x9B"
+ "\x78\xBE\x9B\x78\x55\x32\x0F\x55",
+ .klen = 32,
+ .input = "\xFB\xB0\x5D\xDE\xC0\xFE\xFC\xEB"
+ "\xB1\x80\x10\x43\xDE\x62\x70\xBD"
+ "\xFA\x8A\x93\xEA\x6B\xF7\xC5\xD7"
+ "\x0C\xD1\xBB\x29\x25\x14\x4C\x22"
+ "\x77\xA6\x38\x00\xDB\xB9\xE2\x07"
+ "\xD1\xAC\x82\xBA\xEA\x67\xAA\x39"
+ "\x99\x34\x89\x5B\x54\xE9\x12\x13"
+ "\x3B\x04\xE5\x12\x42\xC5\x79\xAB"
+ "\x0D\xC7\x3C\x58\x2D\xA3\x98\xF6"
+ "\xE4\x61\x9E\x17\x0B\xCE\xE8\xAA"
+ "\xB5\x6C\x1A\x3A\x67\x52\x81\x6A"
+ "\x04\xFF\x8A\x1B\x96\xFE\xE6\x87"
+ "\x3C\xD4\x39\x7D\x36\x9B\x03\xD5"
+ "\xB6\xA0\x75\x3C\x83\xE6\x1C\x73"
+ "\x9D\x74\x2B\x77\x53\x2D\xE5\xBD"
+ "\x69\xDA\x7A\x01\xF5\x6A\x70\x39"
+ "\x30\xD4\x2C\xF2\x8E\x06\x4B\x39"
+ "\xB3\x12\x1D\xB3\x17\x46\xE6\xD6",
+ .ilen = 144,
+ .result = "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
+ "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
+ "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
+ "\xAC\x20\xB7\x4E\xE5\x59\xF0\x87"
+ "\x1E\x92\x29\xC0\x34\xCB\x62\xF9"
+ "\x6D\x04\x9B\x0F\xA6\x3D\xD4\x48"
+ "\xDF\x76\x0D\x81\x18\xAF\x23\xBA"
+ "\x51\xE8\x5C\xF3\x8A\x21\x95\x2C"
+ "\xC3\x37\xCE\x65\xFC\x70\x07\x9E"
+ "\x12\xA9\x40\xD7\x4B\xE2\x79\x10"
+ "\x84\x1B\xB2\x26\xBD\x54\xEB\x5F"
+ "\xF6\x8D\x01\x98\x2F\xC6\x3A\xD1"
+ "\x68\xFF\x73\x0A\xA1\x15\xAC\x43"
+ "\xDA\x4E\xE5\x7C\x13\x87\x1E\xB5"
+ "\x29\xC0\x57\xEE\x62\xF9\x90\x04"
+ "\x9B\x32\xC9\x3D\xD4\x6B\x02\x76"
+ "\x0D\xA4\x18\xAF\x46\xDD\x51\xE8"
+ "\x7F\x16\x8A\x21\xB8\x2C\xC3\x5A",
+ .rlen = 144,
},
};

@@ -3275,6 +3369,301 @@ static struct cipher_testvec tnepres_dec_tv_template[] = {
},
};

+static struct cipher_testvec serpent_cbc_enc_tv_template[] = {
+ { /* Generated with Crypto++ */
+ .key = "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
+ "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
+ "\x27\x04\xE1\x27\x04\xE1\xBE\x9B"
+ "\x78\xBE\x9B\x78\x55\x32\x0F\x55",
+ .klen = 32,
+ .iv = "\xE2\x24\x89\xEE\x53\xB8\x1D\x5F"
+ "\xC4\x29\x8E\xF3\x35\x9A\xFF\x64",
+ .input = "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
+ "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
+ "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
+ "\xAC\x20\xB7\x4E\xE5\x59\xF0\x87"
+ "\x1E\x92\x29\xC0\x34\xCB\x62\xF9"
+ "\x6D\x04\x9B\x0F\xA6\x3D\xD4\x48"
+ "\xDF\x76\x0D\x81\x18\xAF\x23\xBA"
+ "\x51\xE8\x5C\xF3\x8A\x21\x95\x2C"
+ "\xC3\x37\xCE\x65\xFC\x70\x07\x9E"
+ "\x12\xA9\x40\xD7\x4B\xE2\x79\x10"
+ "\x84\x1B\xB2\x26\xBD\x54\xEB\x5F"
+ "\xF6\x8D\x01\x98\x2F\xC6\x3A\xD1"
+ "\x68\xFF\x73\x0A\xA1\x15\xAC\x43"
+ "\xDA\x4E\xE5\x7C\x13\x87\x1E\xB5"
+ "\x29\xC0\x57\xEE\x62\xF9\x90\x04"
+ "\x9B\x32\xC9\x3D\xD4\x6B\x02\x76"
+ "\x0D\xA4\x18\xAF\x46\xDD\x51\xE8"
+ "\x7F\x16\x8A\x21\xB8\x2C\xC3\x5A",
+ .ilen = 144,
+ .result = "\x80\xCF\x11\x41\x1A\xB9\x4B\x9C"
+ "\xFF\xB7\x6C\xEA\xF0\xAF\x77\x6E"
+ "\x71\x75\x95\x9D\x4E\x1C\xCF\xAD"
+ "\x81\x34\xE9\x8F\xAE\x5A\x91\x1C"
+ "\x38\x63\x35\x7E\x79\x18\x0A\xE8"
+ "\x67\x06\x76\xD5\xFF\x22\x2F\xDA"
+ "\xB6\x2D\x57\x13\xB6\x3C\xBC\x97"
+ "\xFE\x53\x75\x35\x97\x7F\x51\xEA"
+ "\xDF\x5D\xE8\x9D\xCC\xD9\xAE\xE7"
+ "\x62\x67\xFF\x04\xC2\x18\x22\x5F"
+ "\x2E\x06\xC1\xE2\x26\xCD\xC6\x1E"
+ "\xE5\x2C\x4E\x87\x23\xDD\xF0\x41"
+ "\x08\xA5\xB4\x3E\x07\x1E\x0B\xBB"
+ "\x72\x84\xF8\x0A\x3F\x38\x5E\x91"
+ "\x15\x26\xE1\xDB\xA4\x3D\x74\xD2"
+ "\x41\x1E\x3F\xA9\xC6\x7D\x2A\xAB"
+ "\x27\xDF\x89\x1D\x86\x3E\xF7\x5A"
+ "\xF6\xE3\x0F\xC7\x6B\x4C\x96\x7C",
+ .rlen = 144,
+ },
+};
+
+static struct cipher_testvec serpent_cbc_dec_tv_template[] = {
+ { /* Generated with Crypto++ */
+ .key = "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
+ "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
+ "\x27\x04\xE1\x27\x04\xE1\xBE\x9B"
+ "\x78\xBE\x9B\x78\x55\x32\x0F\x55",
+ .klen = 32,
+ .iv = "\xE2\x24\x89\xEE\x53\xB8\x1D\x5F"
+ "\xC4\x29\x8E\xF3\x35\x9A\xFF\x64",
+ .input = "\x80\xCF\x11\x41\x1A\xB9\x4B\x9C"
+ "\xFF\xB7\x6C\xEA\xF0\xAF\x77\x6E"
+ "\x71\x75\x95\x9D\x4E\x1C\xCF\xAD"
+ "\x81\x34\xE9\x8F\xAE\x5A\x91\x1C"
+ "\x38\x63\x35\x7E\x79\x18\x0A\xE8"
+ "\x67\x06\x76\xD5\xFF\x22\x2F\xDA"
+ "\xB6\x2D\x57\x13\xB6\x3C\xBC\x97"
+ "\xFE\x53\x75\x35\x97\x7F\x51\xEA"
+ "\xDF\x5D\xE8\x9D\xCC\xD9\xAE\xE7"
+ "\x62\x67\xFF\x04\xC2\x18\x22\x5F"
+ "\x2E\x06\xC1\xE2\x26\xCD\xC6\x1E"
+ "\xE5\x2C\x4E\x87\x23\xDD\xF0\x41"
+ "\x08\xA5\xB4\x3E\x07\x1E\x0B\xBB"
+ "\x72\x84\xF8\x0A\x3F\x38\x5E\x91"
+ "\x15\x26\xE1\xDB\xA4\x3D\x74\xD2"
+ "\x41\x1E\x3F\xA9\xC6\x7D\x2A\xAB"
+ "\x27\xDF\x89\x1D\x86\x3E\xF7\x5A"
+ "\xF6\xE3\x0F\xC7\x6B\x4C\x96\x7C",
+ .ilen = 144,
+ .result = "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
+ "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
+ "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
+ "\xAC\x20\xB7\x4E\xE5\x59\xF0\x87"
+ "\x1E\x92\x29\xC0\x34\xCB\x62\xF9"
+ "\x6D\x04\x9B\x0F\xA6\x3D\xD4\x48"
+ "\xDF\x76\x0D\x81\x18\xAF\x23\xBA"
+ "\x51\xE8\x5C\xF3\x8A\x21\x95\x2C"
+ "\xC3\x37\xCE\x65\xFC\x70\x07\x9E"
+ "\x12\xA9\x40\xD7\x4B\xE2\x79\x10"
+ "\x84\x1B\xB2\x26\xBD\x54\xEB\x5F"
+ "\xF6\x8D\x01\x98\x2F\xC6\x3A\xD1"
+ "\x68\xFF\x73\x0A\xA1\x15\xAC\x43"
+ "\xDA\x4E\xE5\x7C\x13\x87\x1E\xB5"
+ "\x29\xC0\x57\xEE\x62\xF9\x90\x04"
+ "\x9B\x32\xC9\x3D\xD4\x6B\x02\x76"
+ "\x0D\xA4\x18\xAF\x46\xDD\x51\xE8"
+ "\x7F\x16\x8A\x21\xB8\x2C\xC3\x5A",
+ .rlen = 144,
+ },
+};
+
+static struct cipher_testvec serpent_ctr_enc_tv_template[] = {
+ { /* Generated with Crypto++ */
+ .key = "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
+ "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
+ "\x27\x04\xE1\x27\x04\xE1\xBE\x9B"
+ "\x78\xBE\x9B\x78\x55\x32\x0F\x55",
+ .klen = 32,
+ .iv = "\xE2\x24\x89\xEE\x53\xB8\x1D\x5F"
+ "\xC4\x29\x8E\xF3\x35\x9A\xFF\x64",
+ .input = "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
+ "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
+ "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
+ "\xAC\x20\xB7\x4E\xE5\x59\xF0\x87"
+ "\x1E\x92\x29\xC0\x34\xCB\x62\xF9"
+ "\x6D\x04\x9B\x0F\xA6\x3D\xD4\x48"
+ "\xDF\x76\x0D\x81\x18\xAF\x23\xBA"
+ "\x51\xE8\x5C\xF3\x8A\x21\x95\x2C"
+ "\xC3\x37\xCE\x65\xFC\x70\x07\x9E"
+ "\x12\xA9\x40\xD7\x4B\xE2\x79\x10"
+ "\x84\x1B\xB2\x26\xBD\x54\xEB\x5F"
+ "\xF6\x8D\x01\x98\x2F\xC6\x3A\xD1"
+ "\x68\xFF\x73\x0A\xA1\x15\xAC\x43"
+ "\xDA\x4E\xE5\x7C\x13\x87\x1E\xB5"
+ "\x29\xC0\x57\xEE\x62\xF9\x90\x04"
+ "\x9B\x32\xC9\x3D\xD4\x6B\x02\x76"
+ "\x0D\xA4\x18\xAF\x46\xDD\x51\xE8"
+ "\x7F\x16\x8A\x21\xB8\x2C\xC3\x5A",
+ .ilen = 144,
+ .result = "\x84\x68\xEC\xF2\x1C\x88\x20\xCA"
+ "\x37\x69\xE3\x3A\x22\x85\x48\x46"
+ "\x70\xAA\x25\xB4\xCD\x8B\x04\x4E"
+ "\x8D\x15\x2B\x98\xDF\x7B\x6D\xB9"
+ "\xE0\x4A\x73\x00\x65\xB6\x1A\x0D"
+ "\x5C\x60\xDF\x34\xDC\x60\x4C\xDF"
+ "\xB5\x1F\x26\x8C\xDA\xC1\x11\xA8"
+ "\x80\xFA\x37\x7A\x89\xAA\xAE\x7B"
+ "\x92\x6E\xB9\xDC\xC9\x62\x4F\x88"
+ "\x0A\x5D\x97\x2F\x6B\xAC\x03\x7C"
+ "\x22\xF6\x55\x5A\xFA\x35\xA5\x17"
+ "\xA1\x5C\x5E\x2B\x63\x2D\xB9\x91"
+ "\x3E\x83\x26\x00\x4E\xD5\xBE\xCE"
+ "\x79\xC4\x3D\xFC\x70\xA0\xAD\x96"
+ "\xBA\x58\x2A\x1C\xDF\xC2\x3A\xA5"
+ "\x7C\xB5\x12\x89\xED\xBF\xB6\x09"
+ "\x13\x4F\x7D\x61\x3C\x5C\x27\xFC"
+ "\x5D\xE1\x4F\xA1\xEA\xB3\xCA\xB9",
+ .rlen = 144,
+ }, { /* Generated with Crypto++ */
+ .key = "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
+ "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
+ "\x27\x04\xE1\x27\x04\xE1\xBE\x9B"
+ "\x78\xBE\x9B\x78\x55\x32\x0F\x55",
+ .klen = 32,
+ .iv = "\xE2\x24\x89\xEE\x53\xB8\x1D\x5F"
+ "\xC4\x29\x8E\xF3\x35\x9A\xFF\x64",
+ .input = "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
+ "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
+ "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
+ "\xAC\x20\xB7\x4E\xE5\x59\xF0\x87"
+ "\x1E\x92\x29\xC0\x34\xCB\x62\xF9"
+ "\x6D\x04\x9B\x0F\xA6\x3D\xD4\x48"
+ "\xDF\x76\x0D\x81\x18\xAF\x23\xBA"
+ "\x51\xE8\x5C\xF3\x8A\x21\x95\x2C"
+ "\xC3\x37\xCE\x65\xFC\x70\x07\x9E"
+ "\x12\xA9\x40\xD7\x4B\xE2\x79\x10"
+ "\x84\x1B\xB2\x26\xBD\x54\xEB\x5F"
+ "\xF6\x8D\x01\x98\x2F\xC6\x3A\xD1"
+ "\x68\xFF\x73\x0A\xA1\x15\xAC\x43"
+ "\xDA\x4E\xE5\x7C\x13\x87\x1E\xB5"
+ "\x29\xC0\x57\xEE\x62\xF9\x90\x04"
+ "\x9B\x32\xC9\x3D\xD4\x6B\x02\x76"
+ "\x0D\xA4\x18\xAF\x46\xDD\x51\xE8"
+ "\x7F\x16\x8A\x21\xB8\x2C\xC3\x5A"
+ "\xF1\x65\xFC",
+ .ilen = 147,
+ .result = "\x84\x68\xEC\xF2\x1C\x88\x20\xCA"
+ "\x37\x69\xE3\x3A\x22\x85\x48\x46"
+ "\x70\xAA\x25\xB4\xCD\x8B\x04\x4E"
+ "\x8D\x15\x2B\x98\xDF\x7B\x6D\xB9"
+ "\xE0\x4A\x73\x00\x65\xB6\x1A\x0D"
+ "\x5C\x60\xDF\x34\xDC\x60\x4C\xDF"
+ "\xB5\x1F\x26\x8C\xDA\xC1\x11\xA8"
+ "\x80\xFA\x37\x7A\x89\xAA\xAE\x7B"
+ "\x92\x6E\xB9\xDC\xC9\x62\x4F\x88"
+ "\x0A\x5D\x97\x2F\x6B\xAC\x03\x7C"
+ "\x22\xF6\x55\x5A\xFA\x35\xA5\x17"
+ "\xA1\x5C\x5E\x2B\x63\x2D\xB9\x91"
+ "\x3E\x83\x26\x00\x4E\xD5\xBE\xCE"
+ "\x79\xC4\x3D\xFC\x70\xA0\xAD\x96"
+ "\xBA\x58\x2A\x1C\xDF\xC2\x3A\xA5"
+ "\x7C\xB5\x12\x89\xED\xBF\xB6\x09"
+ "\x13\x4F\x7D\x61\x3C\x5C\x27\xFC"
+ "\x5D\xE1\x4F\xA1\xEA\xB3\xCA\xB9"
+ "\xE6\xD0\x97",
+ .rlen = 147,
+ },
+};
+
+static struct cipher_testvec serpent_ctr_dec_tv_template[] = {
+ { /* Generated with Crypto++ */
+ .key = "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
+ "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
+ "\x27\x04\xE1\x27\x04\xE1\xBE\x9B"
+ "\x78\xBE\x9B\x78\x55\x32\x0F\x55",
+ .klen = 32,
+ .iv = "\xE2\x24\x89\xEE\x53\xB8\x1D\x5F"
+ "\xC4\x29\x8E\xF3\x35\x9A\xFF\x64",
+ .input = "\x84\x68\xEC\xF2\x1C\x88\x20\xCA"
+ "\x37\x69\xE3\x3A\x22\x85\x48\x46"
+ "\x70\xAA\x25\xB4\xCD\x8B\x04\x4E"
+ "\x8D\x15\x2B\x98\xDF\x7B\x6D\xB9"
+ "\xE0\x4A\x73\x00\x65\xB6\x1A\x0D"
+ "\x5C\x60\xDF\x34\xDC\x60\x4C\xDF"
+ "\xB5\x1F\x26\x8C\xDA\xC1\x11\xA8"
+ "\x80\xFA\x37\x7A\x89\xAA\xAE\x7B"
+ "\x92\x6E\xB9\xDC\xC9\x62\x4F\x88"
+ "\x0A\x5D\x97\x2F\x6B\xAC\x03\x7C"
+ "\x22\xF6\x55\x5A\xFA\x35\xA5\x17"
+ "\xA1\x5C\x5E\x2B\x63\x2D\xB9\x91"
+ "\x3E\x83\x26\x00\x4E\xD5\xBE\xCE"
+ "\x79\xC4\x3D\xFC\x70\xA0\xAD\x96"
+ "\xBA\x58\x2A\x1C\xDF\xC2\x3A\xA5"
+ "\x7C\xB5\x12\x89\xED\xBF\xB6\x09"
+ "\x13\x4F\x7D\x61\x3C\x5C\x27\xFC"
+ "\x5D\xE1\x4F\xA1\xEA\xB3\xCA\xB9",
+ .ilen = 144,
+ .result = "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
+ "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
+ "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
+ "\xAC\x20\xB7\x4E\xE5\x59\xF0\x87"
+ "\x1E\x92\x29\xC0\x34\xCB\x62\xF9"
+ "\x6D\x04\x9B\x0F\xA6\x3D\xD4\x48"
+ "\xDF\x76\x0D\x81\x18\xAF\x23\xBA"
+ "\x51\xE8\x5C\xF3\x8A\x21\x95\x2C"
+ "\xC3\x37\xCE\x65\xFC\x70\x07\x9E"
+ "\x12\xA9\x40\xD7\x4B\xE2\x79\x10"
+ "\x84\x1B\xB2\x26\xBD\x54\xEB\x5F"
+ "\xF6\x8D\x01\x98\x2F\xC6\x3A\xD1"
+ "\x68\xFF\x73\x0A\xA1\x15\xAC\x43"
+ "\xDA\x4E\xE5\x7C\x13\x87\x1E\xB5"
+ "\x29\xC0\x57\xEE\x62\xF9\x90\x04"
+ "\x9B\x32\xC9\x3D\xD4\x6B\x02\x76"
+ "\x0D\xA4\x18\xAF\x46\xDD\x51\xE8"
+ "\x7F\x16\x8A\x21\xB8\x2C\xC3\x5A",
+ .rlen = 144,
+ }, { /* Generated with Crypto++ */
+ .key = "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
+ "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
+ "\x27\x04\xE1\x27\x04\xE1\xBE\x9B"
+ "\x78\xBE\x9B\x78\x55\x32\x0F\x55",
+ .klen = 32,
+ .iv = "\xE2\x24\x89\xEE\x53\xB8\x1D\x5F"
+ "\xC4\x29\x8E\xF3\x35\x9A\xFF\x64",
+ .input = "\x84\x68\xEC\xF2\x1C\x88\x20\xCA"
+ "\x37\x69\xE3\x3A\x22\x85\x48\x46"
+ "\x70\xAA\x25\xB4\xCD\x8B\x04\x4E"
+ "\x8D\x15\x2B\x98\xDF\x7B\x6D\xB9"
+ "\xE0\x4A\x73\x00\x65\xB6\x1A\x0D"
+ "\x5C\x60\xDF\x34\xDC\x60\x4C\xDF"
+ "\xB5\x1F\x26\x8C\xDA\xC1\x11\xA8"
+ "\x80\xFA\x37\x7A\x89\xAA\xAE\x7B"
+ "\x92\x6E\xB9\xDC\xC9\x62\x4F\x88"
+ "\x0A\x5D\x97\x2F\x6B\xAC\x03\x7C"
+ "\x22\xF6\x55\x5A\xFA\x35\xA5\x17"
+ "\xA1\x5C\x5E\x2B\x63\x2D\xB9\x91"
+ "\x3E\x83\x26\x00\x4E\xD5\xBE\xCE"
+ "\x79\xC4\x3D\xFC\x70\xA0\xAD\x96"
+ "\xBA\x58\x2A\x1C\xDF\xC2\x3A\xA5"
+ "\x7C\xB5\x12\x89\xED\xBF\xB6\x09"
+ "\x13\x4F\x7D\x61\x3C\x5C\x27\xFC"
+ "\x5D\xE1\x4F\xA1\xEA\xB3\xCA\xB9"
+ "\xE6\xD0\x97",
+ .ilen = 147,
+ .result = "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
+ "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
+ "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
+ "\xAC\x20\xB7\x4E\xE5\x59\xF0\x87"
+ "\x1E\x92\x29\xC0\x34\xCB\x62\xF9"
+ "\x6D\x04\x9B\x0F\xA6\x3D\xD4\x48"
+ "\xDF\x76\x0D\x81\x18\xAF\x23\xBA"
+ "\x51\xE8\x5C\xF3\x8A\x21\x95\x2C"
+ "\xC3\x37\xCE\x65\xFC\x70\x07\x9E"
+ "\x12\xA9\x40\xD7\x4B\xE2\x79\x10"
+ "\x84\x1B\xB2\x26\xBD\x54\xEB\x5F"
+ "\xF6\x8D\x01\x98\x2F\xC6\x3A\xD1"
+ "\x68\xFF\x73\x0A\xA1\x15\xAC\x43"
+ "\xDA\x4E\xE5\x7C\x13\x87\x1E\xB5"
+ "\x29\xC0\x57\xEE\x62\xF9\x90\x04"
+ "\x9B\x32\xC9\x3D\xD4\x6B\x02\x76"
+ "\x0D\xA4\x18\xAF\x46\xDD\x51\xE8"
+ "\x7F\x16\x8A\x21\xB8\x2C\xC3\x5A"
+ "\xF1\x65\xFC",
+ .rlen = 147,
+ },
+};

/* Cast6 test vectors from RFC 2612 */
#define CAST6_ENC_TEST_VECTORS 3

2011-10-17 21:03:00

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 2/7] crypto: tcrypt: add test_acipher_speed

Add test_acipher_speed for testing async block ciphers.

Also include tests for aes/des/des3/ede as these appear to have ablk_cipher
implementations available.

Signed-off-by: Jussi Kivilinna <[email protected]>
---
crypto/tcrypt.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 250 insertions(+), 0 deletions(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index ac9e4d2..dd3a0f8 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -719,6 +719,207 @@ out:
crypto_free_ahash(tfm);
}

+static inline int do_one_acipher_op(struct ablkcipher_request *req, int ret)
+{
+ if (ret == -EINPROGRESS || ret == -EBUSY) {
+ struct tcrypt_result *tr = req->base.data;
+
+ ret = wait_for_completion_interruptible(&tr->completion);
+ if (!ret)
+ ret = tr->err;
+ INIT_COMPLETION(tr->completion);
+ }
+
+ return ret;
+}
+
+static int test_acipher_jiffies(struct ablkcipher_request *req, int enc,
+ int blen, int sec)
+{
+ unsigned long start, end;
+ int bcount;
+ int ret;
+
+ for (start = jiffies, end = start + sec * HZ, bcount = 0;
+ time_before(jiffies, end); bcount++) {
+ if (enc)
+ ret = do_one_acipher_op(req,
+ crypto_ablkcipher_encrypt(req));
+ else
+ ret = do_one_acipher_op(req,
+ crypto_ablkcipher_decrypt(req));
+
+ if (ret)
+ return ret;
+ }
+
+ pr_cont("%d operations in %d seconds (%ld bytes)\n",
+ bcount, sec, (long)bcount * blen);
+ return 0;
+}
+
+static int test_acipher_cycles(struct ablkcipher_request *req, int enc,
+ int blen)
+{
+ unsigned long cycles = 0;
+ int ret = 0;
+ int i;
+
+ /* Warm-up run. */
+ for (i = 0; i < 4; i++) {
+ if (enc)
+ ret = do_one_acipher_op(req,
+ crypto_ablkcipher_encrypt(req));
+ else
+ ret = do_one_acipher_op(req,
+ crypto_ablkcipher_decrypt(req));
+
+ if (ret)
+ goto out;
+ }
+
+ /* The real thing. */
+ for (i = 0; i < 8; i++) {
+ cycles_t start, end;
+
+ start = get_cycles();
+ if (enc)
+ ret = do_one_acipher_op(req,
+ crypto_ablkcipher_encrypt(req));
+ else
+ ret = do_one_acipher_op(req,
+ crypto_ablkcipher_decrypt(req));
+ end = get_cycles();
+
+ if (ret)
+ goto out;
+
+ cycles += end - start;
+ }
+
+out:
+ if (ret == 0)
+ pr_cont("1 operation in %lu cycles (%d bytes)\n",
+ (cycles + 4) / 8, blen);
+
+ return ret;
+}
+
+static void test_acipher_speed(const char *algo, int enc, unsigned int sec,
+ struct cipher_speed_template *template,
+ unsigned int tcount, u8 *keysize)
+{
+ unsigned int ret, i, j, iv_len;
+ struct tcrypt_result tresult;
+ const char *key;
+ char iv[128];
+ struct ablkcipher_request *req;
+ struct crypto_ablkcipher *tfm;
+ const char *e;
+ u32 *b_size;
+
+ if (enc == ENCRYPT)
+ e = "encryption";
+ else
+ e = "decryption";
+
+ pr_info("\ntesting speed of async %s %s\n", algo, e);
+
+ init_completion(&tresult.completion);
+
+ tfm = crypto_alloc_ablkcipher(algo, 0, 0);
+
+ if (IS_ERR(tfm)) {
+ pr_err("failed to load transform for %s: %ld\n", algo,
+ PTR_ERR(tfm));
+ return;
+ }
+
+ req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ pr_err("tcrypt: skcipher: Failed to allocate request for %s\n",
+ algo);
+ goto out;
+ }
+
+ ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ tcrypt_complete, &tresult);
+
+ i = 0;
+ do {
+ b_size = block_sizes;
+
+ do {
+ struct scatterlist sg[TVMEMSIZE];
+
+ if ((*keysize + *b_size) > TVMEMSIZE * PAGE_SIZE) {
+ pr_err("template (%u) too big for "
+ "tvmem (%lu)\n", *keysize + *b_size,
+ TVMEMSIZE * PAGE_SIZE);
+ goto out_free_req;
+ }
+
+ pr_info("test %u (%d bit key, %d byte blocks): ", i,
+ *keysize * 8, *b_size);
+
+ memset(tvmem[0], 0xff, PAGE_SIZE);
+
+ /* set key, plain text and IV */
+ key = tvmem[0];
+ for (j = 0; j < tcount; j++) {
+ if (template[j].klen == *keysize) {
+ key = template[j].key;
+ break;
+ }
+ }
+
+ crypto_ablkcipher_clear_flags(tfm, ~0);
+
+ ret = crypto_ablkcipher_setkey(tfm, key, *keysize);
+ if (ret) {
+ pr_err("setkey() failed flags=%x\n",
+ crypto_ablkcipher_get_flags(tfm));
+ goto out_free_req;
+ }
+
+ sg_init_table(sg, TVMEMSIZE);
+ sg_set_buf(sg, tvmem[0] + *keysize,
+ PAGE_SIZE - *keysize);
+ for (j = 1; j < TVMEMSIZE; j++) {
+ sg_set_buf(sg + j, tvmem[j], PAGE_SIZE);
+ memset(tvmem[j], 0xff, PAGE_SIZE);
+ }
+
+ iv_len = crypto_ablkcipher_ivsize(tfm);
+ if (iv_len)
+ memset(&iv, 0xff, iv_len);
+
+ ablkcipher_request_set_crypt(req, sg, sg, *b_size, iv);
+
+ if (sec)
+ ret = test_acipher_jiffies(req, enc,
+ *b_size, sec);
+ else
+ ret = test_acipher_cycles(req, enc,
+ *b_size);
+
+ if (ret) {
+ pr_err("%s() failed flags=%x\n", e,
+ crypto_ablkcipher_get_flags(tfm));
+ break;
+ }
+ b_size++;
+ i++;
+ } while (*b_size);
+ keysize++;
+ } while (*keysize);
+
+out_free_req:
+ ablkcipher_request_free(req);
+out:
+ crypto_free_ablkcipher(tfm);
+}
+
static void test_available(void)
{
char **name = check;
@@ -1243,6 +1444,55 @@ static int do_test(int m)
case 499:
break;

+ case 500:
+ test_acipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0,
+ speed_template_16_24_32);
+ test_acipher_speed("ecb(aes)", DECRYPT, sec, NULL, 0,
+ speed_template_16_24_32);
+ test_acipher_speed("cbc(aes)", ENCRYPT, sec, NULL, 0,
+ speed_template_16_24_32);
+ test_acipher_speed("cbc(aes)", DECRYPT, sec, NULL, 0,
+ speed_template_16_24_32);
+ test_acipher_speed("lrw(aes)", ENCRYPT, sec, NULL, 0,
+ speed_template_32_40_48);
+ test_acipher_speed("lrw(aes)", DECRYPT, sec, NULL, 0,
+ speed_template_32_40_48);
+ test_acipher_speed("xts(aes)", ENCRYPT, sec, NULL, 0,
+ speed_template_32_48_64);
+ test_acipher_speed("xts(aes)", DECRYPT, sec, NULL, 0,
+ speed_template_32_48_64);
+ test_acipher_speed("ctr(aes)", ENCRYPT, sec, NULL, 0,
+ speed_template_16_24_32);
+ test_acipher_speed("ctr(aes)", DECRYPT, sec, NULL, 0,
+ speed_template_16_24_32);
+ break;
+
+ case 501:
+ test_acipher_speed("ecb(des3_ede)", ENCRYPT, sec,
+ des3_speed_template, DES3_SPEED_VECTORS,
+ speed_template_24);
+ test_acipher_speed("ecb(des3_ede)", DECRYPT, sec,
+ des3_speed_template, DES3_SPEED_VECTORS,
+ speed_template_24);
+ test_acipher_speed("cbc(des3_ede)", ENCRYPT, sec,
+ des3_speed_template, DES3_SPEED_VECTORS,
+ speed_template_24);
+ test_acipher_speed("cbc(des3_ede)", DECRYPT, sec,
+ des3_speed_template, DES3_SPEED_VECTORS,
+ speed_template_24);
+ break;
+
+ case 502:
+ test_acipher_speed("ecb(des)", ENCRYPT, sec, NULL, 0,
+ speed_template_8);
+ test_acipher_speed("ecb(des)", DECRYPT, sec, NULL, 0,
+ speed_template_8);
+ test_acipher_speed("cbc(des)", ENCRYPT, sec, NULL, 0,
+ speed_template_8);
+ test_acipher_speed("cbc(des)", DECRYPT, sec, NULL, 0,
+ speed_template_8);
+ break;
+
case 1000:
test_available();
break;

2011-10-17 21:03:05

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 3/7] crypto: tcrypt: add serpent speed tests

Signed-off-by: Jussi Kivilinna <[email protected]>
---
crypto/tcrypt.c | 30 ++++++++++++++++++++++++++++++
1 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index dd3a0f8..5526065 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -1292,6 +1292,21 @@ static int do_test(int m)
speed_template_16_32);
break;

+ case 207:
+ test_cipher_speed("ecb(serpent)", ENCRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_cipher_speed("ecb(serpent)", DECRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_cipher_speed("cbc(serpent)", ENCRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_cipher_speed("cbc(serpent)", DECRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_cipher_speed("ctr(serpent)", ENCRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_cipher_speed("ctr(serpent)", DECRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ break;
+
case 300:
/* fall through */

@@ -1493,6 +1508,21 @@ static int do_test(int m)
speed_template_8);
break;

+ case 503:
+ test_acipher_speed("ecb(serpent)", ENCRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_acipher_speed("ecb(serpent)", DECRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_acipher_speed("cbc(serpent)", ENCRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_acipher_speed("cbc(serpent)", DECRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_acipher_speed("ctr(serpent)", ENCRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ test_acipher_speed("ctr(serpent)", DECRYPT, sec, NULL, 0,
+ speed_template_16_32);
+ break;
+
case 1000:
test_available();
break;

2011-10-17 21:03:10

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 4/7] crypto: serpent: export common functions for x86_64/i386-sse2 assembler implementations

Serpent SSE2 assembler implementations only provide 4-way/8-way parallel
functions and need setkey and one-block encrypt/decrypt functions.

CC: Dag Arne Osvik <[email protected]>
Signed-off-by: Jussi Kivilinna <[email protected]>
---
crypto/serpent.c | 41 ++++++++++++++++++++++-------------------
include/crypto/serpent.h | 25 +++++++++++++++++++++++++
2 files changed, 47 insertions(+), 19 deletions(-)
create mode 100644 include/crypto/serpent.h

diff --git a/crypto/serpent.c b/crypto/serpent.c
index b651a55..867ca93 100644
--- a/crypto/serpent.c
+++ b/crypto/serpent.c
@@ -21,16 +21,12 @@
#include <asm/byteorder.h>
#include <linux/crypto.h>
#include <linux/types.h>
+#include <crypto/serpent.h>

/* Key is padded to the maximum of 256 bits before round key generation.
* Any key length <= 256 bits (32 bytes) is allowed by the algorithm.
*/

-#define SERPENT_MIN_KEY_SIZE 0
-#define SERPENT_MAX_KEY_SIZE 32
-#define SERPENT_EXPKEY_WORDS 132
-#define SERPENT_BLOCK_SIZE 16
-
#define PHI 0x9e3779b9UL

#define keyiter(a,b,c,d,i,j) \
@@ -210,13 +206,7 @@
x1 ^= x4; x3 ^= x4; x4 &= x0; \
x4 ^= x2;

-struct serpent_ctx {
- u32 expkey[SERPENT_EXPKEY_WORDS];
-};
-
-
-static int serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
- unsigned int keylen)
+int serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen)
{
struct serpent_ctx *ctx = crypto_tfm_ctx(tfm);
u32 *k = ctx->expkey;
@@ -359,12 +349,11 @@ static int serpent_setkey(struct crypto_tfm *tfm, const u8 *key,

return 0;
}
+EXPORT_SYMBOL_GPL(serpent_setkey);

-static void serpent_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+void __serpent_encrypt(struct serpent_ctx *ctx, u8 *dst, const u8 *src)
{
- struct serpent_ctx *ctx = crypto_tfm_ctx(tfm);
- const u32
- *k = ctx->expkey;
+ const u32 *k = ctx->expkey;
const __le32 *s = (const __le32 *)src;
__le32 *d = (__le32 *)dst;
u32 r0, r1, r2, r3, r4;
@@ -418,12 +407,18 @@ static void serpent_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
d[2] = cpu_to_le32(r2);
d[3] = cpu_to_le32(r3);
}
+EXPORT_SYMBOL_GPL(__serpent_encrypt);

-static void serpent_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+static void serpent_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct serpent_ctx *ctx = crypto_tfm_ctx(tfm);
- const u32
- *k = ((struct serpent_ctx *)ctx)->expkey;
+
+ __serpent_encrypt(ctx, dst, src);
+}
+
+void __serpent_decrypt(struct serpent_ctx *ctx, u8 *dst, const u8 *src)
+{
+ const u32 *k = ctx->expkey;
const __le32 *s = (const __le32 *)src;
__le32 *d = (__le32 *)dst;
u32 r0, r1, r2, r3, r4;
@@ -472,6 +467,14 @@ static void serpent_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
d[2] = cpu_to_le32(r1);
d[3] = cpu_to_le32(r4);
}
+EXPORT_SYMBOL_GPL(__serpent_decrypt);
+
+static void serpent_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ __serpent_decrypt(ctx, dst, src);
+}

static struct crypto_alg serpent_alg = {
.cra_name = "serpent",
diff --git a/include/crypto/serpent.h b/include/crypto/serpent.h
new file mode 100644
index 0000000..40df885
--- /dev/null
+++ b/include/crypto/serpent.h
@@ -0,0 +1,25 @@
+/*
+ * Common values for serpent algorithms
+ */
+
+#ifndef _CRYPTO_SERPENT_H
+#define _CRYPTO_SERPENT_H
+
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+#define SERPENT_MIN_KEY_SIZE 0
+#define SERPENT_MAX_KEY_SIZE 32
+#define SERPENT_EXPKEY_WORDS 132
+#define SERPENT_BLOCK_SIZE 16
+
+struct serpent_ctx {
+ u32 expkey[SERPENT_EXPKEY_WORDS];
+};
+
+int serpent_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen);
+
+void __serpent_encrypt(struct serpent_ctx *ctx, u8 *dst, const u8 *src);
+void __serpent_decrypt(struct serpent_ctx *ctx, u8 *dst, const u8 *src);
+
+#endif

2011-10-17 21:03:15

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 5/7] crypto: serpent: rename module from serpent to serpent_generic

Rename module from serpent.ko to serpent_generic.ko and add module alias. This
is to allow assembler implementation to autoload on 'modprobe serpent'. Also
add driver_name and priority for serpent cipher.

CC: Dag Arne Osvik <[email protected]>
Signed-off-by: Jussi Kivilinna <[email protected]>

---

(I choose not to do serpent.c -> serpent_generic.c rename as such patch
triggers lots of checkpatch errors. I can provide rename+style fix
patches, if you want so.)
---
crypto/Makefile | 4 +++-
crypto/serpent.c | 3 +++
2 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/crypto/Makefile b/crypto/Makefile
index fa8cbbb..dac8979 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -64,7 +64,9 @@ obj-$(CONFIG_CRYPTO_BLOWFISH) += blowfish_generic.o
obj-$(CONFIG_CRYPTO_BLOWFISH_COMMON) += blowfish_common.o
obj-$(CONFIG_CRYPTO_TWOFISH) += twofish_generic.o
obj-$(CONFIG_CRYPTO_TWOFISH_COMMON) += twofish_common.o
-obj-$(CONFIG_CRYPTO_SERPENT) += serpent.o
+
+serpent_generic-y := serpent.o
+obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia.o
obj-$(CONFIG_CRYPTO_CAST5) += cast5.o
diff --git a/crypto/serpent.c b/crypto/serpent.c
index 867ca93..eb61630 100644
--- a/crypto/serpent.c
+++ b/crypto/serpent.c
@@ -478,6 +478,8 @@ static void serpent_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)

static struct crypto_alg serpent_alg = {
.cra_name = "serpent",
+ .cra_driver_name = "serpent-generic",
+ .cra_priority = 100,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_ctx),
@@ -588,3 +590,4 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Serpent and tnepres (kerneli compatible serpent reversed) Cipher Algorithm");
MODULE_AUTHOR("Dag Arne Osvik <[email protected]>");
MODULE_ALIAS("tnepres");
+MODULE_ALIAS("serpent");

2011-10-17 21:03:27

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 7/7] crypto: serpent: add 4-way parallel i586/SSE2 assembler implementation

Patch adds i586/SSE2 assembler implementation of serpent cipher. Assembler
functions crypt data in four block chunks.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

Intel Atom N270:

size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16 0.95x 1.12x 1.02x 1.07x 0.97x 0.98x
64 1.73x 1.82x 1.08x 1.82x 1.72x 1.73x
256 2.08x 2.00x 1.04x 2.07x 1.99x 2.01x
1024 2.28x 2.18x 1.05x 2.23x 2.17x 2.20x
8192 2.28x 2.13x 1.05x 2.23x 2.18x 2.20x

Full output:
http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-generic.txt
http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-sse2.txt

Userspace test results:

Encryption/decryption of sse2-i586 vs generic on Intel Atom N270:
encrypt: 2.35x
decrypt: 2.54x

Encryption/decryption of sse2-i586 vs generic on AMD Phenom II:
encrypt: 1.82x
decrypt: 2.51x

Encryption/decryption of sse2-i586 vs generic on Intel Xeon E7330:
encrypt: 2.99x
decrypt: 3.48x

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/Makefile | 2
arch/x86/crypto/serpent-sse2-i586-asm_32.S | 639 ++++++++++++++++++++++++++++
arch/x86/include/asm/serpent.h | 31 +
crypto/Kconfig | 17 +
4 files changed, 689 insertions(+), 0 deletions(-)
create mode 100644 arch/x86/crypto/serpent-sse2-i586-asm_32.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 12ebdbd..2b0b963 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,6 +5,7 @@
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o

obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -21,6 +22,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
+serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o

aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 0000000..6f2486d
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,639 @@
+/*
+ * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <[email protected]>
+ * 2003 Herbert Valerio Riedel <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-sse2-i586-asm_32.S"
+.text
+
+#define arg_ctx 4
+#define arg_dst 8
+#define arg_src 12
+#define arg_xor 16
+
+/**********************************************************************
+ 4-way SSE2 serpent
+ **********************************************************************/
+#define CTX %edx
+
+#define RA %xmm0
+#define RB %xmm1
+#define RC %xmm2
+#define RD %xmm3
+#define RE %xmm4
+
+#define RT0 %xmm5
+#define RT1 %xmm6
+
+#define RNOT %xmm7
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
+#define K(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, x4); \
+ get_key(i, 1, RT0); \
+ get_key(i, 2, RT1); \
+ pxor x4, x0; \
+ pxor RT0, x1; \
+ pxor RT1, x2; \
+ get_key(i, 3, x4); \
+ pxor x4, x3;
+
+#define LK(x0, x1, x2, x3, x4, i) \
+ movdqa x0, x4; \
+ pslld $13, x0; \
+ psrld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x0, x1; \
+ movdqa x2, x4; \
+ pslld $3, x2; \
+ psrld $(32 - 3), x4; \
+ por x4, x2; \
+ pxor x2, x1; \
+ movdqa x1, x4; \
+ pslld $1, x1; \
+ psrld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x2, x3; \
+ pxor x4, x3; \
+ movdqa x3, x4; \
+ pslld $7, x3; \
+ psrld $(32 - 7), x4; \
+ por x4, x3; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x3, x0; \
+ pxor x3, x2; \
+ pxor x4, x2; \
+ movdqa x0, x4; \
+ get_key(i, 1, RT0); \
+ pxor RT0, x1; \
+ get_key(i, 3, RT0); \
+ pxor RT0, x3; \
+ pslld $5, x0; \
+ psrld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ pslld $22, x2; \
+ psrld $(32 - 22), x4; \
+ por x4, x2; \
+ get_key(i, 0, RT0); \
+ pxor RT0, x0; \
+ get_key(i, 2, RT0); \
+ pxor RT0, x2;
+
+#define KL(x0, x1, x2, x3, x4, i) \
+ K(x0, x1, x2, x3, x4, i); \
+ movdqa x0, x4; \
+ psrld $5, x0; \
+ pslld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ psrld $22, x2; \
+ pslld $(32 - 22), x4; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pxor x3, x0; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x4, x2; \
+ movdqa x1, x4; \
+ psrld $1, x1; \
+ pslld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x3, x4; \
+ psrld $7, x3; \
+ pslld $(32 - 7), x4; \
+ por x4, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x4, x3; \
+ movdqa x0, x4; \
+ psrld $13, x0; \
+ pslld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x2, x1; \
+ pxor x2, x3; \
+ movdqa x2, x4; \
+ psrld $3, x2; \
+ pslld $(32 - 3), x4; \
+ por x4, x2;
+
+#define S0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2; \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1; \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4; \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1; \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0; \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2; \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2; \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1; \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3; \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0; \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+ movdqa x2, t3; \
+ movdqa x0, t1; \
+ unpcklps x3, t3; \
+ movdqa x0, t2; \
+ unpcklps x1, t1; \
+ unpckhps x1, t2; \
+ movdqa t3, x1; \
+ unpckhps x3, x2; \
+ movdqa t1, x0; \
+ movhlps t1, x1; \
+ movdqa t2, t1; \
+ movlhps t3, x0; \
+ movlhps x2, t1; \
+ movhlps t2, x2; \
+ movdqa x2, x3; \
+ movdqa t1, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_4way
+.type __serpent_enc_blk_4way,@function;
+
+__serpent_enc_blk_4way:
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ * arg_xor(%esp): bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 0);
+ S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
+ S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
+ S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
+ S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
+ S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
+ S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
+ S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
+ S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
+ S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
+ S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
+ S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
+ S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
+ S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
+ S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
+ S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
+ S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
+ S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
+ S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
+ S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
+ S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
+ S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
+ S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
+ S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
+ S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
+ S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
+ S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
+ S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
+ S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
+ S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
+ S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
+ S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
+ S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
+
+ movl arg_dst(%esp), %eax;
+
+ cmpb $0, arg_xor(%esp);
+ jnz __enc_xor4;
+
+ write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ ret;
+
+__enc_xor4:
+ xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ ret;
+
+.align 8
+.global serpent_dec_blk_4way
+.type serpent_dec_blk_4way,@function;
+
+serpent_dec_blk_4way:
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 32);
+ SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
+ SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
+ SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
+ SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
+ SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
+ SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
+ SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
+ SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
+ SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
+ SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
+ SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
+ SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
+ SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
+ SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
+ SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
+ SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
+ SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
+ SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
+ SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
+ SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
+ SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
+ SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
+ SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
+ SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
+ SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
+ SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
+ SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
+ SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
+ SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
+ SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
+ SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
+ SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
+
+ movl arg_dst(%esp), %eax;
+ write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
+
+ ret;
+
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
index ec463d7..af68733 100644
--- a/arch/x86/include/asm/serpent.h
+++ b/arch/x86/include/asm/serpent.h
@@ -4,6 +4,35 @@
#include <linux/crypto.h>
#include <crypto/serpent.h>

+#ifdef CONFIG_X86_32
+
+#define SERPENT_PARALLEL_BLOCKS 4
+
+asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ serpent_dec_blk_4way(ctx, dst, src);
+}
+
+#else
+
#define SERPENT_PARALLEL_BLOCKS 8

asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
@@ -31,3 +60,5 @@ static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,

#endif

+#endif
+
diff --git a/crypto/Kconfig b/crypto/Kconfig
index b9b3315..94a4ce8 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -775,6 +775,23 @@ config CRYPTO_SERPENT_SSE2_X86_64
See also:
<http://www.cl.cam.ac.uk/~rja14/serpent.html>

+config CRYPTO_SERPENT_SSE2_586
+ tristate "Serpent cipher algorithm (i586/SSE2)"
+ depends on X86 && !64BIT
+ select CRYPTO_ALGAPI
+ select CRYPTO_SERPENT
+ help
+ Serpent cipher algorithm, by Anderson, Biham & Knudsen.
+
+ Keys are allowed to be from 0 to 256 bits in length, in steps
+ of 8 bits.
+
+ This module provides Serpent cipher algorithm that processes four
+ blocks parallel using SSE2 instruction set.
+
+ See also:
+ <http://www.cl.cam.ac.uk/~rja14/serpent.html>
+
config CRYPTO_TEA
tristate "TEA, XTEA and XETA cipher algorithms"
select CRYPTO_ALGAPI

2011-10-17 21:03:22

by Jussi Kivilinna

[permalink] [raw]
Subject: [PATCH 6/7] crypto: serpent: add 8-way parallel x86_64/SSE2 assembler implementation

Patch adds x86_64/SSE2 assembler implementation of serpent cipher. Assembler
functions crypt data in eigth block chunks (two 4 block chunk SSE2 operations
in parallel to improve performance on out-of-order CPUs). Glue code is based
on one from AES-NI implementation, so requests from irq context are redirected
to cryptd.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

AMD Phenom II 1055T (fam:16, model:10):

size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B 1.03x 1.01x 1.03x 1.05x 1.00x 0.99x
64B 1.00x 1.01x 1.02x 1.04x 1.02x 1.01x
256B 2.34x 2.41x 0.99x 2.43x 2.39x 2.40x
1024B 2.51x 2.57x 1.00x 2.59x 2.56x 2.56x
8192B 2.50x 2.54x 1.00x 2.55x 2.57x 2.57x

Intel Celeron T1600 (fam:6, model:15, step:13):

size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B 0.97x 0.97x 1.01x 1.01x 1.01x 1.02x
64B 1.00x 1.00x 1.00x 1.02x 1.01x 1.01x
256B 3.41x 3.35x 1.00x 3.39x 3.42x 3.44x
1024B 3.75x 3.72x 0.99x 3.74x 3.75x 3.75x
8192B 3.70x 3.68x 0.99x 3.68x 3.69x 3.69x

Full output:
http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-generic.txt
http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-sse2.txt
http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-generic.txt
http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-sse2.txt

Signed-off-by: Jussi Kivilinna <[email protected]>
---
arch/x86/crypto/Makefile | 2
arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 761 ++++++++++++++++++++++++++
arch/x86/crypto/serpent_sse2_glue.c | 719 +++++++++++++++++++++++++
arch/x86/include/asm/serpent.h | 33 +
crypto/Kconfig | 17 +
crypto/testmgr.c | 60 ++
6 files changed, 1592 insertions(+), 0 deletions(-)
create mode 100644 arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
create mode 100644 arch/x86/crypto/serpent_sse2_glue.c
create mode 100644 arch/x86/include/asm/serpent.h

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3537d4b..12ebdbd 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o

@@ -26,6 +27,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o

aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o

diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 0000000..7f24a15
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,761 @@
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <[email protected]>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <[email protected]>
+ * 2003 Herbert Valerio Riedel <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-sse2-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+ 8-way SSE2 serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define RA2 %xmm5
+#define RB2 %xmm6
+#define RC2 %xmm7
+#define RD2 %xmm8
+#define RE2 %xmm9
+
+#define RNOT %xmm10
+
+#define RK0 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
+#define RK3 %xmm14
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ pxor RK0, x0 ## 1; \
+ pxor RK1, x1 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK1, x1 ## 2; \
+ pxor RK2, x2 ## 2; \
+ pxor RK3, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $13, x0 ## 1; \
+ psrld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $3, x2 ## 1; \
+ psrld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $13, x0 ## 2; \
+ psrld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $3, x2 ## 2; \
+ psrld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $1, x1 ## 1; \
+ psrld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x3 ## 1, x4 ## 1; \
+ get_key(i, 1, RK1); \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $1, x1 ## 2; \
+ psrld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x3 ## 2, x4 ## 2; \
+ get_key(i, 3, RK3); \
+ pslld $7, x3 ## 1; \
+ psrld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ pslld $7, x3 ## 2; \
+ psrld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ pxor RK1, x1 ## 1; \
+ pxor RK3, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $5, x0 ## 1; \
+ psrld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $22, x2 ## 1; \
+ psrld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK1, x1 ## 2; \
+ pxor RK3, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $5, x0 ## 2; \
+ psrld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $22, x2 ## 2; \
+ psrld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ psrld $5, x0 ## 1; \
+ pslld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $22, x2 ## 1; \
+ pslld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $5, x0 ## 2; \
+ pslld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor RK3, x3 ## 2; \
+ pxor RK1, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $22, x2 ## 2; \
+ pslld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x3 ## 1, x0 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ psrld $1, x1 ## 1; \
+ pslld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ pxor x3 ## 2, x0 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ psrld $1, x1 ## 2; \
+ pslld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x3 ## 1, x4 ## 1; \
+ psrld $7, x3 ## 1; \
+ pslld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ movdqa x3 ## 2, x4 ## 2; \
+ psrld $7, x3 ## 2; \
+ pslld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $13, x0 ## 1; \
+ pslld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $3, x2 ## 1; \
+ pslld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ psrld $13, x0 ## 2; \
+ pslld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $3, x2 ## 2; \
+ pslld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 3, RK3); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+ movdqa x2, t3; \
+ movdqa x0, t1; \
+ unpcklps x3, t3; \
+ movdqa x0, t2; \
+ unpcklps x1, t1; \
+ unpckhps x1, t2; \
+ movdqa t3, x1; \
+ unpckhps x3, x2; \
+ movdqa t1, x0; \
+ movhlps t1, x1; \
+ movdqa t2, t1; \
+ movlhps t3, x0; \
+ movlhps x2, t1; \
+ movhlps t2, x2; \
+ movdqa x2, x3; \
+ movdqa t1, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_8way
+.type __serpent_enc_blk_8way,@function;
+
+__serpent_enc_blk_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ leaq (4*4*4)(%rsi), %rax;
+
+ testb %cl, %cl;
+ jnz __enc_xor8;
+
+ write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+
+__enc_xor8:
+ xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+
+.align 8
+.global serpent_dec_blk_8way
+.type serpent_dec_blk_8way,@function;
+
+serpent_dec_blk_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ leaq (4*4*4)(%rsi), %rax;
+ write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 0000000..5cf17e2
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,719 @@
+/*
+ * Glue Code for SSE2 assembler versions of Serpent Cipher
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <[email protected]>
+ *
+ * Glue code based on aesni-intel_glue.c by:
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <[email protected]>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <[email protected]>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/serpent.h>
+#include <crypto/cryptd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ctr.h>
+#include <asm/i387.h>
+#include <asm/serpent.h>
+#include <crypto/scatterwalk.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+
+struct async_serpent_ctx {
+ struct cryptd_ablkcipher *cryptd_tfm;
+};
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+ if (fpu_enabled)
+ return true;
+
+ /* SSE2 is only used when chunk to be processed is large enough, so
+ * do not enable FPU until it is necessary.
+ */
+ if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
+ return false;
+
+ kernel_fpu_begin();
+ return true;
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+ if (fpu_enabled)
+ kernel_fpu_end();
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+ bool enc)
+{
+ bool fpu_enabled = false;
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ unsigned int nbytes;
+ int err;
+
+ err = blkcipher_walk_virt(desc, walk);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ while ((nbytes = walk->nbytes)) {
+ u8 *wsrc = walk->src.virt.addr;
+ u8 *wdst = walk->dst.virt.addr;
+
+ fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+ do {
+ if (enc)
+ serpent_enc_blk_xway(ctx, wdst, wsrc);
+ else
+ serpent_dec_blk_xway(ctx, wdst, wsrc);
+
+ wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
+ wdst += bsize * SERPENT_PARALLEL_BLOCKS;
+ nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+ } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ if (enc)
+ __serpent_encrypt(ctx, wdst, wsrc);
+ else
+ __serpent_decrypt(ctx, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = blkcipher_walk_done(desc, walk, nbytes);
+ }
+
+ serpent_fpu_end(fpu_enabled);
+ return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, false);
+}
+
+static struct crypto_alg blk_ecb_alg = {
+ .cra_name = "__ecb-serpent-sse2",
+ .cra_driver_name = "__driver-ecb-serpent-sse2",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ },
+ },
+};
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ u128 *iv = (u128 *)walk->iv;
+
+ do {
+ u128_xor(dst, src, iv);
+ __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
+ return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_encrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+ u128 last_iv;
+ int i;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+ do {
+ nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
+ src -= SERPENT_PARALLEL_BLOCKS - 1;
+ dst -= SERPENT_PARALLEL_BLOCKS - 1;
+
+ for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+ ivs[i] = src[i];
+
+ serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+ for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+ u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ u128_xor(dst, dst, src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ u128_xor(dst, dst, src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ u128_xor(dst, dst, (u128 *)walk->iv);
+ *(u128 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ bool fpu_enabled = false;
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ while ((nbytes = walk.nbytes)) {
+ fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+ nbytes = __cbc_decrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ serpent_fpu_end(fpu_enabled);
+ return err;
+}
+
+static struct crypto_alg blk_cbc_alg = {
+ .cra_name = "__cbc-serpent-sse2",
+ .cra_driver_name = "__driver-cbc-serpent-sse2",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+ },
+};
+
+static inline void u128_to_be128(be128 *dst, const u128 *src)
+{
+ dst->a = cpu_to_be64(src->a);
+ dst->b = cpu_to_be64(src->b);
+}
+
+static inline void be128_to_u128(u128 *dst, const be128 *src)
+{
+ dst->a = be64_to_cpu(src->a);
+ dst->b = be64_to_cpu(src->b);
+}
+
+static inline void u128_inc(u128 *i)
+{
+ i->b++;
+ if (!i->b)
+ i->a++;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ u8 *ctrblk = walk->iv;
+ u8 keystream[SERPENT_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ __serpent_encrypt(ctx, keystream, ctrblk);
+ crypto_xor(keystream, src, nbytes);
+ memcpy(dst, keystream, nbytes);
+
+ crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ u128 ctrblk;
+ be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
+ int i;
+
+ be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+ do {
+ /* create ctrblks for parallel encrypt */
+ for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+ if (dst != src)
+ dst[i] = src[i];
+
+ u128_to_be128(&ctrblocks[i], &ctrblk);
+ u128_inc(&ctrblk);
+ }
+
+ serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
+ (u8 *)ctrblocks);
+
+ src += SERPENT_PARALLEL_BLOCKS;
+ dst += SERPENT_PARALLEL_BLOCKS;
+ nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+ } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ if (dst != src)
+ *dst = *src;
+
+ u128_to_be128(&ctrblocks[0], &ctrblk);
+ u128_inc(&ctrblk);
+
+ __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+ u128_xor(dst, dst, (u128 *)ctrblocks);
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ u128_to_be128((be128 *)walk->iv, &ctrblk);
+ return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ bool fpu_enabled = false;
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
+ fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+ nbytes = __ctr_crypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ serpent_fpu_end(fpu_enabled);
+
+ if (walk.nbytes) {
+ ctr_crypt_final(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+
+ return err;
+}
+
+static struct crypto_alg blk_ctr_alg = {
+ .cra_name = "__ctr-serpent-sse2",
+ .cra_driver_name = "__driver-ctr-serpent-sse2",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+ },
+};
+
+static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+ struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+ int err;
+
+ crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+ & CRYPTO_TFM_REQ_MASK);
+ err = crypto_ablkcipher_setkey(child, key, key_len);
+ crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+ & CRYPTO_TFM_RES_MASK);
+ return err;
+}
+
+static int __ablk_encrypt(struct ablkcipher_request *req)
+{
+ struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+ struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+ struct blkcipher_desc desc;
+
+ desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+ desc.info = req->info;
+ desc.flags = 0;
+
+ return crypto_blkcipher_crt(desc.tfm)->encrypt(
+ &desc, req->dst, req->src, req->nbytes);
+}
+
+static int ablk_encrypt(struct ablkcipher_request *req)
+{
+ struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+ struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+ if (!irq_fpu_usable()) {
+ struct ablkcipher_request *cryptd_req =
+ ablkcipher_request_ctx(req);
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+ return crypto_ablkcipher_encrypt(cryptd_req);
+ } else {
+ return __ablk_encrypt(req);
+ }
+}
+
+static int ablk_decrypt(struct ablkcipher_request *req)
+{
+ struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+ struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+ if (!irq_fpu_usable()) {
+ struct ablkcipher_request *cryptd_req =
+ ablkcipher_request_ctx(req);
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+ return crypto_ablkcipher_decrypt(cryptd_req);
+ } else {
+ struct blkcipher_desc desc;
+
+ desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+ desc.info = req->info;
+ desc.flags = 0;
+
+ return crypto_blkcipher_crt(desc.tfm)->decrypt(
+ &desc, req->dst, req->src, req->nbytes);
+ }
+}
+
+static void ablk_exit(struct crypto_tfm *tfm)
+{
+ struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_free_ablkcipher(ctx->cryptd_tfm);
+}
+
+static void ablk_init_common(struct crypto_tfm *tfm,
+ struct cryptd_ablkcipher *cryptd_tfm)
+{
+ struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ ctx->cryptd_tfm = cryptd_tfm;
+ tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+ crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+}
+
+static int ablk_ecb_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_ecb_alg = {
+ .cra_name = "ecb(serpent)",
+ .cra_driver_name = "ecb-serpent-sse2",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
+ .cra_init = ablk_ecb_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ },
+ },
+};
+
+static int ablk_cbc_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_cbc_alg = {
+ .cra_name = "cbc(serpent)",
+ .cra_driver_name = "cbc-serpent-sse2",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
+ .cra_init = ablk_cbc_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = __ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ },
+ },
+};
+
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_ctr_alg = {
+ .cra_name = "ctr(serpent)",
+ .cra_driver_name = "ctr-serpent-sse2",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct async_serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+ .cra_init = ablk_ctr_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_encrypt,
+ .geniv = "chainiv",
+ },
+ },
+};
+
+static int __init serpent_sse2_init(void)
+{
+ int err;
+
+ if (!cpu_has_xmm2) {
+ printk(KERN_INFO "SSE2 instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&blk_ecb_alg);
+ if (err)
+ goto blk_ecb_err;
+ err = crypto_register_alg(&blk_cbc_alg);
+ if (err)
+ goto blk_cbc_err;
+ err = crypto_register_alg(&blk_ctr_alg);
+ if (err)
+ goto blk_ctr_err;
+ err = crypto_register_alg(&ablk_ecb_alg);
+ if (err)
+ goto ablk_ecb_err;
+ err = crypto_register_alg(&ablk_cbc_alg);
+ if (err)
+ goto ablk_cbc_err;
+ err = crypto_register_alg(&ablk_ctr_alg);
+ if (err)
+ goto ablk_ctr_err;
+ return err;
+
+ablk_ctr_err:
+ crypto_unregister_alg(&ablk_cbc_alg);
+ablk_cbc_err:
+ crypto_unregister_alg(&ablk_ecb_alg);
+ablk_ecb_err:
+ crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
+ crypto_unregister_alg(&blk_cbc_alg);
+blk_cbc_err:
+ crypto_unregister_alg(&blk_ecb_alg);
+blk_ecb_err:
+ return err;
+}
+
+static void __exit serpent_sse2_exit(void)
+{
+ crypto_unregister_alg(&ablk_ctr_alg);
+ crypto_unregister_alg(&ablk_cbc_alg);
+ crypto_unregister_alg(&ablk_ecb_alg);
+ crypto_unregister_alg(&blk_ctr_alg);
+ crypto_unregister_alg(&blk_cbc_alg);
+ crypto_unregister_alg(&blk_ecb_alg);
+}
+
+module_init(serpent_sse2_init);
+module_exit(serpent_sse2_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("serpent");
+
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
new file mode 100644
index 0000000..ec463d7
--- /dev/null
+++ b/arch/x86/include/asm/serpent.h
@@ -0,0 +1,33 @@
+#ifndef ASM_X86_SERPENT_H
+#define ASM_X86_SERPENT_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ serpent_dec_blk_8way(ctx, dst, src);
+}
+
+#endif
+
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 404a846..b9b3315 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -758,6 +758,23 @@ config CRYPTO_SERPENT
See also:
<http://www.cl.cam.ac.uk/~rja14/serpent.html>

+config CRYPTO_SERPENT_SSE2_X86_64
+ tristate "Serpent cipher algorithm (x86_64/SSE2)"
+ depends on X86 && 64BIT
+ select CRYPTO_ALGAPI
+ select CRYPTO_SERPENT
+ help
+ Serpent cipher algorithm, by Anderson, Biham & Knudsen.
+
+ Keys are allowed to be from 0 to 256 bits in length, in steps
+ of 8 bits.
+
+ This module provides Serpent cipher algorithm that processes eigth
+ blocks parallel using SSE2 instruction set.
+
+ See also:
+ <http://www.cl.cam.ac.uk/~rja14/serpent.html>
+
config CRYPTO_TEA
tristate "TEA, XTEA and XETA cipher algorithms"
select CRYPTO_ALGAPI
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 49436b9..2018379 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1534,6 +1534,21 @@ static int alg_test_null(const struct alg_test_desc *desc,
/* Please keep this list sorted by algorithm name. */
static const struct alg_test_desc alg_test_descs[] = {
{
+ .alg = "__cbc-serpent-sse2",
+ .test = alg_test_null,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = NULL,
+ .count = 0
+ },
+ .dec = {
+ .vecs = NULL,
+ .count = 0
+ }
+ }
+ }
+ }, {
.alg = "__driver-cbc-aes-aesni",
.test = alg_test_null,
.suite = {
@@ -1549,6 +1564,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+ .alg = "__driver-cbc-serpent-sse2",
+ .test = alg_test_null,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = NULL,
+ .count = 0
+ },
+ .dec = {
+ .vecs = NULL,
+ .count = 0
+ }
+ }
+ }
+ }, {
.alg = "__driver-ecb-aes-aesni",
.test = alg_test_null,
.suite = {
@@ -1564,6 +1594,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+ .alg = "__driver-ecb-serpent-sse2",
+ .test = alg_test_null,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = NULL,
+ .count = 0
+ },
+ .dec = {
+ .vecs = NULL,
+ .count = 0
+ }
+ }
+ }
+ }, {
.alg = "__ghash-pclmulqdqni",
.test = alg_test_null,
.suite = {
@@ -1746,6 +1791,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}, {
+ .alg = "cryptd(__driver-ecb-serpent-sse2)",
+ .test = alg_test_null,
+ .suite = {
+ .cipher = {
+ .enc = {
+ .vecs = NULL,
+ .count = 0
+ },
+ .dec = {
+ .vecs = NULL,
+ .count = 0
+ }
+ }
+ }
+ }, {
.alg = "cryptd(__ghash-pclmulqdqni)",
.test = alg_test_null,
.suite = {

2011-11-09 03:44:46

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH 6/7] crypto: serpent: add 8-way parallel x86_64/SSE2 assembler implementation

On Tue, Oct 18, 2011 at 12:03:18AM +0300, Jussi Kivilinna wrote:
> Patch adds x86_64/SSE2 assembler implementation of serpent cipher. Assembler
> functions crypt data in eigth block chunks (two 4 block chunk SSE2 operations
> in parallel to improve performance on out-of-order CPUs). Glue code is based
> on one from AES-NI implementation, so requests from irq context are redirected
> to cryptd.

This doesn't build for me:

CC [M] arch/x86/crypto/serpent_sse2_glue.o
arch/x86/crypto/serpent_sse2_glue.c:154: error: ‘THIS_MODULE’ undeclared here (not in a function)
arch/x86/crypto/serpent_sse2_glue.c:716: error: expected declaration specifiers or ‘...’ before string constant
arch/x86/crypto/serpent_sse2_glue.c:716: warning: data definition has no type or storage class
arch/x86/crypto/serpent_sse2_glue.c:716: warning: type defaults to ‘int’ in declaration of ‘MODULE_DESCRIPTION’
arch/x86/crypto/serpent_sse2_glue.c:716: warning: function declaration isn’t a prototype
arch/x86/crypto/serpent_sse2_glue.c:717: error: expected declaration specifiers or ‘...’ before string constant
arch/x86/crypto/serpent_sse2_glue.c:717: warning: data definition has no type or storage class
arch/x86/crypto/serpent_sse2_glue.c:717: warning: type defaults to ‘int’ in declaration of ‘MODULE_LICENSE’
arch/x86/crypto/serpent_sse2_glue.c:717: warning: function declaration isn’t a prototype
arch/x86/crypto/serpent_sse2_glue.c:718: error: expected declaration specifiers or ‘...’ before string constant
arch/x86/crypto/serpent_sse2_glue.c:718: warning: data definition has no type or storage class
arch/x86/crypto/serpent_sse2_glue.c:718: warning: type defaults to ‘int’ in declaration of ‘MODULE_ALIAS’
arch/x86/crypto/serpent_sse2_glue.c:718: warning: function declaration isn’t a prototype
make[2]: *** [arch/x86/crypto/serpent_sse2_glue.o] Error 1
make[1]: *** [arch/x86/crypto] Error 2
make: *** [arch/x86] Error 2

So I've applied patches 1-5, please resubmit 6-7 after you've
fixed them up.

Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt