2009-09-16 01:35:45

by Huang, Ying

Subject: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
carry-less multiplication. More information about PCLMULQDQ can be
found at:

http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
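
(For intuition: carry-less multiplication is binary long multiplication
with XOR in place of addition, so e.g. 0b11 x 0b11 = 0b101 rather than
the usual 0b1001; GHASH is built from such multiplications in GF(2^128).)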

Because PCLMULQDQ changes the XMM state, its usage must be enclosed
within kernel_fpu_begin/end, which can be used only in process context.
The acceleration is therefore implemented as a crypto_ahash: requests
issued in soft IRQ context are deferred to the cryptd kernel thread.
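
A minimal sketch of that usage pattern (illustration only, using the
update helper this patch adds):

	kernel_fpu_begin();
	clmul_ghash_update(dst, src, srclen, &ctx->shash); /* PCLMULQDQ inside */
	kernel_fpu_end();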

v4:
- Fix some style issues.

v3:
- Revise GHASH implementation; performance increases by about 2x.

Signed-off-by: Huang Ying <[email protected]>
---
arch/x86/crypto/Makefile | 3
arch/x86/crypto/ghash-clmulni-intel_asm.S | 157 +++++++++++++
arch/x86/crypto/ghash-clmulni-intel_glue.c | 333 +++++++++++++++++++++++++++++
arch/x86/include/asm/cpufeature.h | 1
crypto/Kconfig | 8
crypto/cryptd.c | 7
include/crypto/cryptd.h | 1
7 files changed, 510 insertions(+)
create mode 100644 arch/x86/crypto/ghash-clmulni-intel_asm.S
create mode 100644 arch/x86/crypto/ghash-clmulni-intel_glue.c

--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o

obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o

@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_6
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o

aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains accelerated part of ghash
+ * implementation. More information about PCLMULQDQ can be found at:
+ *
+ * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <[email protected]>
+ * Vinodh Gopal
+ * Erdinc Ozturk
+ * Deniz Karakoyunlu
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+.align 16
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+.Lpoly:
+ .octa 0xc2000000000000000000000000000001
+.Ltwo_one:
+ .octa 0x00000001000000000000000000000001
+
+#define DATA %xmm0
+#define SHASH %xmm1
+#define T1 %xmm2
+#define T2 %xmm3
+#define T3 %xmm4
+#define BSWAP %xmm5
+#define IN1 %xmm6
+
+.text
+
+/*
+ * __clmul_gf128mul_ble: internal ABI
+ * input:
+ * DATA: operand1
+ * SHASH: operand2, hash_key << 1 mod poly
+ * output:
+ * DATA: operand1 * operand2 mod poly
+ * changed:
+ * T1
+ * T2
+ * T3
+ */
+__clmul_gf128mul_ble:
+ movaps DATA, T1
+ pshufd $0b01001110, DATA, T2
+ pshufd $0b01001110, SHASH, T3
+ pxor DATA, T2
+ pxor SHASH, T3
+
+ # pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0
+ .byte 0x66, 0x0f, 0x3a, 0x44, 0xc1, 0x00
+ # pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1
+ .byte 0x66, 0x0f, 0x3a, 0x44, 0xd1, 0x11
+ # pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0)
+ .byte 0x66, 0x0f, 0x3a, 0x44, 0xdc, 0x00
+ pxor DATA, T2
+ pxor T1, T2 # T2 = a0 * b1 + a1 * b0
+
+ movaps T2, T3
+ pslldq $8, T3
+ psrldq $8, T2
+ pxor T3, DATA
+ pxor T2, T1 # <T1:DATA> is result of
+ # carry-less multiplication
+
+ # first phase of the reduction
+ movaps DATA, T3
+ psllq $1, T3
+ pxor DATA, T3
+ psllq $5, T3
+ pxor DATA, T3
+ psllq $57, T3
+ movaps T3, T2
+ pslldq $8, T2
+ psrldq $8, T3
+ pxor T2, DATA
+ pxor T3, T1
+
+ # second phase of the reduction
+ movaps DATA, T2
+ psrlq $5, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor T2, T1
+ pxor T1, DATA
+ ret
+
+/* void clmul_ghash_mul(char *dst, const be128 *shash) */
+ENTRY(clmul_ghash_mul)
+ movups (%rdi), DATA
+ movups (%rsi), SHASH
+ movaps .Lbswap_mask, BSWAP
+ pshufb BSWAP, DATA
+ call __clmul_gf128mul_ble
+ pshufb BSWAP, DATA
+ movups DATA, (%rdi)
+ ret
+
+/*
+ * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ * const be128 *shash);
+ */
+ENTRY(clmul_ghash_update)
+ cmp $16, %rdx
+ jb .Lupdate_just_ret # check length
+ movaps .Lbswap_mask, BSWAP
+ movups (%rdi), DATA
+ movups (%rcx), SHASH
+ pshufb BSWAP, DATA
+.align 4
+.Lupdate_loop:
+ movups (%rsi), IN1
+ pshufb BSWAP, IN1
+ pxor IN1, DATA
+ call __clmul_gf128mul_ble
+ sub $16, %rdx
+ add $16, %rsi
+ cmp $16, %rdx
+ jge .Lupdate_loop
+ pshufb BSWAP, DATA
+ movups DATA, (%rdi)
+.Lupdate_just_ret:
+ ret
+
+/*
+ * void clmul_ghash_setkey(be128 *shash, const u8 *key);
+ *
+ * Calculate hash_key << 1 mod poly
+ */
+ENTRY(clmul_ghash_setkey)
+ movaps .Lbswap_mask, BSWAP
+ movups (%rsi), %xmm0
+ pshufb BSWAP, %xmm0
+ movaps %xmm0, %xmm1
+ psllq $1, %xmm0
+ psrlq $63, %xmm1
+ movaps %xmm1, %xmm2
+ pslldq $8, %xmm1
+ psrldq $8, %xmm2
+ por %xmm1, %xmm0
+ # reduction
+ pshufd $0b00100100, %xmm2, %xmm1
+ pcmpeqd .Ltwo_one, %xmm1
+ pand .Lpoly, %xmm1
+ pxor %xmm1, %xmm0
+ movups %xmm0, (%rdi)
+ ret
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,333 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains glue code.
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
+#include <crypto/cryptd.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/hash.h>
+#include <asm/i387.h>
+
+#define GHASH_BLOCK_SIZE 16
+#define GHASH_DIGEST_SIZE 16
+
+void clmul_ghash_mul(char *dst, const be128 *shash);
+
+void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ const be128 *shash);
+
+void clmul_ghash_setkey(be128 *shash, const u8 *key);
+
+struct ghash_async_ctx {
+ struct cryptd_ahash *cryptd_tfm;
+};
+
+struct ghash_ctx {
+ be128 shash;
+};
+
+struct ghash_desc_ctx {
+ u8 buffer[GHASH_BLOCK_SIZE];
+ u32 bytes;
+};
+
+static int ghash_init(struct shash_desc *desc)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memset(dctx, 0, sizeof(*dctx));
+
+ return 0;
+}
+
+static int ghash_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+
+ if (keylen != GHASH_BLOCK_SIZE) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ clmul_ghash_setkey(&ctx->shash, key);
+
+ return 0;
+}
+
+static int ghash_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ u8 *dst = dctx->buffer;
+
+ kernel_fpu_begin();
+ if (dctx->bytes) {
+ int n = min(srclen, dctx->bytes);
+ u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+ dctx->bytes -= n;
+ srclen -= n;
+
+ while (n--)
+ *pos++ ^= *src++;
+
+ if (!dctx->bytes)
+ clmul_ghash_mul(dst, &ctx->shash);
+ }
+
+ clmul_ghash_update(dst, src, srclen, &ctx->shash);
+ kernel_fpu_end();
+
+ if (srclen & 0xf) {
+ src += srclen - (srclen & 0xf);
+ srclen &= 0xf;
+ dctx->bytes = GHASH_BLOCK_SIZE - srclen;
+ while (srclen--)
+ *dst++ ^= *src++;
+ }
+
+ return 0;
+}
+
+static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
+{
+ u8 *dst = dctx->buffer;
+
+ if (dctx->bytes) {
+ u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+ while (dctx->bytes--)
+ *tmp++ ^= 0;
+
+ kernel_fpu_begin();
+ clmul_ghash_mul(dst, &ctx->shash);
+ kernel_fpu_end();
+ }
+
+ dctx->bytes = 0;
+}
+
+static int ghash_final(struct shash_desc *desc, u8 *dst)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ u8 *buf = dctx->buffer;
+
+ ghash_flush(ctx, dctx);
+ memcpy(dst, buf, GHASH_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg ghash_alg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .init = ghash_init,
+ .update = ghash_update,
+ .final = ghash_final,
+ .setkey = ghash_setkey,
+ .descsize = sizeof(struct ghash_desc_ctx),
+ .base = {
+ .cra_name = "__ghash",
+ .cra_driver_name = "__ghash-pclmulqdqni",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct ghash_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list),
+ },
+};
+
+static int ghash_async_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!irq_fpu_usable()) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_init(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+ desc->tfm = child;
+ desc->flags = req->base.flags;
+ return crypto_shash_init(desc);
+ }
+}
+
+static int ghash_async_update(struct ahash_request *req)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+
+ if (!irq_fpu_usable()) {
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_update(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ return shash_ahash_update(req, desc);
+ }
+}
+
+static int ghash_async_final(struct ahash_request *req)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+
+ if (!irq_fpu_usable()) {
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_final(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ return crypto_shash_final(desc, req->result);
+ }
+}
+
+static int ghash_async_digest(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!irq_fpu_usable()) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_digest(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+ desc->tfm = child;
+ desc->flags = req->base.flags;
+ return shash_ahash_digest(req, desc);
+ }
+}
+
+static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct crypto_ahash *child = &ctx->cryptd_tfm->base;
+ int err;
+
+ crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
+ & CRYPTO_TFM_REQ_MASK);
+ err = crypto_ahash_setkey(child, key, keylen);
+ crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
+ & CRYPTO_TFM_RES_MASK);
+
+ return err;
+}
+
+static int ghash_async_init_tfm(struct crypto_tfm *tfm)
+{
+ struct cryptd_ahash *cryptd_tfm;
+ struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ctx->cryptd_tfm = cryptd_tfm;
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(&cryptd_tfm->base));
+
+ return 0;
+}
+
+static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_free_ahash(ctx->cryptd_tfm);
+}
+
+static struct ahash_alg ghash_async_alg = {
+ .init = ghash_async_init,
+ .update = ghash_async_update,
+ .final = ghash_async_final,
+ .setkey = ghash_async_setkey,
+ .digest = ghash_async_digest,
+ .halg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .base = {
+ .cra_name = "ghash",
+ .cra_driver_name = "ghash-clmulni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_type = &crypto_ahash_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
+ .cra_init = ghash_async_init_tfm,
+ .cra_exit = ghash_async_exit_tfm,
+ },
+ },
+};
+
+static int __init ghash_pclmulqdqni_mod_init(void)
+{
+ int err;
+
+ if (!cpu_has_pclmulqdq) {
+ printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
+ " detected.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_shash(&ghash_alg);
+ if (err)
+ goto err_out;
+ err = crypto_register_ahash(&ghash_async_alg);
+ if (err)
+ goto err_shash;
+
+ return 0;
+
+err_shash:
+ crypto_unregister_shash(&ghash_alg);
+err_out:
+ return err;
+}
+
+static void __exit ghash_pclmulqdqni_mod_exit(void)
+{
+ crypto_unregister_ahash(&ghash_async_alg);
+ crypto_unregister_shash(&ghash_alg);
+}
+
+module_init(ghash_pclmulqdqni_mod_init);
+module_exit(ghash_pclmulqdqni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
+ "acclerated by PCLMULQDQ-NI");
+MODULE_ALIAS("ghash");
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -247,6 +247,7 @@ extern const char * const x86_power_flag
#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
+#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)

#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
# define cpu_has_invlpg 1
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -440,6 +440,14 @@ config CRYPTO_WP512
See also:
<http://planeta.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html>

+config CRYPTO_GHASH_CLMUL_NI_INTEL
+ tristate "GHASH digest algorithm (CLMUL-NI accelerated)"
+ select CRYPTO_SHASH
+ select CRYPTO_CRYPTD
+ help
+ GHASH is a message digest algorithm used in GCM (Galois/Counter Mode).
+ This implementation is accelerated by Intel's CLMUL-NI instructions.
+
comment "Ciphers"

config CRYPTO_AES
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -711,6 +711,13 @@ struct crypto_shash *cryptd_ahash_child(
}
EXPORT_SYMBOL_GPL(cryptd_ahash_child);

+struct shash_desc *cryptd_shash_desc(struct ahash_request *req)
+{
+ struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
+ return &rctx->desc;
+}
+EXPORT_SYMBOL_GPL(cryptd_shash_desc);
+
void cryptd_free_ahash(struct cryptd_ahash *tfm)
{
crypto_free_ahash(&tfm->base);
--- a/include/crypto/cryptd.h
+++ b/include/crypto/cryptd.h
@@ -39,6 +39,7 @@ static inline struct cryptd_ahash *__cry
struct cryptd_ahash *cryptd_alloc_ahash(const char *alg_name,
u32 type, u32 mask);
struct crypto_shash *cryptd_ahash_child(struct cryptd_ahash *tfm);
+struct shash_desc *cryptd_shash_desc(struct ahash_request *req);
void cryptd_free_ahash(struct cryptd_ahash *tfm);

#endif




2009-10-19 02:53:31

by Herbert Xu

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

On Wed, Sep 16, 2009 at 09:35:46AM +0800, Huang Ying wrote:
> PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
> carry-less multiplication. More information about PCLMULQDQ can be
> found at:
>
> http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
>
> Because PCLMULQDQ changes the XMM state, its usage must be enclosed
> within kernel_fpu_begin/end, which can be used only in process context.
> The acceleration is therefore implemented as a crypto_ahash: requests
> issued in soft IRQ context are deferred to the cryptd kernel thread.
>
> v4:
> - Fix some style issues.
>
> v3:
> - Revise GHASH implementation; performance increases by about 2x.
>
> Signed-off-by: Huang Ying <[email protected]>

Patch applied to cryptodev. Thanks!
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2009-11-01 00:30:54

by Andrew Morton

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

On Mon, 19 Oct 2009 11:53:33 +0900 Herbert Xu <[email protected]> wrote:

> On Wed, Sep 16, 2009 at 09:35:46AM +0800, Huang Ying wrote:
> > PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
> > carry-less multiplication. More information about PCLMULQDQ can be
> > found at:
> >
> > http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
> >
> > Because PCLMULQDQ changes the XMM state, its usage must be enclosed
> > within kernel_fpu_begin/end, which can be used only in process context.
> > The acceleration is therefore implemented as a crypto_ahash: requests
> > issued in soft IRQ context are deferred to the cryptd kernel thread.
> >
> > v4:
> > - Fix some style issues.
> >
> > v3:
> > - Revise GHASH implementation; performance increases by about 2x.
> >
> > Signed-off-by: Huang Ying <[email protected]>
>
> Patch applied to cryptodev. Thanks!

x86_64 allmodconfig, GNU assembler 2.16.1:

arch/x86/crypto/ghash-clmulni-intel_asm.S: Assembler messages:
arch/x86/crypto/ghash-clmulni-intel_asm.S:103: Error: no such instruction: `pshufb %xmm5,%xmm0'
arch/x86/crypto/ghash-clmulni-intel_asm.S:105: Error: no such instruction: `pshufb %xmm5,%xmm0'
arch/x86/crypto/ghash-clmulni-intel_asm.S:119: Error: no such instruction: `pshufb %xmm5,%xmm0'
arch/x86/crypto/ghash-clmulni-intel_asm.S:123: Error: no such instruction: `pshufb %xmm5,%xmm6'
arch/x86/crypto/ghash-clmulni-intel_asm.S:130: Error: no such instruction: `pshufb %xmm5,%xmm0'
arch/x86/crypto/ghash-clmulni-intel_asm.S:143: Error: no such instruction: `pshufb %xmm5,%xmm0'


2009-11-01 17:50:46

by Herbert Xu

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

On Sat, Oct 31, 2009 at 05:30:15PM -0700, Andrew Morton wrote:
>
> x86_64 allmodconfig, GNU assembler 2.16.1:
>
> arch/x86/crypto/ghash-clmulni-intel_asm.S: Assembler messages:
> arch/x86/crypto/ghash-clmulni-intel_asm.S:103: Error: no such instruction: `pshufb %xmm5,%xmm0'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:105: Error: no such instruction: `pshufb %xmm5,%xmm0'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:119: Error: no such instruction: `pshufb %xmm5,%xmm0'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:123: Error: no such instruction: `pshufb %xmm5,%xmm6'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:130: Error: no such instruction: `pshufb %xmm5,%xmm0'
> arch/x86/crypto/ghash-clmulni-intel_asm.S:143: Error: no such instruction: `pshufb %xmm5,%xmm0'

This patch should fix it.

commit 2d06ef7f42ed8c9969c9aa84e95df5d5c6378327
Author: Herbert Xu <[email protected]>
Date: Sun Nov 1 12:49:44 2009 -0500

crypto: ghash-intel - Hard-code pshufb

Old gases don't have a clue what pshufb stands for so we have
to hard-code it for now.
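
(For reference, the hard-coded sequences decode as: 0x66 is the
operand-size prefix, 0x0f 0x38 0x00 is the three-byte PSHUFB opcode,
and the final ModRM byte selects the registers, so 0xc5 is
pshufb %xmm5, %xmm0 and 0xf5 is pshufb %xmm5, %xmm6.)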

Reported-by: Andrew Morton <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>

diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index b9e787a..71768d5 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -100,9 +100,11 @@ ENTRY(clmul_ghash_mul)
movups (%rdi), DATA
movups (%rsi), SHASH
movaps .Lbswap_mask, BSWAP
- pshufb BSWAP, DATA
+ # pshufb BSWAP, DATA
+ .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
call __clmul_gf128mul_ble
- pshufb BSWAP, DATA
+ # pshufb BSWAP, DATA
+ .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
movups DATA, (%rdi)
ret

@@ -116,18 +118,21 @@ ENTRY(clmul_ghash_update)
movaps .Lbswap_mask, BSWAP
movups (%rdi), DATA
movups (%rcx), SHASH
- pshufb BSWAP, DATA
+ # pshufb BSWAP, DATA
+ .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
.align 4
.Lupdate_loop:
movups (%rsi), IN1
- pshufb BSWAP, IN1
+ # pshufb BSWAP, IN1
+ .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
pxor IN1, DATA
call __clmul_gf128mul_ble
sub $16, %rdx
add $16, %rsi
cmp $16, %rdx
jge .Lupdate_loop
- pshufb BSWAP, DATA
+ # pshufb BSWAP, DATA
+ .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
movups DATA, (%rdi)
.Lupdate_just_ret:
ret
@@ -140,7 +145,8 @@ ENTRY(clmul_ghash_update)
ENTRY(clmul_ghash_setkey)
movaps .Lbswap_mask, BSWAP
movups (%rsi), %xmm0
- pshufb BSWAP, %xmm0
+ # pshufb BSWAP, %xmm0
+ .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
movaps %xmm0, %xmm1
psllq $1, %xmm0
psrlq $63, %xmm1
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2009-11-02 07:50:55

by Ingo Molnar

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation


* Herbert Xu <[email protected]> wrote:

> - pshufb BSWAP, DATA
> + # pshufb BSWAP, DATA
> + .byte 0x66, 0x0f, 0x38, 0x00, 0xc5

A cleanup request: mind creating two macros for this PSHUFB MMX/SSE
instruction in arch/x86/include/asm/i387.h, instead of open-coding the
.byte sequences in ~6 places?
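
e.g. something like this (sketch only, matching the byte sequences in
the patch):

#define PSHUFB_XMM5_XMM0	.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
#define PSHUFB_XMM5_XMM6	.byte 0x66, 0x0f, 0x38, 0x00, 0xf5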

( After the .33 merge window we'll collect such instruction format
knowledge in arch/x86/include/asm/inst.h. That file is not upstream
yet so i387.h will do for now for FPU/SSE instructions. )

Thanks,

Ingo

2009-11-02 14:28:37

by Herbert Xu

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

On Mon, Nov 02, 2009 at 08:50:39AM +0100, Ingo Molnar wrote:
>
> A cleanup request: mind creating two macros for this PSHUFB MMX/SSE
> instruction in arch/x86/include/asm/i387.h, instead of open-coding the
> .byte sequences in ~6 places?

I had a go at doing that, but it seems that i387.h isn't really
meant to be included in an asm file at this point :)

> ( After the .33 merge window we'll collect such instruction format
> knowledge in arch/x86/include/asm/inst.h. That file is not upstream
> yet so i387.h will do for now for FPU/SSE instructions. )

I'm happy to revisit this once inst.h exists.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2009-11-02 14:33:10

by Ingo Molnar

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation


* Herbert Xu <[email protected]> wrote:

> On Mon, Nov 02, 2009 at 08:50:39AM +0100, Ingo Molnar wrote:
> >
> > A cleanup request: mind creating two macros for this PSHUFB MMX/SSE
> > instruction in arch/x86/include/asm/i387.h, instead of open-coding the
> > .byte sequences in ~6 places?
>
> I had a go at doing that, but it seems that i387.h isn't really meant
> to be included in an asm file at this point :)

Please use the standard construct and put an #ifndef __ASSEMBLY__ around
it.

> > ( After the .33 merge window we'll collect such instruction format
> > knowledge in arch/x86/include/asm/inst.h. That file is not upstream
> > yet so i387.h will do for now for FPU/SSE instructions. )
>
> I'm happy to revisit this once inst.h exists.

No reason to not do most of the change first though, the way i suggested
it.

Ingo

2009-11-02 14:46:29

by Herbert Xu

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

On Mon, Nov 02, 2009 at 03:32:58PM +0100, Ingo Molnar wrote:
>
> Please use the standard construct and put an #ifndef __ASSEMBLY__ around
> it.

You mean like this?

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb..e22d237 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,13 @@
#ifndef _ASM_X86_I387_H
#define _ASM_X86_I387_H

+#ifdef __ASSEMBLY__
+
+#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
+
+#else
+
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/regset.h>
@@ -411,4 +418,5 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
}
}

+#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_I387_H */

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2009-11-02 15:46:15

by Ingo Molnar

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation


* Herbert Xu <[email protected]> wrote:

> On Mon, Nov 02, 2009 at 03:32:58PM +0100, Ingo Molnar wrote:
> >
> > Please use the standard construct and put an #ifndef __ASSEMBLY__ around
> > it.
>
> You mean like this?
>
> diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
> index 0b20bbb..e22d237 100644
> --- a/arch/x86/include/asm/i387.h
> +++ b/arch/x86/include/asm/i387.h
> @@ -10,6 +10,13 @@
> #ifndef _ASM_X86_I387_H
> #define _ASM_X86_I387_H
>
> +#ifdef __ASSEMBLY__
> +
> +#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
> +#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
> +
> +#else
> +
> #include <linux/sched.h>
> #include <linux/kernel_stat.h>
> #include <linux/regset.h>
> @@ -411,4 +418,5 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
> }
> }
>
> +#endif /* __ASSEMBLY__ */
> #endif /* _ASM_X86_I387_H */

Yeah. Or just a single block of:


#ifndef __ASSEMBLY__
...
#endif /* __ASSEMBLY__ */

around the C bits - anything outside that is good for assembly as well.

Ingo

2009-11-03 05:47:14

by Huang, Ying

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

On Mon, 2009-11-02 at 22:32 +0800, Ingo Molnar wrote:
> * Herbert Xu <[email protected]> wrote:
>
> > On Mon, Nov 02, 2009 at 08:50:39AM +0100, Ingo Molnar wrote:
> > >
> > > A cleanup request: mind creating two macros for this PSHUFB MMX/SSE
> > > instruction in arch/x86/include/asm/i387.h, instead of open-coding the
> > > .byte sequences in ~6 places?
> >
> > I had a go at doing that, but it seems that i387.h isn't really meant
> > to be included in an asm file at this point :)
>
> Please use the standard construct and put an #ifndef __ASSEMBLY__ around
> it.
>
> > > ( After the .33 merge window we'll collect such instruction format
> > > knowledge in arch/x86/include/asm/inst.h. That file is not upstream
> > > yet so i387.h will do for now for FPU/SSE instructions. )
> >
> > I'm happy to revisit this once inst.h exists.
>
> No reason to not do most of the change first though, the way i suggested
> it.

How about something like the one below? But it seems inappropriate to put
these bits into i387.h, that is, to combine C and gas syntax.

Best Regards,
Huang Ying

.macro xmm_num opd xmm
.ifc \xmm,%xmm0
\opd = 0
.endif
.ifc \xmm,%xmm1
\opd = 1
.endif
.ifc \xmm,%xmm2
\opd = 2
.endif
.ifc \xmm,%xmm3
\opd = 3
.endif
.ifc \xmm,%xmm4
\opd = 4
.endif
.ifc \xmm,%xmm5
\opd = 5
.endif
.ifc \xmm,%xmm6
\opd = 6
.endif
.ifc \xmm,%xmm7
\opd = 7
.endif
.ifc \xmm,%xmm8
\opd = 8
.endif
.ifc \xmm,%xmm9
\opd = 9
.endif
.ifc \xmm,%xmm10
\opd = 10
.endif
.ifc \xmm,%xmm11
\opd = 11
.endif
.ifc \xmm,%xmm12
\opd = 12
.endif
.ifc \xmm,%xmm13
\opd = 13
.endif
.ifc \xmm,%xmm14
\opd = 14
.endif
.ifc \xmm,%xmm15
\opd = 15
.endif
.endm

.macro PSHUFB_XMM xmm1 xmm2
xmm_num pshufb_opd1 \xmm1
xmm_num pshufb_opd2 \xmm2
.if (pshufb_opd1 < 8) && (pshufb_opd2 < 8)
.byte 0x66, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | (pshufb_opd2<<3)
.elseif (pshufb_opd1 >= 8) && (pshufb_opd2 < 8)
.byte 0x66, 0x41, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | (pshufb_opd2<<3)
.elseif (pshufb_opd1 < 8) && (pshufb_opd2 >= 8)
.byte 0x66, 0x44, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | ((pshufb_opd2-8)<<3)
.else
.byte 0x66, 0x45, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | ((pshufb_opd2-8)<<3)
.endif
.endm
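
For example, usage would look like this (both operands are below 8, so
the first branch applies and the output matches the hard-coded
sequences):

PSHUFB_XMM %xmm5 %xmm0	# emits .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
PSHUFB_XMM %xmm5 %xmm6	# emits .byte 0x66, 0x0f, 0x38, 0x00, 0xf5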



2009-11-03 09:03:29

by Ingo Molnar

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation


* Huang Ying <[email protected]> wrote:

> On Mon, 2009-11-02 at 22:32 +0800, Ingo Molnar wrote:
> > * Herbert Xu <[email protected]> wrote:
> >
> > > On Mon, Nov 02, 2009 at 08:50:39AM +0100, Ingo Molnar wrote:
> > > >
> > > > A cleanup request: mind creating two macros for this PSHUFB MMX/SSE
> > > > instruction in arch/x86/include/asm/i387.h, instead of open-coding the
> > > > .byte sequences in ~6 places?
> > >
> > > I had a go at doing that, but it seems that i387.h isn't really meant
> > > to be included in an asm file at this point :)
> >
> > Please use the standard construct and put an #ifndef __ASSEMBLY__ around
> > it.
> >
> > > > ( After the .33 merge window we'll collect such instruction format
> > > > knowledge in arch/x86/include/asm/inst.h. That file is not upstream
> > > > yet so i387.h will do for now for FPU/SSE instructions. )
> > >
> > > I'm happy to revisit this once inst.h exists.
> >
> > No reason to not do most of the change first though, the way i suggested
> > it.
>
> How about something like the one below? But it seems inappropriate to put
> these bits into i387.h, that is, to combine C and gas syntax.
>
> Best Regards,
> Huang Ying
>
> .macro xmm_num opd xmm
> .ifc \xmm,%xmm0
> \opd = 0
> .endif
> .ifc \xmm,%xmm1
> \opd = 1
> .endif
> .ifc \xmm,%xmm2
> \opd = 2
> .endif
> .ifc \xmm,%xmm3
> \opd = 3
> .endif
> .ifc \xmm,%xmm4
> \opd = 4
> .endif
> .ifc \xmm,%xmm5
> \opd = 5
> .endif
> .ifc \xmm,%xmm6
> \opd = 6
> .endif
> .ifc \xmm,%xmm7
> \opd = 7
> .endif
> .ifc \xmm,%xmm8
> \opd = 8
> .endif
> .ifc \xmm,%xmm9
> \opd = 9
> .endif
> .ifc \xmm,%xmm10
> \opd = 10
> .endif
> .ifc \xmm,%xmm11
> \opd = 11
> .endif
> .ifc \xmm,%xmm12
> \opd = 12
> .endif
> .ifc \xmm,%xmm13
> \opd = 13
> .endif
> .ifc \xmm,%xmm14
> \opd = 14
> .endif
> .ifc \xmm,%xmm15
> \opd = 15
> .endif
> .endm
>
> .macro PSHUFB_XMM xmm1 xmm2
> xmm_num pshufb_opd1 \xmm1
> xmm_num pshufb_opd2 \xmm2
> .if (pshufb_opd1 < 8) && (pshufb_opd2 < 8)
> .byte 0x66, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | (pshufb_opd2<<3)
> .elseif (pshufb_opd1 >= 8) && (pshufb_opd2 < 8)
> .byte 0x66, 0x41, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | (pshufb_opd2<<3)
> .elseif (pshufb_opd1 < 8) && (pshufb_opd2 >= 8)
> .byte 0x66, 0x44, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | ((pshufb_opd2-8)<<3)
> .else
> .byte 0x66, 0x45, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | ((pshufb_opd2-8)<<3)
> .endif
> .endm

Looks far too clever, i like it :-) We have quite a few assembly macros
in arch/x86/include/asm/. The above one could be put into calling.h for
example.

But the simpler .byte solution in i387.h would be fine too.

If you guys want to put helper define into arch/x86/include/asm/ into
the crypto tree, feel free:

Acked-by: Ingo Molnar <[email protected]>

it would be clumsy to keep it separately in the x86 tree. Just don't
spread raw .byte sequences in .S files please ...

Ingo

2009-11-03 14:12:51

by Herbert Xu

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

On Mon, Nov 02, 2009 at 04:46:04PM +0100, Ingo Molnar wrote:
>
> Yeah. Or just a single block of:
>
>
> #ifndef __ASSEMBLY__
> ...
> #endif /* __ASSEMBLY__ */
>
> around the C bits - anything outside that is good for assembly as well.

OK I'll throw this into cryptodev:

commit 3b0d65969b549b796abc6f0230f6142fed365d49
Author: Herbert Xu <[email protected]>
Date: Tue Nov 3 09:11:15 2009 -0500

crypto: ghash-intel - Add PSHUFB macros

Add PSHUFB macros instead of repeating byte sequences, suggested
by Ingo.

Signed-off-by: Herbert Xu <[email protected]>
Acked-by: Ingo Molnar <[email protected]>

diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 71768d5..5958498 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -17,6 +17,7 @@
*/

#include <linux/linkage.h>
+#include <asm/i387.h>

.align 16
.Lbswap_mask:
@@ -101,7 +102,7 @@ ENTRY(clmul_ghash_mul)
movups (%rsi), SHASH
movaps .Lbswap_mask, BSWAP
# pshufb BSWAP, DATA
- .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+ PSHUFB_XMM5_XMM0
call __clmul_gf128mul_ble
# pshufb BSWAP, DATA
.byte 0x66, 0x0f, 0x38, 0x00, 0xc5
@@ -119,12 +120,12 @@ ENTRY(clmul_ghash_update)
movups (%rdi), DATA
movups (%rcx), SHASH
# pshufb BSWAP, DATA
- .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+ PSHUFB_XMM5_XMM0
.align 4
.Lupdate_loop:
movups (%rsi), IN1
# pshufb BSWAP, IN1
- .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
+ PSHUFB_XMM5_XMM6
pxor IN1, DATA
call __clmul_gf128mul_ble
sub $16, %rdx
@@ -132,7 +133,7 @@ ENTRY(clmul_ghash_update)
cmp $16, %rdx
jge .Lupdate_loop
# pshufb BSWAP, DATA
- .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+ PSHUFB_XMM5_XMM0
movups DATA, (%rdi)
.Lupdate_just_ret:
ret
@@ -146,7 +147,7 @@ ENTRY(clmul_ghash_setkey)
movaps .Lbswap_mask, BSWAP
movups (%rsi), %xmm0
# pshufb BSWAP, %xmm0
- .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+ PSHUFB_XMM5_XMM0
movaps %xmm0, %xmm1
psllq $1, %xmm0
psrlq $63, %xmm1
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb..ebfb8a9 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,8 @@
#ifndef _ASM_X86_I387_H
#define _ASM_X86_I387_H

+#ifndef __ASSEMBLY__
+
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/regset.h>
@@ -411,4 +413,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
}
}

+#endif /* __ASSEMBLY__ */
+
+#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
+
#endif /* _ASM_X86_I387_H */

Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2009-11-04 00:59:00

by H. Peter Anvin

Subject: Re: [PATCH -v4] crypto: Add PCLMULQDQ accelerated GHASH implementation

On 11/03/2009 01:03 AM, Ingo Molnar wrote:
>>
>> .macro xmm_num opd xmm
>> .ifc \xmm,%xmm0
>> \opd = 0
>> .endif
>> .ifc \xmm,%xmm1
>> \opd = 1
>> .endif
>> .ifc \xmm,%xmm2
>> \opd = 2
>> .endif
>> .ifc \xmm,%xmm3
>> \opd = 3
>> .endif
>> .ifc \xmm,%xmm4
>> \opd = 4
>> .endif
>> .ifc \xmm,%xmm5
>> \opd = 5
>> .endif
>> .ifc \xmm,%xmm6
>> \opd = 6
>> .endif
>> .ifc \xmm,%xmm7
>> \opd = 7
>> .endif
>> .ifc \xmm,%xmm8
>> \opd = 8
>> .endif
>> .ifc \xmm,%xmm9
>> \opd = 9
>> .endif
>> .ifc \xmm,%xmm10
>> \opd = 10
>> .endif
>> .ifc \xmm,%xmm11
>> \opd = 11
>> .endif
>> .ifc \xmm,%xmm12
>> \opd = 12
>> .endif
>> .ifc \xmm,%xmm13
>> \opd = 13
>> .endif
>> .ifc \xmm,%xmm14
>> \opd = 14
>> .endif
>> .ifc \xmm,%xmm15
>> \opd = 15
>> .endif
>> .endm
>>
>> .macro PSHUFB_XMM xmm1 xmm2
>> xmm_num pshufb_opd1 \xmm1
>> xmm_num pshufb_opd2 \xmm2
>> .if (pshufb_opd1 < 8) && (pshufb_opd2 < 8)
>> .byte 0x66, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | (pshufb_opd2<<3)
>> .elseif (pshufb_opd1 >= 8) && (pshufb_opd2 < 8)
>> .byte 0x66, 0x41, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | (pshufb_opd2<<3)
>> .elseif (pshufb_opd1 < 8) && (pshufb_opd2 >= 8)
>> .byte 0x66, 0x44, 0x0f, 0x38, 0x00, 0xc0 | pshufb_opd1 | ((pshufb_opd2-8)<<3)
>> .else
>> .byte 0x66, 0x45, 0x0f, 0x38, 0x00, 0xc0 | (pshufb_opd1-8) | ((pshufb_opd2-8)<<3)
>> .endif
>> .endm
>
> Looks far too clever, i like it :-) We have quite a few assembly macros
> in arch/x86/include/asm/. The above one could be put into calling.h for
> example.
>

I would really like to see something like that, with only one minor
tweak: please use submacros to generate the REX and MODRM bytes, since
we are *guaranteed* to want to do the same thing again.
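
A rough sketch of what such submacros might look like (names here are
illustrative, no such header exists yet):

.macro MODRM mod opd1 opd2
.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
.endm

.macro PFX_REX opd1 opd2
.if (\opd1 | \opd2) & 8
.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
.endif
.endm

PSHUFB_XMM would then emit the 0x66 prefix, invoke PFX_REX, emit the
0x0f, 0x38, 0x00 opcode bytes, and finish with
MODRM 0xc0 pshufb_opd1 pshufb_opd2.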

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.