2008-08-04 09:36:47

by Austin Zhang

[permalink] [raw]
Subject: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

>From Nehalem processor onward, Intel processors can support hardware
accelerated CRC32c algorithm with the new CRC32 instruction in SSE 4.2
instruction set.
The patch detects the availability of the feature, and chooses the
most proper way to calculate CRC32c checksum.
Byte code instructions are used for compiler compatibility.
No MMX / XMM registers is involved in the implementation.

Signed-off-by: Austin Zhang <[email protected]>
Signed-off-by: Kent Liu <[email protected]>
---
arch/x86/crypto/Makefile | 2
arch/x86/crypto/crc32c-intel.c | 192 +++++++++++++++++++++++++++++++++++++++++
crypto/Kconfig | 11 ++
include/asm-x86/cpufeature.h | 2
4 files changed, 207 insertions(+)

diff -Naurp linux-2.6/arch/x86/crypto/crc32c-intel.c linux-2.6-patch/arch/x86/crypto/crc32c-intel.c
--- linux-2.6/arch/x86/crypto/crc32c-intel.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6-patch/arch/x86/crypto/crc32c-intel.c 2008-08-04 01:59:00.000000000 -0400
@@ -0,0 +1,192 @@
+/*
+ * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
+ * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
+ * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2A: Instruction Set Reference, A-M
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <crypto/internal/hash.h>
+
+#include <asm/cpufeature.h>
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+#ifdef CONFIG_X86_64
+#define REX_PRE "0x48, "
+#define SCALE_F 8
+#else
+#define REX_PRE
+#define SCALE_F 4
+#endif
+
+u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
+{
+ while (length--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
+ :"=S"(crc)
+ :"0"(crc), "c"(*data)
+ );
+ data++;
+ }
+
+ return crc;
+}
+
+u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
+{
+ unsigned int iquotient = len / SCALE_F;
+ unsigned int iremainder = len % SCALE_F;
+#ifdef CONFIG_X86_64
+ u64 *ptmp = (u64 *)p;
+#else
+ u32 *ptmp = (u32 *)p;
+#endif
+
+ while (iquotient--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
+ :"=S"(crc)
+ :"0"(crc), "c"(*ptmp)
+ );
+ ptmp++;
+ }
+
+ if (iremainder)
+ crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
+ iremainder);
+
+ return crc;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_ahash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = le32_to_cpup((__le32 *)key);
+ return 0;
+}
+
+static int crc32c_intel_init(struct ahash_request *req)
+{
+ u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+ u32 *crcp = ahash_request_ctx(req);
+
+ *crcp = *mctx;
+
+ return 0;
+}
+
+static int crc32c_intel_update(struct ahash_request *req)
+{
+ struct crypto_hash_walk walk;
+ u32 *crcp = ahash_request_ctx(req);
+ u32 crc = *crcp;
+ int nbytes;
+
+ for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
+ nbytes = crypto_hash_walk_done(&walk, 0))
+ crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+
+ *crcp = crc;
+ return 0;
+}
+
+static int crc32c_intel_final(struct ahash_request *req)
+{
+ u32 *crcp = ahash_request_ctx(req);
+
+ *(__le32 *)req->result = ~cpu_to_le32p(crcp);
+ return 0;
+}
+
+static int crc32c_intel_digest(struct ahash_request *req)
+{
+ struct crypto_hash_walk walk;
+ u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+ u32 crc = *mctx;
+ int nbytes;
+
+ for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
+ nbytes = crypto_hash_walk_done(&walk, 0))
+ crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
+
+ *(__le32 *)req->result = ~cpu_to_le32(crc);
+ return 0;
+}
+
+static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = ~0;
+
+ tfm->crt_ahash.reqsize = sizeof(u32);
+
+ if (cpu_has_xmm4_2)
+ return 0;
+ else
+ return -1;
+}
+
+static struct crypto_alg alg = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-intel",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_AHASH,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_alignmask = 3,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(alg.cra_list),
+ .cra_init = crc32c_intel_cra_init,
+ .cra_type = &crypto_ahash_type,
+ .cra_u = {
+ .ahash = {
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .setkey = crc32c_intel_setkey,
+ .init = crc32c_intel_init,
+ .update = crc32c_intel_update,
+ .final = crc32c_intel_final,
+ .digest = crc32c_intel_digest,
+ }
+ }
+};
+
+
+static int __init crc32c_intel_mod_init(void)
+{
+ return crypto_register_alg(&alg);
+}
+
+static void __exit crc32c_intel_mod_fini(void)
+{
+ crypto_unregister_alg(&alg);
+}
+
+module_init(crc32c_intel_mod_init);
+module_exit(crc32c_intel_mod_fini);
+
+MODULE_AUTHOR("Austin Zhang <[email protected]>, Kent Liu <[email protected]>");
+MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("crc32c");
+MODULE_ALIAS("crc32c-intel");
+
diff -Naurp linux-2.6/arch/x86/crypto/Makefile linux-2.6-patch/arch/x86/crypto/Makefile
--- linux-2.6/arch/x86/crypto/Makefile 2008-08-04 01:08:00.000000000 -0400
+++ linux-2.6-patch/arch/x86/crypto/Makefile 2008-08-04 01:59:00.000000000 -0400
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o

+obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
+
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
diff -Naurp linux-2.6/crypto/Kconfig linux-2.6-patch/crypto/Kconfig
--- linux-2.6/crypto/Kconfig 2008-08-04 01:08:00.000000000 -0400
+++ linux-2.6-patch/crypto/Kconfig 2008-08-04 01:59:00.000000000 -0400
@@ -221,6 +221,17 @@ config CRYPTO_CRC32C
See Castagnoli93. This implementation uses lib/libcrc32c.
Module will be crc32c.

+config CRYPTO_CRC32C_INTEL
+ tristate "CRC32c INTEL hardware acceleration"
+ select CRYPTO_HASH
+ help
+ In Intel processor with SSE4.2 supported, the processor will
+ support CRC32C implemetation using hardware accelerated CRC32
+ instruction. This option will create 'crc32c-intel' module,
+ which will enable any routine to use the CRC32 instruction to
+ gain performance compared with software implementation.
+ Module will be crc32c-intel.
+
config CRYPTO_MD4
tristate "MD4 digest algorithm"
select CRYPTO_ALGAPI
diff -Naurp linux-2.6/include/asm-x86/cpufeature.h linux-2.6-patch/include/asm-x86/cpufeature.h
--- linux-2.6/include/asm-x86/cpufeature.h 2008-08-04 01:08:08.000000000 -0400
+++ linux-2.6-patch/include/asm-x86/cpufeature.h 2008-08-04 01:59:00.000000000 -0400
@@ -91,6 +91,7 @@
#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_2 (4*32+20) /* Streaming SIMD Extensions-4.2 */

/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
#define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */
@@ -189,6 +190,7 @@ extern const char * const x86_power_flag
#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES)
#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT)
+#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)

#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
# define cpu_has_invlpg 1



2008-08-04 10:12:21

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 05:35 -0400, Austin Zhang wrote:
> +u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t
> len)
> +{
> + unsigned int iquotient = len / SCALE_F;
> + unsigned int iremainder = len % SCALE_F;
> +#ifdef CONFIG_X86_64
> + u64 *ptmp = (u64 *)p;
> +#else
> + u32 *ptmp = (u32 *)p;
> +#endif

You could perhaps just use 'unsigned long' here, to avoid the ifdef.

And it would be nice if we could make libcrc32c use this too, rather
than just the 'crypto' users.

--
David Woodhouse Open Source Technology Centre
[email protected] Intel Corporation




Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

* David Woodhouse | 2008-08-04 11:12:05 [+0100]:

>On Mon, 2008-08-04 at 05:35 -0400, Austin Zhang wrote:
>> +u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t
>> len)
>> +{
>> + unsigned int iquotient = len / SCALE_F;
>> + unsigned int iremainder = len % SCALE_F;
>> +#ifdef CONFIG_X86_64
>> + u64 *ptmp = (u64 *)p;
>> +#else
>> + u32 *ptmp = (u32 *)p;
>> +#endif
>
>You could perhaps just use 'unsigned long' here, to avoid the ifdef.
>
>And it would be nice if we could make libcrc32c use this too, rather
>than just the 'crypto' users.

I'm not sure if I remeber correctly but I thing Herbert was planning to
convert all users over to the crypto API to avoid compile time
dependency.

Sebastian

2008-08-04 10:28:11

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 12:25 +0200, Sebastian Siewior wrote:
> * David Woodhouse | 2008-08-04 11:12:05 [+0100]:
> >And it would be nice if we could make libcrc32c use this too, rather
> >than just the 'crypto' users.
>
> I'm not sure if I remeber correctly but I thing Herbert was planning to
> convert all users over to the crypto API to avoid compile time
> dependency.

That's one way of doing it, although it seems a little bit like overkill
in this particular case.

--
David Woodhouse Open Source Technology Centre
[email protected] Intel Corporation




2008-08-04 10:37:18

by Austin Zhang

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 11:12 +0100, David Woodhouse wrote:
> You could perhaps just use 'unsigned long' here, to avoid the ifdef.
Thanks.
> And it would be nice if we could make libcrc32c use this too, rather
> than just the 'crypto' users.
>From previous discussing, herbert would like to transfer the libcrc32c
interface by new crypto because there were few user using the current
libcrc32c interface.


2008-08-04 10:45:33

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 06:35 -0400, Austin Zhang wrote:
> On Mon, 2008-08-04 at 11:12 +0100, David Woodhouse wrote:
> > You could perhaps just use 'unsigned long' here, to avoid the ifdef.
> Thanks.
> > And it would be nice if we could make libcrc32c use this too, rather
> > than just the 'crypto' users.
> From previous discussing, herbert would like to transfer the libcrc32c
> interface by new crypto because there were few user using the current
> libcrc32c interface.

Are we deprecating libcrc32c, then? Or just turning it into a wrapper
around the crypto code?

Either way, does it really make sense to force all crc32 users to pull
in the whole crypto framework? Some may get fractious about that...

--
David Woodhouse Open Source Technology Centre
[email protected] Intel Corporation




2008-08-04 10:48:28

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, Aug 04, 2008 at 12:25:57PM +0200, Sebastian Siewior wrote:
>
> I'm not sure if I remeber correctly but I thing Herbert was planning to
> convert all users over to the crypto API to avoid compile time
> dependency.

Yes that's the plan. I've been busy with the crypto testing
stuff but I'll get onto this soon.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 10:59:45

by Austin Zhang

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 11:45 +0100, David Woodhouse wrote:
> Are we deprecating libcrc32c, then? Or just turning it into a wrapper
> around the crypto code?
Maybe I can pick up crc32c_intel_le_hw_byte and crc32c_intel_le_hw
into
one arch-related file and make the current new crypto interface and
libcrc32c
call it.
> Either way, does it really make sense to force all crc32 users to pull
> in the whole crypto framework? Some may get fractious about that...
If there were really few (or no) user using that previous interface, it will
be reasonable to merge the crc32c totally into crypto subsystem as a
digest method.
And I remembered Herbert had mentioned he will convert those previous
interface
calling to new crypto API. For the crc32c, he had done for it.

BTW, why did I always got each email on this thread twice:(? (the same
email twice)

2008-08-04 12:39:09

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 06:58 -0400, Austin Zhang wrote:
> BTW, why did I always got each email on this thread twice:(? (the same
> email twice)

You're probably subscribed to the linux-kernel list, and you're also
being Cc'd directly.

Normally, your filters should notice the copy which has a Return-Path:
matching 'linux-kernel-owner.*@vger.kernel.org', and put that into your
lkml folder with a few hundred other messages each day -- while the copy
which is direct will still have the original sender, and would go into
your inbox where you'll see it.

--
dwmw2


2008-08-04 14:04:46

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, Aug 04, 2008 at 11:45:26AM +0100, David Woodhouse wrote:
>
> Are we deprecating libcrc32c, then? Or just turning it into a wrapper
> around the crypto code?

It will be removed.

> Either way, does it really make sense to force all crc32 users to pull
> in the whole crypto framework? Some may get fractious about that...

There only three crc32c users in the kernel tree and the crypto
interface will serve the perfectly.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 14:13:39

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 05:35 -0400, Austin Zhang wrote:
> +static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
> +{
> + u32 *key = crypto_tfm_ctx(tfm);
> +
> + *key = ~0;
> +
> + tfm->crt_ahash.reqsize = sizeof(u32);
> +
> + if (cpu_has_xmm4_2)
> + return 0;
> + else
> + return -1;
> +}

...

> +static int __init crc32c_intel_mod_init(void)
> +{
> + return crypto_register_alg(&alg);
> +}
> +

Am I missing something here, or are you registering the crypto algorithm
_unconditionally_ and then just causing init requests for it to fail on
older hardware? Wouldn't it be better to register the driver _only_
when the hardware is capable? Or at least "if at least one cpu is
capable".

> --- linux-2.6/crypto/Kconfig 2008-08-04 01:08:00.000000000 -0400
> +++ linux-2.6-patch/crypto/Kconfig 2008-08-04 01:59:00.000000000 -0400
> @@ -221,6 +221,17 @@ config CRYPTO_CRC32C
> See Castagnoli93. This implementation uses lib/libcrc32c.
> Module will be crc32c.
>
> +config CRYPTO_CRC32C_INTEL
> + tristate "CRC32c INTEL hardware acceleration"
> + select CRYPTO_HASH
> + help
> + In Intel processor with SSE4.2 supported, the processor will
> + support CRC32C implemetation using hardware accelerated CRC32
> + instruction. This option will create 'crc32c-intel' module,
> + which will enable any routine to use the CRC32 instruction to
> + gain performance compared with software implementation.
> + Module will be crc32c-intel.
> +
> config CRYPTO_MD4
> tristate "MD4 digest algorithm"
> select CRYPTO_ALGAPI

I think that should depend on CONFIG_X86?

--
David Woodhouse Open Source Technology Centre
[email protected] Intel Corporation




2008-08-04 14:17:40

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, Aug 04, 2008 at 03:13:34PM +0100, David Woodhouse wrote:
>
> > +static int __init crc32c_intel_mod_init(void)
> > +{
> > + return crypto_register_alg(&alg);
> > +}
> > +
>
> Am I missing something here, or are you registering the crypto algorithm
> _unconditionally_ and then just causing init requests for it to fail on
> older hardware? Wouldn't it be better to register the driver _only_
> when the hardware is capable? Or at least "if at least one cpu is
> capable".

Yes I think this is a show-stopper :)

Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 14:18:29

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, Aug 04, 2008 at 05:35:07AM -0400, Austin Zhang wrote:
>
> +static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
> +{
> + u32 *key = crypto_tfm_ctx(tfm);
> +
> + *key = ~0;
> +
> + tfm->crt_ahash.reqsize = sizeof(u32);
> +
> + if (cpu_has_xmm4_2)
> + return 0;
> + else
> + return -1;
> +}

This check needs to be moved to the module init function and
if it fails the module should not register the algorithm.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 14:19:20

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, Aug 04, 2008 at 05:35:07AM -0400, Austin Zhang wrote:
>
> +config CRYPTO_CRC32C_INTEL
> + tristate "CRC32c INTEL hardware acceleration"
> + select CRYPTO_HASH

You need some sort of a dependency here. See what the other
assembly algorithms do it.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 14:20:21

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 4 Aug 2008 22:04:35 +0800
Herbert Xu <[email protected]> wrote:

> There only three crc32c users in the kernel tree and the crypto
> interface will serve the perfectly.

isn't it a bit heavy for something as simple as a crc?
(which after all is one instruction now ;0)


--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2008-08-04 14:49:07

by David Woodhouse

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 07:20 -0700, Arjan van de Ven wrote:
> On Mon, 4 Aug 2008 22:04:35 +0800
> Herbert Xu <[email protected]> wrote:
>
> > There only three crc32c users in the kernel tree and the crypto
> > interface will serve the perfectly.
>
> isn't it a bit heavy for something as simple as a crc?
> (which after all is one instruction now ;0)

It does seem that way. For users who care about 'bloat' and are _only_
interested in crc32, it's yet another chunk of extra infrastructure
which offers no benefit to them.

And even for people who don't care about that, it doesn't look
particularly good. It looks like btrfs would need either to keep setting
up a crypto context and then tearing it down, or have a pool of
long-standing contexts and do some kind of locking on them -- neither of
which seem particularly optimal compared with just calling into
libcrc32c.

We can't even set up one context per cpu and disable preempt while we
use it, can we? The routines are allowed to sleep?

(Although I have to admit I do like the fact that it'd only be available
through EXPORT_SYMBOL_GPL if we do force people to use the crypto
API...)

--
David Woodhouse Open Source Technology Centre
[email protected] Intel Corporation




2008-08-04 14:54:21

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, Aug 04, 2008 at 03:49:01PM +0100, David Woodhouse wrote:
>
> And even for people who don't care about that, it doesn't look
> particularly good. It looks like btrfs would need either to keep setting
> up a crypto context and then tearing it down, or have a pool of
> long-standing contexts and do some kind of locking on them -- neither of
> which seem particularly optimal compared with just calling into
> libcrc32c.

No you don't have to set things up every time you use crc32c.
The crypto interface lets you have a single tfm that can be
used by multiple users simultaneously. For ahash algorithms
all the state is stored in the request which can stay on the
stack.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 15:12:17

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

Arjan van de Ven <[email protected]> wrote:
> On Mon, 4 Aug 2008 22:04:35 +0800
> Herbert Xu <[email protected]> wrote:
>
>> There only three crc32c users in the kernel tree and the crypto
>> interface will serve the perfectly.
>
> isn't it a bit heavy for something as simple as a crc?
> (which after all is one instruction now ;0)

Well AES on the PadLock is also a single instruction and nobody
has ever complained :)

Seriously, the crypto code is extremely small on the data path.
The heaviest part is the indirect function call but you have to
have that in order to support multiple implementations cleanly.

All the fat is on the control path, i.e., tfm allocation. For
crc32c you only need a single tfm since all the state is stored
in the request object.

Note that you should ignore the existing crc32c user, iSCSI as
it was written before the new crypto hash interface was available.
I will be converting it along with the other two crc32c users. to
the new ahash interface.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 15:14:22

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 11:45 +0100, David Woodhouse wrote:
> On Mon, 2008-08-04 at 06:35 -0400, Austin Zhang wrote:
> > On Mon, 2008-08-04 at 11:12 +0100, David Woodhouse wrote:
> > > You could perhaps just use 'unsigned long' here, to avoid the ifdef.
> > Thanks.
> > > And it would be nice if we could make libcrc32c use this too, rather
> > > than just the 'crypto' users.
> > From previous discussing, herbert would like to transfer the libcrc32c
> > interface by new crypto because there were few user using the current
> > libcrc32c interface.
>
> Are we deprecating libcrc32c, then? Or just turning it into a wrapper
> around the crypto code?
>

Long term I'd like to switch btrfs to the crypto api, but right now I'm
using libcrc32c.

>From a performance point of view I'm probably reading the crypto API
code wrong, but it looks like my choices are to either have a long
standing context and use locking around the digest/hash calls to protect
internal crypto state, or create a new context every time and take a
perf hit while crypto looks up the right module.

Either way it looks slower than just calling good old libcrc32c.

-chris


2008-08-04 15:43:16

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

Chris Mason <[email protected]> wrote:
>
>>From a performance point of view I'm probably reading the crypto API
> code wrong, but it looks like my choices are to either have a long
> standing context and use locking around the digest/hash calls to protect
> internal crypto state, or create a new context every time and take a
> perf hit while crypto looks up the right module.

You're looking at the old hash interface. New users should use the
ahash interface which was only recently added to the kernel. It
lets you store the state in the request object which you pass to
the algorithm on every call. This means that you only need one
tfm in the entire system for crc32c.

BTW, don't let the a in ahash intimidate you. It's meant to support
synchronous implementations such as the Intel instruction just as
well as asynchronous ones.

And if you're still not convinced here is the benchmark on the
digest_null algorithm:

testing speed of stub_digest_null
test 0 ( 16 byte blocks, 16 bytes per update, 1 updates): 190 cycles/operation, 11 cycles/byte
test 1 ( 64 byte blocks, 16 bytes per update, 4 updates): 367 cycles/operation, 5 cycles/byte
test 2 ( 64 byte blocks, 64 bytes per update, 1 updates): 192 cycles/operation, 3 cycles/byte
test 3 ( 256 byte blocks, 16 bytes per update, 16 updates): 1006 cycles/operation, 3 cycles/byte
test 4 ( 256 byte blocks, 64 bytes per update, 4 updates): 378 cycles/operation, 1 cycles/byte
test 5 ( 256 byte blocks, 256 bytes per update, 1 updates): 191 cycles/operation, 0 cycles/byte
test 6 ( 1024 byte blocks, 16 bytes per update, 64 updates): 3557 cycles/operation, 3 cycles/byte
test 7 ( 1024 byte blocks, 256 bytes per update, 4 updates): 365 cycles/operation, 0 cycles/byte
test 8 ( 1024 byte blocks, 1024 bytes per update, 1 updates): 191 cycles/operation, 0 cycles/byte
test 9 ( 2048 byte blocks, 16 bytes per update, 128 updates): 6903 cycles/operation, 3 cycles/byte
test 10 ( 2048 byte blocks, 256 bytes per update, 8 updates): 574 cycles/operation, 0 cycles/byte
test 11 ( 2048 byte blocks, 1024 bytes per update, 2 updates): 259 cycles/operation, 0 cycles/byte
test 12 ( 2048 byte blocks, 2048 bytes per update, 1 updates): 192 cycles/operation, 0 cycles/byte
test 13 ( 4096 byte blocks, 16 bytes per update, 256 updates): 13626 cycles/operation, 3 cycles/byte
test 14 ( 4096 byte blocks, 256 bytes per update, 16 updates): 1008 cycles/operation, 0 cycles/byte
test 15 ( 4096 byte blocks, 1024 bytes per update, 4 updates): 370 cycles/operation, 0 cycles/byte
test 16 ( 4096 byte blocks, 4096 bytes per update, 1 updates): 193 cycles/operation, 0 cycles/byte
test 17 ( 8192 byte blocks, 16 bytes per update, 512 updates): 27042 cycles/operation, 3 cycles/byte
test 18 ( 8192 byte blocks, 256 bytes per update, 32 updates): 1854 cycles/operation, 0 cycles/byte
test 19 ( 8192 byte blocks, 1024 bytes per update, 8 updates): 576 cycles/operation, 0 cycles/byte
test 20 ( 8192 byte blocks, 4096 bytes per update, 2 updates): 253 cycles/operation, 0 cycles/byte
test 21 ( 8192 byte blocks, 8192 bytes per update, 1 updates): 241 cycles/operation, 0 cycles/byte

This is a dry run with a digest_null where all the functions
are stubbed out (i.e., just a return 0). So this measures the
overhead of the benchmark itself.

Now with a run over a digest_null that simply touches all the
data:

testing speed of digest_null
test 0 ( 16 byte blocks, 16 bytes per update, 1 updates): 193 cycles/operation, 12 cycles/byte
test 1 ( 64 byte blocks, 16 bytes per update, 4 updates): 369 cycles/operation, 5 cycles/byte
test 2 ( 64 byte blocks, 64 bytes per update, 1 updates): 193 cycles/operation, 3 cycles/byte
test 3 ( 256 byte blocks, 16 bytes per update, 16 updates): 1010 cycles/operation, 3 cycles/byte
test 4 ( 256 byte blocks, 64 bytes per update, 4 updates): 364 cycles/operation, 1 cycles/byte
test 5 ( 256 byte blocks, 256 bytes per update, 1 updates): 191 cycles/operation, 0 cycles/byte
test 6 ( 1024 byte blocks, 16 bytes per update, 64 updates): 3538 cycles/operation, 3 cycles/byte
test 7 ( 1024 byte blocks, 256 bytes per update, 4 updates): 370 cycles/operation, 0 cycles/byte
test 8 ( 1024 byte blocks, 1024 bytes per update, 1 updates): 192 cycles/operation, 0 cycles/byte
test 9 ( 2048 byte blocks, 16 bytes per update, 128 updates): 6927 cycles/operation, 3 cycles/byte
test 10 ( 2048 byte blocks, 256 bytes per update, 8 updates): 576 cycles/operation, 0 cycles/byte
test 11 ( 2048 byte blocks, 1024 bytes per update, 2 updates): 259 cycles/operation, 0 cycles/byte
test 12 ( 2048 byte blocks, 2048 bytes per update, 1 updates): 192 cycles/operation, 0 cycles/byte
test 13 ( 4096 byte blocks, 16 bytes per update, 256 updates): 13624 cycles/operation, 3 cycles/byte
test 14 ( 4096 byte blocks, 256 bytes per update, 16 updates): 1001 cycles/operation, 0 cycles/byte
test 15 ( 4096 byte blocks, 1024 bytes per update, 4 updates): 365 cycles/operation, 0 cycles/byte
test 16 ( 4096 byte blocks, 4096 bytes per update, 1 updates): 192 cycles/operation, 0 cycles/byte
test 17 ( 8192 byte blocks, 16 bytes per update, 512 updates): 27095 cycles/operation, 3 cycles/byte
test 18 ( 8192 byte blocks, 256 bytes per update, 32 updates): 1854 cycles/operation, 0 cycles/byte
test 19 ( 8192 byte blocks, 1024 bytes per update, 8 updates): 578 cycles/operation, 0 cycles/byte
test 20 ( 8192 byte blocks, 4096 bytes per update, 2 updates): 255 cycles/operation, 0 cycles/byte
test 21 ( 8192 byte blocks, 8192 bytes per update, 1 updates): 241 cycles/operation, 0 cycles/byte

As you can see, the crypto API overhead is pretty much lost in
the noise.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 16:16:35

by Chris Mason

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 23:42 +0800, Herbert Xu wrote:
> Chris Mason <[email protected]> wrote:
> >
> >>From a performance point of view I'm probably reading the crypto API
> > code wrong, but it looks like my choices are to either have a long
> > standing context and use locking around the digest/hash calls to protect
> > internal crypto state, or create a new context every time and take a
> > perf hit while crypto looks up the right module.
>
> You're looking at the old hash interface. New users should use the
> ahash interface which was only recently added to the kernel. It
> lets you store the state in the request object which you pass to
> the algorithm on every call. This means that you only need one
> tfm in the entire system for crc32c.
>

Great to hear, that solves my main concern then. There is still the
embedded argument against needing all of crypto api just for libcrc32c.

It does make sense to me to have a libcrc32c that does the HW detection
and uses HW assist when present, and just have the cypto api call that.

-chris



2008-08-04 16:45:52

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

Chris Mason <[email protected]> wrote:
>
> Great to hear, that solves my main concern then. There is still the
> embedded argument against needing all of crypto api just for libcrc32c.

The existing users are iSCSI, SCTP, Infiniband, all of which are
clearly crucial in the embedded space :)

> It does make sense to me to have a libcrc32c that does the HW detection
> and uses HW assist when present, and just have the cypto api call that.

Well then you're going to have to do the check on every call.

Seriously, I'm happy to trim off any fat from the crypto API for the
embedded space. For a start, if you only needed hashing then we
could do without the legacy cipher/compress support. That shaves
off 800 bytes on i386. There is also still some legacy code in
api.c itself. Getting rid of them should get us to around 2K.

On the other hand, one of the advantages of doing it through the
crypto API is that this kind of selection is useful for quite a
few operations, e.g., xor or even memcpy.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 17:06:15

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Tue, 05 Aug 2008 00:45:34 +0800
Herbert Xu <[email protected]> wrote:

> On the other hand, one of the advantages of doing it through the
> crypto API is that this kind of selection is useful for quite a
> few operations, e.g., xor or even memcpy.

well you still have that indirect function call

for libcrc32 we could alternatives() that...


--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2008-08-04 17:10:38

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, Aug 04, 2008 at 10:06:05AM -0700, Arjan van de Ven wrote:
>
> well you still have that indirect function call
>
> for libcrc32 we could alternatives() that...

I don't see why you couldn't do that for the crypto API too
if you wanted to. That way it would benefit all crypto users
rather than just the crc32c (note the extra c) users.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 17:13:35

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Tue, Aug 05, 2008 at 01:10:17AM +0800, Herbert Xu wrote:
>
> I don't see why you couldn't do that for the crypto API too
> if you wanted to. That way it would benefit all crypto users
> rather than just the crc32c (note the extra c) users.

Anyway, the point here is the crc32c is nothing special. It's
just one out of many algorithms that has/will have hardware
acceleration support.

Rather than doing ad-hoc implementations and optimising that
whenever such a thing pops up, let's spend our effort in creating
a common platform that can be reused.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-04 17:29:30

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 04 Aug 2008 05:35:07 -0400 Austin Zhang wrote:

> diff -Naurp linux-2.6/crypto/Kconfig linux-2.6-patch/crypto/Kconfig
> --- linux-2.6/crypto/Kconfig 2008-08-04 01:08:00.000000000 -0400
> +++ linux-2.6-patch/crypto/Kconfig 2008-08-04 01:59:00.000000000 -0400
> @@ -221,6 +221,17 @@ config CRYPTO_CRC32C
> See Castagnoli93. This implementation uses lib/libcrc32c.
> Module will be crc32c.
>
> +config CRYPTO_CRC32C_INTEL
> + tristate "CRC32c INTEL hardware acceleration"
> + select CRYPTO_HASH
> + help
> + In Intel processor with SSE4.2 supported, the processor will
> + support CRC32C implemetation using hardware accelerated CRC32

implementation

> + instruction. This option will create 'crc32c-intel' module,
> + which will enable any routine to use the CRC32 instruction to
> + gain performance compared with software implementation.
> + Module will be crc32c-intel.

and don't end lines with spaces...


---
~Randy
Linux Plumbers Conference, 17-19 September 2008, Portland, Oregon USA
http://linuxplumbersconf.org/

2008-08-04 18:02:38

by Benoit Boissinot

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, Aug 4, 2008 at 5:42 PM, Herbert Xu <[email protected]> wrote:
> Chris Mason <[email protected]> wrote:
>>
>>>From a performance point of view I'm probably reading the crypto API
>> code wrong, but it looks like my choices are to either have a long
>> standing context and use locking around the digest/hash calls to protect
>> internal crypto state, or create a new context every time and take a
>> perf hit while crypto looks up the right module.
>
> You're looking at the old hash interface. New users should use the
> ahash interface which was only recently added to the kernel. It
> lets you store the state in the request object which you pass to
> the algorithm on every call. This means that you only need one
> tfm in the entire system for crc32c.
>
> BTW, don't let the a in ahash intimidate you. It's meant to support
> synchronous implementations such as the Intel instruction just as
> well as asynchronous ones.

Since I couldn't find any ahash user in the tree (outside of tcrypt.c), can you
provide some example source code as to how to use it (especially synchonously).

For example the code for the digest_null testing would be fine.

regards,

Benoit

2008-08-05 02:09:11

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

Benoit Boissinot <[email protected]> wrote:
>
> Since I couldn't find any ahash user in the tree (outside of tcrypt.c), can you
> provide some example source code as to how to use it (especially synchonously).
>
> For example the code for the digest_null testing would be fine.

Sure, here is the async hash speed test. I haven't pushed it
yet because I'm thinking of picking up on David Howells' idea
of creating a sync hash interface that doesn't use scatterlists.

Note that you'll need the appended patch for this to compile as
the partial ahash functions were missing prototypes.

static int test_hash_cycles(struct ahash_request *req, struct scatterlist *sg,
int blen, int plen, char *out)
{
unsigned long cycles = 0;
int i, pcount;
int ret;

if (plen == blen)
return test_hash_cycles_digest(req, sg, blen, out);

ahash_request_set_crypt(req, sg, out, plen);

local_bh_disable();
local_irq_disable();

/* Warm-up run. */
for (i = 0; i < 4; i++) {
ret = crypto_ahash_init(req);
if (ret)
goto out;
for (pcount = 0; pcount < blen; pcount += plen) {
ret = crypto_ahash_update(req);
if (ret)
goto out;
}
ret = crypto_ahash_final(req);
if (ret)
goto out;
}

/* The real thing. */
for (i = 0; i < 8; i++) {
cycles_t start, end;

start = get_cycles();

ret = crypto_ahash_init(req);
if (ret)
goto out;
for (pcount = 0; pcount < blen; pcount += plen) {
ret = crypto_ahash_update(req);
if (ret)
goto out;
}
ret = crypto_ahash_final(req);
if (ret)
goto out;

end = get_cycles();

cycles += end - start;
}

out:
local_irq_enable();
local_bh_enable();

if (ret)
return ret;

printk("%6lu cycles/operation, %4lu cycles/byte\n",
cycles / 8, cycles / (8 * blen));

return 0;
}

static void test_hash_speed(const char *algo, unsigned int sec,
struct hash_speed *speed)
{
struct scatterlist sg[TVMEMSIZE];
struct crypto_ahash *tfm;
char output[1024];
int i;
int ret;

printk("\ntesting speed of %s\n", algo);

tfm = crypto_alloc_ahash(algo, 0, CRYPTO_ALG_ASYNC);

if (IS_ERR(tfm)) {
printk("failed to load transform for %s: %ld\n", algo,
PTR_ERR(tfm));
return;
}

{
struct {
struct ahash_request req;
char ctx[crypto_ahash_reqsize(tfm)];
} req;

ahash_request_set_tfm(&req.req, tfm);
ahash_request_set_callback(&req.req, 0, NULL, NULL);

if (crypto_ahash_digestsize(tfm) > sizeof(output)) {
printk("digestsize(%u) > outputbuffer(%zu)\n",
crypto_ahash_digestsize(tfm), sizeof(output));
goto out;
}

sg_init_table(sg, TVMEMSIZE);
for (i = 0; i < TVMEMSIZE; i++) {
sg_set_buf(sg + i, tvmem[i], PAGE_SIZE);
memset(tvmem[i], 0xff, PAGE_SIZE);
}

for (i = 0; speed[i].blen != 0; i++) {
if (speed[i].blen > TVMEMSIZE * PAGE_SIZE) {
printk("template (%u) too big for tvmem (%lu)\n",
speed[i].blen, TVMEMSIZE * PAGE_SIZE);
goto out;
}

printk("test%3u (%5u byte blocks,%5u bytes per update,%4u updates): ",
i, speed[i].blen, speed[i].plen, speed[i].blen / speed[i].plen);

if (sec)
ret = test_hash_jiffies(&req.req, sg, speed[i].blen,
speed[i].plen, output, sec);
else
ret = test_hash_cycles(&req.req, sg, speed[i].blen,
speed[i].plen, output);

if (ret) {
printk("hashing failed ret=%d\n", ret);
break;
}
}
}

out:
crypto_free_ahash(tfm);
}

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index d12498e..ee48ef8 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -101,6 +101,24 @@ static inline int crypto_ahash_digest(struct ahash_request *req)
return crt->digest(req);
}

+static inline int crypto_ahash_init(struct ahash_request *req)
+{
+ struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req));
+ return crt->init(req);
+}
+
+static inline int crypto_ahash_update(struct ahash_request *req)
+{
+ struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req));
+ return crt->update(req);
+}
+
+static inline int crypto_ahash_final(struct ahash_request *req)
+{
+ struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req));
+ return crt->final(req);
+}
+
static inline void ahash_request_set_tfm(struct ahash_request *req,
struct crypto_ahash *tfm)
{

2008-08-05 09:53:22

by Austin Zhang

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 15:13 +0100, David Woodhouse wrote:
> Am I missing something here, or are you registering the crypto algorithm
> _unconditionally_ and then just causing init requests for it to fail on
> older hardware? Wouldn't it be better to register the driver _only_
> when the hardware is capable? Or at least "if at least one cpu is
> capable".
Thanks, I will move it to module init.
> I think that should depend on CONFIG_X86?
Thanks.


2008-08-05 10:00:56

by Austin Zhang

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 22:19 +0800, Herbert Xu wrote:
> On Mon, Aug 04, 2008 at 05:35:07AM -0400, Austin Zhang wrote:
> >
> > +config CRYPTO_CRC32C_INTEL
> > + tristate "CRC32c INTEL hardware acceleration"
> > + select CRYPTO_HASH
>
> You need some sort of a dependency here. See what the other
> assembly algorithms do it.
>
> Cheers,
How about:
+config CRYPTO_CRC32C_INTEL
+ tristate "CRC32c INTEL hardware acceleration"
+ depends on X86
+ select CRYPTO_ALGAPI
+ help
+ In Intel processor with SSE4.2 supported, the processor will
......

It should only depend on X86.


2008-08-05 10:14:44

by Austin Zhang

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Mon, 2008-08-04 at 10:27 -0700, Randy Dunlap wrote:
> On Mon, 04 Aug 2008 05:35:07 -0400 Austin Zhang wrote:
>
> > diff -Naurp linux-2.6/crypto/Kconfig linux-2.6-patch/crypto/Kconfig
> > --- linux-2.6/crypto/Kconfig 2008-08-04 01:08:00.000000000 -0400
> > +++ linux-2.6-patch/crypto/Kconfig 2008-08-04 01:59:00.000000000 -0400
> > @@ -221,6 +221,17 @@ config CRYPTO_CRC32C
> > See Castagnoli93. This implementation uses lib/libcrc32c.
> > Module will be crc32c.
> >
> > +config CRYPTO_CRC32C_INTEL
> > + tristate "CRC32c INTEL hardware acceleration"
> > + select CRYPTO_HASH
> > + help
> > + In Intel processor with SSE4.2 supported, the processor will
> > + support CRC32C implemetation using hardware accelerated CRC32
>
> implementation
>
> > + instruction. This option will create 'crc32c-intel' module,
> > + which will enable any routine to use the CRC32 instruction to
> > + gain performance compared with software implementation.
> > + Module will be crc32c-intel.
>
> and don't end lines with spaces...
>
>
Thanks a lot:)


2008-08-05 10:44:14

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Tue, Aug 05, 2008 at 05:59:11AM -0400, Austin Zhang wrote:
>
> How about:
> +config CRYPTO_CRC32C_INTEL
> + tristate "CRC32c INTEL hardware acceleration"
> + depends on X86
> + select CRYPTO_ALGAPI
> + help
> + In Intel processor with SSE4.2 supported, the processor will
> ......
>
> It should only depend on X86.

Yes that looks good.

Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-05 11:10:29

by Helge Hafting

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

Herbert Xu wrote:
> On Tue, Aug 05, 2008 at 01:10:17AM +0800, Herbert Xu wrote:
>
>> I don't see why you couldn't do that for the crypto API too
>> if you wanted to. That way it would benefit all crypto users
>> rather than just the crc32c (note the extra c) users.
>>
>
> Anyway, the point here is the crc32c is nothing special. It's
> just one out of many algorithms that has/will have hardware
> acceleration support.
>
> Rather than doing ad-hoc implementations and optimising that
> whenever such a thing pops up, let's spend our effort in creating
> a common platform that can be reused.
>
How about making crc32c an inline function then?
On processors that have this feature, this compiles to that single
instruction, plus whatever setup it needs. Nice and efficient.
On other processors, either inline the algorithm or inline
a call to an out of line function, depending on how bulky this is.

Similiar for any other functions that may or may not have hw support.

Helge Hafting

2008-08-05 14:04:58

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] Using Intel CRC32 instruction to accelerate CRC32c algorithm by new crypto API.

On Tue, Aug 05, 2008 at 01:10:25PM +0200, Helge Hafting wrote:
>
> How about making crc32c an inline function then?
> On processors that have this feature, this compiles to that single
> instruction, plus whatever setup it needs. Nice and efficient.
> On other processors, either inline the algorithm or inline
> a call to an out of line function, depending on how bulky this is.

Please read the thread carefully. Being a single instruction
is nothing special. The same thing applies for other algorithms
too, e.g., AES is also just a single instruction with the VIA
PadLock (and Intel in future).

The crypto API has handled this just fine.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt