Subject: [RFC] [crypto] padlock-aes loadkey ondemand

Signed-off-by: Sebastian Siewior <[email protected]>
---
Stefan, if you have some spare time could you please look if this patch
improves padlock + xts performance somehow?

drivers/crypto/padlock-aes.c | 35 +++++++++++++++++++++++++++++------
1 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index bb30eb9..1ebbe8c 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -48,6 +48,8 @@ struct aes_ctx {
u32 *D;
};

+static struct aes_ctx *last_key;
+
/* Tells whether the ACE is capable to generate
the extended key for a given key_len. */
static inline int
@@ -115,6 +117,7 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
ctx->cword.encrypt.ksize = (key_len - 16) / 8;
ctx->cword.decrypt.ksize = ctx->cword.encrypt.ksize;

+ last_key = ctx;
/* Don't generate extended keys if the hardware can do it. */
if (aes_hw_extkey_available(key_len))
return 0;
@@ -205,14 +208,22 @@ static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct aes_ctx *ctx = aes_ctx(tfm);
- padlock_reset_key();
+
+ if (last_key != ctx) {
+ last_key = ctx;
+ padlock_reset_key();
+ }
aes_crypt(in, out, ctx->E, &ctx->cword.encrypt);
}

static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct aes_ctx *ctx = aes_ctx(tfm);
- padlock_reset_key();
+
+ if (last_key != ctx) {
+ last_key = ctx;
+ padlock_reset_key();
+ }
aes_crypt(in, out, ctx->D, &ctx->cword.decrypt);
}

@@ -245,7 +256,10 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc,
struct blkcipher_walk walk;
int err;

- padlock_reset_key();
+ if (last_key != ctx) {
+ last_key = ctx;
+ padlock_reset_key();
+ }

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -269,7 +283,10 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc,
struct blkcipher_walk walk;
int err;

- padlock_reset_key();
+ if (last_key != ctx) {
+ last_key = ctx;
+ padlock_reset_key();
+ }

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -315,7 +332,10 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc,
struct blkcipher_walk walk;
int err;

- padlock_reset_key();
+ if (last_key != ctx) {
+ last_key = ctx;
+ padlock_reset_key();
+ }

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -341,7 +361,10 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc,
struct blkcipher_walk walk;
int err;

- padlock_reset_key();
+ if (last_key != ctx) {
+ last_key = ctx;
+ padlock_reset_key();
+ }

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
--
1.5.4.3



2008-03-30 19:03:36

by Stefan Hellermann

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

Sebastian Siewior schrieb:
> Signed-off-by: Sebastian Siewior <[email protected]>
> ---
> Stefan, if you have some spare time could you please look if this patch
> improves padlock + xts performance somehow?

Hi,

I tested this patch with success, "512 bit key, 8192 byte blocks" is 29% faster!
Thanks for this work!

Tested-By: Stefan Hellermann

(Or what is the appropriate line here?)


Here's tcrypt mode=200 output for xts without this patch
testing speed of xts(aes) decryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 947 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1645 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 4393 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 15385 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 118990 cycles (8192 bytes)
test 5 (384 bit key, 16 byte blocks): 1 operation in 962 cycles (16 bytes)
test 6 (384 bit key, 64 byte blocks): 1 operation in 1688 cycles (64 bytes)
test 7 (384 bit key, 256 byte blocks): 1 operation in 4532 cycles (256 bytes)
test 8 (384 bit key, 1024 byte blocks): 1 operation in 15908 cycles (1024 bytes)
test 9 (384 bit key, 8192 byte blocks): 1 operation in 123097 cycles (8192 bytes)
test 10 (512 bit key, 16 byte blocks): 1 operation in 974 cycles (16 bytes)
test 11 (512 bit key, 64 byte blocks): 1 operation in 1710 cycles (64 bytes)
test 12 (512 bit key, 256 byte blocks): 1 operation in 4664 cycles (256 bytes)
test 13 (512 bit key, 1024 byte blocks): 1 operation in 16424 cycles (1024 bytes)
test 14 (512 bit key, 8192 byte blocks): 1 operation in 127197 cycles (8192 bytes)

and here's the output with this patch:
testing speed of xts(aes) decryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 952 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1462 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 3454 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 11422 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 86779 cycles (8192 bytes)
test 5 (384 bit key, 16 byte blocks): 1 operation in 967 cycles (16 bytes)
test 6 (384 bit key, 64 byte blocks): 1 operation in 1467 cycles (64 bytes)
test 7 (384 bit key, 256 byte blocks): 1 operation in 3473 cycles (256 bytes)
test 8 (384 bit key, 1024 byte blocks): 1 operation in 11441 cycles (1024 bytes)
test 9 (384 bit key, 8192 byte blocks): 1 operation in 86798 cycles (8192 bytes)
test 10 (512 bit key, 16 byte blocks): 1 operation in 979 cycles (16 bytes)
test 11 (512 bit key, 64 byte blocks): 1 operation in 1503 cycles (64 bytes)
test 12 (512 bit key, 256 byte blocks): 1 operation in 3605 cycles (256 bytes)
test 13 (512 bit key, 1024 byte blocks): 1 operation in 11957 cycles (1024 bytes)
test 14 (512 bit key, 8192 byte blocks): 1 operation in 90898 cycles (8192 bytes)

But it's far away from aes(cbc)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 8912 cycles (8192 bytes)
... cbc is now only 10 times faster, not 14 times :)

Stefan

>
> drivers/crypto/padlock-aes.c | 35 +++++++++++++++++++++++++++++------
> 1 files changed, 29 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
> index bb30eb9..1ebbe8c 100644
> --- a/drivers/crypto/padlock-aes.c
> +++ b/drivers/crypto/padlock-aes.c
> @@ -48,6 +48,8 @@ struct aes_ctx {
> u32 *D;
> };
>
> +static struct aes_ctx *last_key;
> +
> /* Tells whether the ACE is capable to generate
> the extended key for a given key_len. */
> static inline int
> @@ -115,6 +117,7 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
> ctx->cword.encrypt.ksize = (key_len - 16) / 8;
> ctx->cword.decrypt.ksize = ctx->cword.encrypt.ksize;
>
> + last_key = ctx;
> /* Don't generate extended keys if the hardware can do it. */
> if (aes_hw_extkey_available(key_len))
> return 0;
> @@ -205,14 +208,22 @@ static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
> static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
> {
> struct aes_ctx *ctx = aes_ctx(tfm);
> - padlock_reset_key();
> +
> + if (last_key != ctx) {
> + last_key = ctx;
> + padlock_reset_key();
> + }
> aes_crypt(in, out, ctx->E, &ctx->cword.encrypt);
> }
>
> static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
> {
> struct aes_ctx *ctx = aes_ctx(tfm);
> - padlock_reset_key();
> +
> + if (last_key != ctx) {
> + last_key = ctx;
> + padlock_reset_key();
> + }
> aes_crypt(in, out, ctx->D, &ctx->cword.decrypt);
> }
>
> @@ -245,7 +256,10 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc,
> struct blkcipher_walk walk;
> int err;
>
> - padlock_reset_key();
> + if (last_key != ctx) {
> + last_key = ctx;
> + padlock_reset_key();
> + }
>
> blkcipher_walk_init(&walk, dst, src, nbytes);
> err = blkcipher_walk_virt(desc, &walk);
> @@ -269,7 +283,10 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc,
> struct blkcipher_walk walk;
> int err;
>
> - padlock_reset_key();
> + if (last_key != ctx) {
> + last_key = ctx;
> + padlock_reset_key();
> + }
>
> blkcipher_walk_init(&walk, dst, src, nbytes);
> err = blkcipher_walk_virt(desc, &walk);
> @@ -315,7 +332,10 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc,
> struct blkcipher_walk walk;
> int err;
>
> - padlock_reset_key();
> + if (last_key != ctx) {
> + last_key = ctx;
> + padlock_reset_key();
> + }
>
> blkcipher_walk_init(&walk, dst, src, nbytes);
> err = blkcipher_walk_virt(desc, &walk);
> @@ -341,7 +361,10 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc,
> struct blkcipher_walk walk;
> int err;
>
> - padlock_reset_key();
> + if (last_key != ctx) {
> + last_key = ctx;
> + padlock_reset_key();
> + }
>
> blkcipher_walk_init(&walk, dst, src, nbytes);
> err = blkcipher_walk_virt(desc, &walk);

Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

* Stefan Hellermann | 2008-03-30 21:03:18 [+0200]:

>Hi,
Hi,

>I tested this patch with success, "512 bit key, 8192 byte blocks" is 29% faster!
That's something.

>Thanks for this work!
np :)

>Tested-By: Stefan Hellermann
>
>(Or what is the appropriate line here?)
I thing that is okey if we add some numbers you provided.

>Here's tcrypt mode=200 output for xts without this patch
As far as I remember you posted once some MiB/sec dm-crypt numbers the
last time. Could you please post them as well since I don't which block
size is the majority of the read/write request.

>But it's far away from aes(cbc)
>test 14 (256 bit key, 8192 byte blocks): 1 operation in 8912 cycles (8192 bytes)
>... cbc is now only 10 times faster, not 14 times :)
hehe

>Stefan
Sebastian

2008-04-01 18:39:04

by Stefan Hellermann

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

Sebastian Siewior schrieb:
> * Stefan Hellermann | 2008-03-30 21:03:18 [+0200]:
>
>> Hi,
> Hi,
>
>> I tested this patch with success, "512 bit key, 8192 byte blocks" is 29% faster!
> That's something.
>
>> Thanks for this work!
> np :)
>
>> Tested-By: Stefan Hellermann
>>
>> (Or what is the appropriate line here?)
> I thing that is okey if we add some numbers you provided.
>
>> Here's tcrypt mode=200 output for xts without this patch
> As far as I remember you posted once some MiB/sec dm-crypt numbers the
> last time. Could you please post them as well since I don't which block
> size is the majority of the read/write request.

Now I have more numbers :)
read from a device through dm-crypt with aes-cts-plain, 256-bit mode (512bit keysize for
cryptsetup)
without this patch: 54MB/s with plain dd(blocksize 512b), 51MB/s with dd and a blocksize
of 512kib (Yes, it's slower with a larger blocksize!)
with this patch: 63MB/s with dd and a blocksize of 512b, 64MB/s with a blocksize of 512k
(This time the larger blocksize was a little bit faster in each test)
I did 5 tests for each combination, top shows everytime ~90% user and ~0% wait

Stefan

2008-04-02 06:04:03

by Herbert Xu

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

Sebastian Siewior <[email protected]> wrote:
>
> @@ -205,14 +208,22 @@ static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
> static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
> {
> struct aes_ctx *ctx = aes_ctx(tfm);
> - padlock_reset_key();
> +
> + if (last_key != ctx) {
> + last_key = ctx;
> + padlock_reset_key();
> + }

What if user-space invokes the padlock when xts is preempted?

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

* Herbert Xu | 2008-04-02 14:03:54 [+0800]:

>Sebastian Siewior <[email protected]> wrote:
>>
>> @@ -205,14 +208,22 @@ static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
>> static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>> {
>> struct aes_ctx *ctx = aes_ctx(tfm);
>> - padlock_reset_key();
>> +
>> + if (last_key != ctx) {
>> + last_key = ctx;
>> + padlock_reset_key();
>> + }
>
>What if user-space invokes the padlock when xts is preempted?

>From VIA's programming guide:
|- EFLAGS:30 is set to 0 by any x86 instruction, interrupt, exception,
|task switch, etc. operation that causes EFLAGS to be stored (even if
|executed in 16-bit mode). Centaue Technology recommends using the
|instruction sequence PUSHFL; POPFL; prior to any REP XCRYPT instruction
|which uses a different key than the previous REP XCRYPT instruction.
|- EFLAGS:30 cannot be set to 1 by any x86 instruction that causes EFLAGS
|to be loaded. Only REP XCRYPT instructions set this bit to 1.

In that case we are safe aren't we? If we get preempted and user
space is invoked (or any other kernel thread) then the EFLAGS of the
task are restored what sets EFLAGS:30 to zero. Or did I misinterprete
it?

Sebastian

2008-04-02 06:31:59

by Herbert Xu

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

On Wed, Apr 02, 2008 at 08:26:59AM +0200, Sebastian Siewior wrote:
>
> In that case we are safe aren't we? If we get preempted and user
> space is invoked (or any other kernel thread) then the EFLAGS of the
> task are restored what sets EFLAGS:30 to zero. Or did I misinterprete
> it?

Good point. It should be fine then.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

* Stefan Hellermann | 2008-04-01 20:38:46 [+0200]:

>Now I have more numbers :)
>read from a device through dm-crypt with aes-cts-plain, 256-bit mode (512bit keysize for
>cryptsetup)
>without this patch: 54MB/s with plain dd(blocksize 512b), 51MB/s with dd and a blocksize
>of 512kib (Yes, it's slower with a larger blocksize!)
>with this patch: 63MB/s with dd and a blocksize of 512b, 64MB/s with a blocksize of 512k
>(This time the larger blocksize was a little bit faster in each test)
>I did 5 tests for each combination, top shows everytime ~90% user and ~0% wait
Thank you for you excellent work.

Herbert: I was going to Cc you and ask if the speed improvements are
okey / enough to consider this patch for inclusion but I see now that
you found this thread on your own :) Do you want me to form a patch?

>
>Stefan

Sebastian

2008-04-02 06:50:23

by Herbert Xu

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

On Wed, Apr 02, 2008 at 08:44:19AM +0200, Sebastian Siewior wrote:
>
> Herbert: I was going to Cc you and ask if the speed improvements are
> okey / enough to consider this patch for inclusion but I see now that
> you found this thread on your own :) Do you want me to form a patch?

Yes please! BTW, can VIA do SMP? If so then we'll need some code
to deal with that and if not, we should add a run-time check to
fail the module insertion if ever such a beast should arise.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

* Herbert Xu | 2008-04-02 14:50:15 [+0800]:

>On Wed, Apr 02, 2008 at 08:44:19AM +0200, Sebastian Siewior wrote:
>>
>> Herbert: I was going to Cc you and ask if the speed improvements are
>> okey / enough to consider this patch for inclusion but I see now that
>> you found this thread on your own :) Do you want me to form a patch?
>
>Yes please! BTW, can VIA do SMP? If so then we'll need some code
>to deal with that and if not, we should add a run-time check to
>fail the module insertion if ever such a beast should arise.
I'm not sure if they have SMP or multicore out / plans but I gave a
little thought on this early morning: The only problem is if we get
swapped to another cores before encryption process beginns and after key
reset thing. If the scheduler decides to swap cores than EFLAGS should
be reloaded and wer are fine. The only problem that I see with cpus > 1
is that we might reload the key if it is not required. This could be
solved with old_ctx pointer beeing a percpu.

>Cheers,
Sebastian

2008-04-02 07:23:05

by Herbert Xu

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

On Wed, Apr 02, 2008 at 09:17:22AM +0200, Sebastian Siewior wrote:
> I'm not sure if they have SMP or multicore out / plans but I gave a
> little thought on this early morning: The only problem is if we get
> swapped to another cores before encryption process beginns and after key
> reset thing. If the scheduler decides to swap cores than EFLAGS should
> be reloaded and wer are fine. The only problem that I see with cpus > 1
> is that we might reload the key if it is not required. This could be
> solved with old_ctx pointer beeing a percpu.

Yeah I think having a percpu would be good even if it isn't needed
right now.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-04-02 15:49:26

by Stefan Hellermann

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

Herbert Xu schrieb:
> On Wed, Apr 02, 2008 at 08:44:19AM +0200, Sebastian Siewior wrote:
>> Herbert: I was going to Cc you and ask if the speed improvements are
>> okey / enough to consider this patch for inclusion but I see now that
>> you found this thread on your own :) Do you want me to form a patch?
>
> Yes please! BTW, can VIA do SMP? If so then we'll need some code
> to deal with that and if not, we should add a run-time check to
> fail the module insertion if ever such a beast should arise.

Till now VIA has no SMP and nothing planed (as far as I know, I'm not a VIA guy). There
next gen processors will be called isaiah, without smp but with 64bit and lots of other
features ( http://www.via.com.tw/en/products/processors/isaiah-arch/ )

Cheers
Stefan

>
> Cheers,

2008-04-09 16:38:40

by Herbert Xu

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

On Wed, Apr 02, 2008 at 05:48:33PM +0200, Stefan Hellermann wrote:
>
> Till now VIA has no SMP and nothing planed (as far as I know, I'm not a VIA guy). There
> next gen processors will be called isaiah, without smp but with 64bit and lots of other
> features ( http://www.via.com.tw/en/products/processors/isaiah-arch/ )

Apparently they're planning a multi-core version in future so
we should definitely plan for that.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2008-08-07 14:53:26

by Herbert Xu

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

On Wed, Apr 02, 2008 at 03:22:58PM +0800, Herbert Xu wrote:
> On Wed, Apr 02, 2008 at 09:17:22AM +0200, Sebastian Siewior wrote:
> > I'm not sure if they have SMP or multicore out / plans but I gave a
> > little thought on this early morning: The only problem is if we get
> > swapped to another cores before encryption process beginns and after key
> > reset thing. If the scheduler decides to swap cores than EFLAGS should
> > be reloaded and wer are fine. The only problem that I see with cpus > 1
> > is that we might reload the key if it is not required. This could be
> > solved with old_ctx pointer beeing a percpu.
>
> Yeah I think having a percpu would be good even if it isn't needed
> right now.

I've coded this up with the per-cpu structure and it seems to work.
It doesn't make much of a difference for ECB/CBC but with LRW/XTS
it gives a huge boost.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 54a2a16..c71256a 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -15,6 +15,8 @@
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
#include <asm/byteorder.h>
#include "padlock.h"

@@ -48,6 +50,8 @@ struct aes_ctx {
u32 *D;
};

+static DEFINE_PER_CPU(struct cword *, last_cword);
+
/* Tells whether the ACE is capable to generate
the extended key for a given key_len. */
static inline int
@@ -88,6 +92,7 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
const __le32 *key = (const __le32 *)in_key;
u32 *flags = &tfm->crt_flags;
struct crypto_aes_ctx gen_aes;
+ int cpu;

if (key_len % 8) {
*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
@@ -117,7 +122,7 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,

/* Don't generate extended keys if the hardware can do it. */
if (aes_hw_extkey_available(key_len))
- return 0;
+ goto ok;

ctx->D = ctx->d_data;
ctx->cword.encrypt.keygen = 1;
@@ -130,19 +135,34 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,

memcpy(ctx->E, gen_aes.key_enc, AES_MAX_KEYLENGTH);
memcpy(ctx->D, gen_aes.key_dec, AES_MAX_KEYLENGTH);
+
+ok:
+ for_each_online_cpu(cpu)
+ if (&ctx->cword.encrypt == per_cpu(last_cword, cpu) ||
+ &ctx->cword.decrypt == per_cpu(last_cword, cpu))
+ per_cpu(last_cword, cpu) = NULL;
+
return 0;
}

/* ====== Encryption/decryption routines ====== */

/* These are the real call to PadLock. */
-static inline void padlock_reset_key(void)
+static inline void padlock_reset_key(struct cword *cword)
{
- asm volatile ("pushfl; popfl");
+ int cpu = raw_smp_processor_id();
+
+ if (cword != per_cpu(last_cword, cpu))
+ asm volatile ("pushfl; popfl");
+}
+
+static inline void padlock_store_cword(struct cword *cword)
+{
+ per_cpu(last_cword, raw_smp_processor_id()) = cword;
}

static inline void padlock_xcrypt(const u8 *input, u8 *output, void *key,
- void *control_word)
+ struct cword *control_word)
{
asm volatile (".byte 0xf3,0x0f,0xa7,0xc8" /* rep xcryptecb */
: "+S"(input), "+D"(output)
@@ -205,15 +225,17 @@ static inline u8 *padlock_xcrypt_cbc(const u8 *input, u8 *output, void *key,
static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct aes_ctx *ctx = aes_ctx(tfm);
- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.encrypt);
aes_crypt(in, out, ctx->E, &ctx->cword.encrypt);
+ padlock_store_cword(&ctx->cword.encrypt);
}

static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct aes_ctx *ctx = aes_ctx(tfm);
- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.decrypt);
aes_crypt(in, out, ctx->D, &ctx->cword.decrypt);
+ padlock_store_cword(&ctx->cword.decrypt);
}

static struct crypto_alg aes_alg = {
@@ -245,7 +267,7 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc,
struct blkcipher_walk walk;
int err;

- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.encrypt);

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -258,6 +280,8 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc,
err = blkcipher_walk_done(desc, &walk, nbytes);
}

+ padlock_store_cword(&ctx->cword.encrypt);
+
return err;
}

@@ -269,7 +293,7 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc,
struct blkcipher_walk walk;
int err;

- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.decrypt);

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -282,6 +306,8 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc,
err = blkcipher_walk_done(desc, &walk, nbytes);
}

+ padlock_store_cword(&ctx->cword.decrypt);
+
return err;
}

@@ -315,7 +341,7 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc,
struct blkcipher_walk walk;
int err;

- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.encrypt);

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -330,6 +356,8 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc,
err = blkcipher_walk_done(desc, &walk, nbytes);
}

+ padlock_store_cword(&ctx->cword.encrypt);
+
return err;
}

@@ -341,7 +369,7 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc,
struct blkcipher_walk walk;
int err;

- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.decrypt);

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -354,6 +382,8 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc,
err = blkcipher_walk_done(desc, &walk, nbytes);
}

+ padlock_store_cword(&ctx->cword.decrypt);
+
return err;
}


Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

* Herbert Xu | 2008-08-07 22:53:12 [+0800]:

>I've coded this up with the per-cpu structure and it seems to work.
yup, excellent. Thanks for finishing this. I had the following comment
above my DEFINE_PER_CPU definition (I replaced my definition with
last_cword):
|/*
| * PER-CPU last_cword is here to make this code SMP / future safe and works as
| * follows: After every crypt operation, the CPU sets EFLAGS:30 to indicate
| * that the key is already in the CPU and catch-the-key-from-memory operation
| * can be saved. This bit needs to be cleared if we change the key. This bit is
| * auto-cleared by the CPU on every context switch (and can not be set by the
| * kernel). We don't lock our self to one CPU during the crypt process because
| * it isn't required: If we get scheduled to another CPU after the var check
| * and before the crypt operation than the CPU clears the EFLAGS:30 bit and the
| * CPU reloads the key anyway. The same things happens if we get scheduled to
| * another CPU after the crypt operation but before we set last_cword.
| */

I had a few missing pieces and was never was in the mode to finish it.

>It doesn't make much of a difference for ECB/CBC but with LRW/XTS
>it gives a huge boost.
Well, the spec says the key reload takes a lot of time so it should give
a big boost to everything what isn't supported directly in HW.
The padlock supports a few other modes like OFB but I doubt they are
required since they aren't even implemented in SW.

>Cheers,

Sebastian

2008-08-31 06:01:22

by Herbert Xu

[permalink] [raw]
Subject: Re: [RFC] [crypto] padlock-aes loadkey ondemand

On Sat, Aug 09, 2008 at 01:20:13PM +0200, Sebastian Siewior wrote:
>
> Well, the spec says the key reload takes a lot of time so it should give
> a big boost to everything what isn't supported directly in HW.
> The padlock supports a few other modes like OFB but I doubt they are
> required since they aren't even implemented in SW.

I've finally committed it as follows.

commit d94604ab04c6711d7cf9ad086cf5f6fa446d8eb6
Author: Herbert Xu <[email protected]>
Date: Sun Aug 31 15:58:45 2008 +1000

crypto: padlock - Avoid resetting cword on successive operations

Resetting the control word is quite expensive. Fortunately this
isn't an issue for the common operations such as CBC and ECB as
the whole operation is done through a single call. However, modes
such as LRW and XTS have to call padlock over and over again for
one operation which really hurts if each call resets the control
word.

This patch uses an idea by Sebastian Siewior to store the last
control word used on a CPU and only reset the control word if
that changes.

Note that any task switch automatically resets the control word
so we only need to be accurate with regard to the stored control
word when no task switches occur.

Signed-off-by: Herbert Xu <[email protected]>

diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index bf2917d..856b3cc 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -15,6 +15,8 @@
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
#include <asm/byteorder.h>
#include <asm/i387.h>
#include "padlock.h"
@@ -49,6 +51,8 @@ struct aes_ctx {
u32 *D;
};

+static DEFINE_PER_CPU(struct cword *, last_cword);
+
/* Tells whether the ACE is capable to generate
the extended key for a given key_len. */
static inline int
@@ -89,6 +93,7 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
const __le32 *key = (const __le32 *)in_key;
u32 *flags = &tfm->crt_flags;
struct crypto_aes_ctx gen_aes;
+ int cpu;

if (key_len % 8) {
*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
@@ -118,7 +123,7 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,

/* Don't generate extended keys if the hardware can do it. */
if (aes_hw_extkey_available(key_len))
- return 0;
+ goto ok;

ctx->D = ctx->d_data;
ctx->cword.encrypt.keygen = 1;
@@ -131,15 +136,30 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,

memcpy(ctx->E, gen_aes.key_enc, AES_MAX_KEYLENGTH);
memcpy(ctx->D, gen_aes.key_dec, AES_MAX_KEYLENGTH);
+
+ok:
+ for_each_online_cpu(cpu)
+ if (&ctx->cword.encrypt == per_cpu(last_cword, cpu) ||
+ &ctx->cword.decrypt == per_cpu(last_cword, cpu))
+ per_cpu(last_cword, cpu) = NULL;
+
return 0;
}

/* ====== Encryption/decryption routines ====== */

/* These are the real call to PadLock. */
-static inline void padlock_reset_key(void)
+static inline void padlock_reset_key(struct cword *cword)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (cword != per_cpu(last_cword, cpu))
+ asm volatile ("pushfl; popfl");
+}
+
+static inline void padlock_store_cword(struct cword *cword)
{
- asm volatile ("pushfl; popfl");
+ per_cpu(last_cword, raw_smp_processor_id()) = cword;
}

/*
@@ -149,7 +169,7 @@ static inline void padlock_reset_key(void)
*/

static inline void padlock_xcrypt(const u8 *input, u8 *output, void *key,
- void *control_word)
+ struct cword *control_word)
{
asm volatile (".byte 0xf3,0x0f,0xa7,0xc8" /* rep xcryptecb */
: "+S"(input), "+D"(output)
@@ -213,22 +233,24 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct aes_ctx *ctx = aes_ctx(tfm);
int ts_state;
- padlock_reset_key();

+ padlock_reset_key(&ctx->cword.encrypt);
ts_state = irq_ts_save();
aes_crypt(in, out, ctx->E, &ctx->cword.encrypt);
irq_ts_restore(ts_state);
+ padlock_store_cword(&ctx->cword.encrypt);
}

static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct aes_ctx *ctx = aes_ctx(tfm);
int ts_state;
- padlock_reset_key();

+ padlock_reset_key(&ctx->cword.encrypt);
ts_state = irq_ts_save();
aes_crypt(in, out, ctx->D, &ctx->cword.decrypt);
irq_ts_restore(ts_state);
+ padlock_store_cword(&ctx->cword.encrypt);
}

static struct crypto_alg aes_alg = {
@@ -261,7 +283,7 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc,
int err;
int ts_state;

- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.encrypt);

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -276,6 +298,8 @@ static int ecb_aes_encrypt(struct blkcipher_desc *desc,
}
irq_ts_restore(ts_state);

+ padlock_store_cword(&ctx->cword.encrypt);
+
return err;
}

@@ -288,7 +312,7 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc,
int err;
int ts_state;

- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.decrypt);

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -302,6 +326,9 @@ static int ecb_aes_decrypt(struct blkcipher_desc *desc,
err = blkcipher_walk_done(desc, &walk, nbytes);
}
irq_ts_restore(ts_state);
+
+ padlock_store_cword(&ctx->cword.encrypt);
+
return err;
}

@@ -336,7 +363,7 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc,
int err;
int ts_state;

- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.encrypt);

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -353,6 +380,8 @@ static int cbc_aes_encrypt(struct blkcipher_desc *desc,
}
irq_ts_restore(ts_state);

+ padlock_store_cword(&ctx->cword.decrypt);
+
return err;
}

@@ -365,7 +394,7 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc,
int err;
int ts_state;

- padlock_reset_key();
+ padlock_reset_key(&ctx->cword.encrypt);

blkcipher_walk_init(&walk, dst, src, nbytes);
err = blkcipher_walk_virt(desc, &walk);
@@ -380,6 +409,9 @@ static int cbc_aes_decrypt(struct blkcipher_desc *desc,
}

irq_ts_restore(ts_state);
+
+ padlock_store_cword(&ctx->cword.encrypt);
+
return err;
}

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt