2020-08-06 16:43:05

by Andrei Botila

Subject: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

From: Andrei Botila <[email protected]>

A hardware limitation exists for CAAM until Era 9 which restricts
the accelerator to IVs of only 8 bytes. When CAAM has a lower era,
a fallback is necessary to process 16-byte IVs.

Fixes: c6415a6016bf ("crypto: caam - add support for acipher xts(aes)")
Cc: <[email protected]> # v4.4+
Signed-off-by: Andrei Botila <[email protected]>
---
drivers/crypto/caam/caamalg.c | 68 ++++++++++++++++++++++++++++++++---
1 file changed, 64 insertions(+), 4 deletions(-)

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 91feda5b63f6..ebf4dc87ca2e 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -57,6 +57,7 @@
#include "key_gen.h"
#include "caamalg_desc.h"
#include <crypto/engine.h>
+#include <asm/unaligned.h>

/*
* crypto alg
@@ -114,10 +115,12 @@ struct caam_ctx {
struct alginfo adata;
struct alginfo cdata;
unsigned int authsize;
+ struct crypto_skcipher *fallback;
};

struct caam_skcipher_req_ctx {
struct skcipher_edesc *edesc;
+ struct skcipher_request fallback_req;
};

struct caam_aead_req_ctx {
@@ -830,12 +833,17 @@ static int xts_skcipher_setkey(struct crypto_skcipher *skcipher, const u8 *key,
struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
struct device *jrdev = ctx->jrdev;
u32 *desc;
+ int err;

if (keylen != 2 * AES_MIN_KEY_SIZE && keylen != 2 * AES_MAX_KEY_SIZE) {
dev_dbg(jrdev, "key size mismatch\n");
return -EINVAL;
}

+ err = crypto_skcipher_setkey(ctx->fallback, key, keylen);
+ if (err)
+ return err;
+
ctx->cdata.keylen = keylen;
ctx->cdata.key_virt = key;
ctx->cdata.key_inline = true;
@@ -1755,6 +1763,20 @@ static int skcipher_do_one_req(struct crypto_engine *engine, void *areq)
return ret;
}

+static bool xts_skcipher_ivsize(struct skcipher_request *req)
+{
+ struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
+ unsigned int ivsize = crypto_skcipher_ivsize(skcipher);
+ u64 size = 0;
+
+ if (IS_ALIGNED((unsigned long)req->iv, __alignof__(u64)))
+ size = *(u64 *)(req->iv + (ivsize / 2));
+ else
+ size = get_unaligned((u64 *)(req->iv + (ivsize / 2)));
+
+ return !!size;
+}
+
static inline int skcipher_crypt(struct skcipher_request *req, bool encrypt)
{
struct skcipher_edesc *edesc;
@@ -1768,6 +1790,21 @@ static inline int skcipher_crypt(struct skcipher_request *req, bool encrypt)
if (!req->cryptlen)
return 0;

+ if (ctx->fallback && xts_skcipher_ivsize(req)) {
+ struct caam_skcipher_req_ctx *rctx = skcipher_request_ctx(req);
+
+ skcipher_request_set_tfm(&rctx->fallback_req, ctx->fallback);
+ skcipher_request_set_callback(&rctx->fallback_req,
+ req->base.flags,
+ req->base.complete,
+ req->base.data);
+ skcipher_request_set_crypt(&rctx->fallback_req, req->src,
+ req->dst, req->cryptlen, req->iv);
+
+ return encrypt ? crypto_skcipher_encrypt(&rctx->fallback_req) :
+ crypto_skcipher_decrypt(&rctx->fallback_req);
+ }
+
/* allocate extended descriptor */
edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
if (IS_ERR(edesc))
@@ -1905,6 +1942,7 @@ static struct caam_skcipher_alg driver_algs[] = {
.base = {
.cra_name = "xts(aes)",
.cra_driver_name = "xts-aes-caam",
+ .cra_flags = CRYPTO_ALG_NEED_FALLBACK,
.cra_blocksize = AES_BLOCK_SIZE,
},
.setkey = xts_skcipher_setkey,
@@ -3344,12 +3382,30 @@ static int caam_cra_init(struct crypto_skcipher *tfm)
struct caam_skcipher_alg *caam_alg =
container_of(alg, typeof(*caam_alg), skcipher);
struct caam_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 alg_aai = caam_alg->caam.class1_alg_type & OP_ALG_AAI_MASK;

crypto_skcipher_set_reqsize(tfm, sizeof(struct caam_skcipher_req_ctx));

ctx->enginectx.op.do_one_request = skcipher_do_one_req;

- return caam_init_common(crypto_skcipher_ctx(tfm), &caam_alg->caam,
+ if (alg_aai == OP_ALG_AAI_XTS) {
+ const char *tfm_name = crypto_tfm_alg_name(&tfm->base);
+ struct crypto_skcipher *fallback;
+
+ fallback = crypto_alloc_skcipher(tfm_name, 0,
+ CRYPTO_ALG_NEED_FALLBACK);
+ if (IS_ERR(fallback)) {
+ pr_err("Failed to allocate %s fallback: %ld\n",
+ tfm_name, PTR_ERR(fallback));
+ return PTR_ERR(fallback);
+ }
+
+ ctx->fallback = fallback;
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct caam_skcipher_req_ctx) +
+ crypto_skcipher_reqsize(fallback));
+ }
+
+ return caam_init_common(ctx, &caam_alg->caam,
false);
}

@@ -3378,7 +3434,11 @@ static void caam_exit_common(struct caam_ctx *ctx)

static void caam_cra_exit(struct crypto_skcipher *tfm)
{
- caam_exit_common(crypto_skcipher_ctx(tfm));
+ struct caam_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ if (ctx->fallback)
+ crypto_free_skcipher(ctx->fallback);
+ caam_exit_common(ctx);
}

static void caam_aead_exit(struct crypto_aead *tfm)
@@ -3412,8 +3472,8 @@ static void caam_skcipher_alg_init(struct caam_skcipher_alg *t_alg)
alg->base.cra_module = THIS_MODULE;
alg->base.cra_priority = CAAM_CRA_PRIORITY;
alg->base.cra_ctxsize = sizeof(struct caam_ctx);
- alg->base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY |
- CRYPTO_ALG_KERN_DRIVER_ONLY;
+ alg->base.cra_flags |= (CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY |
+ CRYPTO_ALG_KERN_DRIVER_ONLY);

alg->init = caam_cra_init;
alg->exit = caam_cra_exit;
--
2.17.1


2020-08-11 14:31:39

by Horia Geanta

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On 8/6/2020 7:36 PM, Andrei Botila (OSS) wrote:
> @@ -3344,12 +3382,30 @@ static int caam_cra_init(struct crypto_skcipher *tfm)
> struct caam_skcipher_alg *caam_alg =
> container_of(alg, typeof(*caam_alg), skcipher);
> struct caam_ctx *ctx = crypto_skcipher_ctx(tfm);
> + u32 alg_aai = caam_alg->caam.class1_alg_type & OP_ALG_AAI_MASK;
>
> crypto_skcipher_set_reqsize(tfm, sizeof(struct caam_skcipher_req_ctx));
crypto_skcipher_set_reqsize() is being called twice in the XTS case.
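
One way to avoid that (only a sketch, untested) is to set the request
size exactly once on each path, e.g.:

	if (alg_aai == OP_ALG_AAI_XTS) {
		const char *tfm_name = crypto_tfm_alg_name(&tfm->base);
		struct crypto_skcipher *fallback;

		fallback = crypto_alloc_skcipher(tfm_name, 0,
						 CRYPTO_ALG_NEED_FALLBACK);
		if (IS_ERR(fallback))
			return PTR_ERR(fallback);

		ctx->fallback = fallback;
		/* leave room for the fallback request in the req ctx */
		crypto_skcipher_set_reqsize(tfm,
					    sizeof(struct caam_skcipher_req_ctx) +
					    crypto_skcipher_reqsize(fallback));
	} else {
		crypto_skcipher_set_reqsize(tfm,
					    sizeof(struct caam_skcipher_req_ctx));
	}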

>
> ctx->enginectx.op.do_one_request = skcipher_do_one_req;
>
> - return caam_init_common(crypto_skcipher_ctx(tfm), &caam_alg->caam,
> + if (alg_aai == OP_ALG_AAI_XTS) {
> + const char *tfm_name = crypto_tfm_alg_name(&tfm->base);
> + struct crypto_skcipher *fallback;
> +
> + fallback = crypto_alloc_skcipher(tfm_name, 0,
> + CRYPTO_ALG_NEED_FALLBACK);
The driver should select CRYPTO_XTS, so that at least the generic
xts implementation is available.

> + if (IS_ERR(fallback)) {
> + pr_err("Failed to allocate %s fallback: %ld\n",
> + tfm_name, PTR_ERR(fallback));
> + return PTR_ERR(fallback);
Shouldn't error out so early. It might be that the fallback won't be needed.
Let's postpone this until we're sure fallback is required.

Horia

2020-08-19 23:57:56

by Sasha Levin

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

Hi

[This is an automated email]

This commit has been processed because it contains a "Fixes:" tag
fixing commit: c6415a6016bf ("crypto: caam - add support for acipher xts(aes)").

The bot has tested the following trees: v5.8.1, v5.7.15, v5.4.58, v4.19.139, v4.14.193, v4.9.232, v4.4.232.

v5.8.1: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")

v5.7.15: Failed to apply! Possible dependencies:
528f776df67c ("crypto: qat - allow xts requests not multiple of block")
a85211f36f3d ("crypto: qat - fallback for xts with 192 bit keys")
b185a68710e0 ("crypto: qat - validate xts key")
b8aa7dc5c753 ("crypto: drivers - set the flag CRYPTO_ALG_ALLOCATES_MEMORY")
da6a66853a38 ("crypto: caam - silence .setkey in case of bad key length")

v5.4.58: Failed to apply! Possible dependencies:
1c2402266713 ("crypto: caam - add crypto_engine support for AEAD algorithms")
4d370a103695 ("crypto: caam - change return code in caam_jr_enqueue function")
b7f17fe28144 ("crypto: caam - refactor skcipher/aead/gcm/chachapoly {en,de}crypt functions")
d53e44fe980b ("crypto: caam - refactor RSA private key _done callbacks")
ee38767f152a ("crypto: caam - support crypto_engine framework for SKCIPHER algorithms")

v4.19.139: Failed to apply! Possible dependencies:
0efa7579f3de ("crypto: caam - export ahash shared descriptor generation")
1b46c90c8e00 ("crypto: caam - convert top level drivers to libraries")
226853ac3ebe ("crypto: caam/qi2 - add skcipher algorithms")
8d818c105501 ("crypto: caam/qi2 - add DPAA2-CAAM driver")
94cebd9da42c ("crypto: caam - add Queue Interface v2 error codes")
96808c596580 ("crypto: caam/qi2 - add CONFIG_NETDEVICES dependency")
ee38767f152a ("crypto: caam - support crypto_engine framework for SKCIPHER algorithms")

v4.14.193: Failed to apply! Possible dependencies:
0efa7579f3de ("crypto: caam - export ahash shared descriptor generation")
1b46c90c8e00 ("crypto: caam - convert top level drivers to libraries")
226853ac3ebe ("crypto: caam/qi2 - add skcipher algorithms")
8d818c105501 ("crypto: caam/qi2 - add DPAA2-CAAM driver")
94cebd9da42c ("crypto: caam - add Queue Interface v2 error codes")
96808c596580 ("crypto: caam/qi2 - add CONFIG_NETDEVICES dependency")
ee38767f152a ("crypto: caam - support crypto_engine framework for SKCIPHER algorithms")

v4.9.232: Failed to apply! Possible dependencies:
1b008eedb0af ("crypto: caam - remove unused command from aead givencrypt")
281669dfbabe ("crypto: caam - rewrite some generic inline append cmds")
4cbe79ccb523 ("crypto: caam - improve key inlining")
62ad8b5c0964 ("crypto: cavium - Enable CPT options crypto for build")
64c9295b2320 ("crypto: caam - move append_key_aead() into init_sh_desc_key_aead()")
8cea7b66b821 ("crypto: caam - refactor encryption descriptors generation")
8d818c105501 ("crypto: caam/qi2 - add DPAA2-CAAM driver")
db57656b0072 ("crypto: caam - group algorithm related params")
ee38767f152a ("crypto: caam - support crypto_engine framework for SKCIPHER algorithms")

v4.4.232: Failed to apply! Possible dependencies:
1b008eedb0af ("crypto: caam - remove unused command from aead givencrypt")
4cbe79ccb523 ("crypto: caam - improve key inlining")
5ba1c7b5ffc1 ("crypto: caam - fix rfc3686(ctr(aes)) IV load")
64c9295b2320 ("crypto: caam - move append_key_aead() into init_sh_desc_key_aead()")
8c419778ab57 ("crypto: caam - add support for RSA algorithm")
8cea7b66b821 ("crypto: caam - refactor encryption descriptors generation")
d6e7a7d0c2c5 ("crypto: caam - Rename jump labels in ahash_setkey()")
db57656b0072 ("crypto: caam - group algorithm related params")
e11793f5dad8 ("crypto: caam - ensure descriptor buffers are cacheline aligned")


NOTE: The patch will not be queued to stable trees until it is upstream.

How should we proceed with this patch?

--
Thanks
Sasha

2020-08-21 12:03:43

by Horia Geanta

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On 8/21/2020 6:47 AM, Herbert Xu wrote:
> On Tue, Aug 11, 2020 at 05:30:41PM +0300, Horia Geantă wrote:
>>
>>> + if (IS_ERR(fallback)) {
>>> + pr_err("Failed to allocate %s fallback: %ld\n",
>>> + tfm_name, PTR_ERR(fallback));
>>> + return PTR_ERR(fallback);
>> Shouldn't error out so early. It might be that the fallback won't be needed.
>> Let's postpone this until we're sure fallback is required.
>
> Why? The generic should always be there as otherwise you won't
> even pass the self-test. If we're OOM then we should error out
> ASAP.
>
Self-tests don't cover the cases where a fallback is needed,
so in theory the self-tests could still pass on a misconfigured kernel
(one where a fallback is not available).

But I agree: if the driver is updated to select CRYPTO_XTS,
then it's probably better to return an error here.

Thanks,
Horia

2020-09-08 22:11:42

by Herbert Xu

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On Tue, Sep 08, 2020 at 01:35:04PM +0300, Horia Geantă wrote:
>
> > Just go with the get_unaligned unconditionally.
>
> Won't this lead to sub-optimal code for ARMv7
> in case the IV is aligned?

If this should be optimised in ARMv7 then that should be done
in get_unaligned itself and not open-coded.

Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2020-09-14 16:25:22

by Horia Geanta

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On 9/9/2020 1:10 AM, Herbert Xu wrote:
> On Tue, Sep 08, 2020 at 01:35:04PM +0300, Horia Geantă wrote:
>>
>>> Just go with the get_unaligned unconditionally.
>>
>> Won't this lead to sub-optimal code for ARMv7
>> in case the IV is aligned?
>
> If this should be optimised in ARMv7 then that should be done
> in get_unaligned itself and not open-coded.
>
I am not sure what's wrong with avoiding using the unaligned accessors
in case data is aligned.

Documentation/core-api/unaligned-memory-access.rst clearly states:
These macros work for memory accesses of any length (not just 32 bits as
in the examples above). Be aware that when compared to standard access of
aligned memory, using these macros to access unaligned memory can be costly in
terms of performance.

So IMO it makes sense to use get_unaligned() only when needed.
There are several cases of users doing this, e.g. siphash.

Thanks,
Horia

2020-09-14 17:13:39

by Horia Geanta

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On 9/14/2020 7:28 PM, Ard Biesheuvel wrote:
> On Mon, 14 Sep 2020 at 19:24, Horia Geantă <[email protected]> wrote:
>>
>> On 9/9/2020 1:10 AM, Herbert Xu wrote:
>>> On Tue, Sep 08, 2020 at 01:35:04PM +0300, Horia Geantă wrote:
>>>>
>>>>> Just go with the get_unaligned unconditionally.
>>>>
>>>> Won't this lead to sub-optimal code for ARMv7
>>>> in case the IV is aligned?
>>>
>>> If this should be optimised in ARMv7 then that should be done
>>> in get_unaligned itself and not open-coded.
>>>
>> I am not sure what's wrong with avoiding using the unaligned accessors
>> in case data is aligned.
>>
>> Documentation/core-api/unaligned-memory-access.rst clearly states:
>> These macros work for memory accesses of any length (not just 32 bits as
>> in the examples above). Be aware that when compared to standard access of
>> aligned memory, using these macros to access unaligned memory can be costly in
>> terms of performance.
>>
>> So IMO it makes sense to use get_unaligned() only when needed.
>> There are several cases of users doing this, e.g. siphash.
>>
>
> For ARMv7 code, using the unaligned accessors unconditionally is fine,
> and it will not affect performance.
>
> In general, when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined,
> you can use the unaligned accessors. If it is not, it helps to have
> different code paths.
>
arch/arm/include/asm/unaligned.h doesn't make use of
linux/unaligned/access_ok.h, even if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
is set.

I understand the comment in the file, however using get_unaligned()
unconditionally takes away the opportunity to generate optimized code
(using ldrd/ldm) when data is aligned.

> This is a bit murky, and through the years, the interpretation of
> unaligned-memory-access.rst has shifted a bit, but in this case, it
> makes no sense to make the distinction.
>

Thanks,
Horia

2020-09-14 18:20:58

by Ard Biesheuvel

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On Mon, 14 Sep 2020 at 20:12, Horia Geantă <[email protected]> wrote:
>
> On 9/14/2020 7:28 PM, Ard Biesheuvel wrote:
> > On Mon, 14 Sep 2020 at 19:24, Horia Geantă <[email protected]> wrote:
> >>
> >> On 9/9/2020 1:10 AM, Herbert Xu wrote:
> >>> On Tue, Sep 08, 2020 at 01:35:04PM +0300, Horia Geantă wrote:
> >>>>
> >>>>> Just go with the get_unaligned unconditionally.
> >>>>
> >>>> Won't this lead to sub-optimal code for ARMv7
> >>>> in case the IV is aligned?
> >>>
> >>> If this should be optimised in ARMv7 then that should be done
> >>> in get_unaligned itself and not open-coded.
> >>>
> >> I am not sure what's wrong with avoiding using the unaligned accessors
> >> in case data is aligned.
> >>
> >> Documentation/core-api/unaligned-memory-access.rst clearly states:
> >> These macros work for memory accesses of any length (not just 32 bits as
> >> in the examples above). Be aware that when compared to standard access of
> >> aligned memory, using these macros to access unaligned memory can be costly in
> >> terms of performance.
> >>
> >> So IMO it makes sense to use get_unaligned() only when needed.
> >> There are several cases of users doing this, e.g. siphash.
> >>
> >
> > For ARMv7 code, using the unaligned accessors unconditionally is fine,
> > and it will not affect performance.
> >
> > In general, when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined,
> > you can use the unaligned accessors. If it is not, it helps to have
> > different code paths.
> >
> arch/arm/include/asm/unaligned.h doesn't make use of
> linux/unaligned/access_ok.h, even if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> is set.
>
> I understand the comment in the file, however using get_unaligned()
> unconditionally takes away the opportunity to generate optimized code
> (using ldrd/ldm) when data is aligned.
>

But the minimal optimization that is possible here (one ldrd/ldm
instruction vs two ldr instructions) is defeated by the fact that you
are using a conditional branch to select between the two. And this is
not even a hot path to begin with,

> > This is a bit murky, and through the years, the interpretation of
> > unaligned-memory-access.rst has shifted a bit, but in this case, it
> > makes no sense to make the distinction.
> >
>
> Thanks,
> Horia

2020-09-15 10:04:38

by Horia Geanta

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On 9/14/2020 9:20 PM, Ard Biesheuvel wrote:
> On Mon, 14 Sep 2020 at 20:12, Horia Geantă <[email protected]> wrote:
>>
>> On 9/14/2020 7:28 PM, Ard Biesheuvel wrote:
>>> On Mon, 14 Sep 2020 at 19:24, Horia Geantă <[email protected]> wrote:
>>>>
>>>> On 9/9/2020 1:10 AM, Herbert Xu wrote:
>>>>> On Tue, Sep 08, 2020 at 01:35:04PM +0300, Horia Geantă wrote:
>>>>>>
>>>>>>> Just go with the get_unaligned unconditionally.
>>>>>>
>>>>>> Won't this lead to sub-optimal code for ARMv7
>>>>>> in case the IV is aligned?
>>>>>
>>>>> If this should be optimised in ARMv7 then that should be done
>>>>> in get_unaligned itself and not open-coded.
>>>>>
>>>> I am not sure what's wrong with avoiding using the unaligned accessors
>>>> in case data is aligned.
>>>>
>>>> Documentation/core-api/unaligned-memory-access.rst clearly states:
>>>> These macros work for memory accesses of any length (not just 32 bits as
>>>> in the examples above). Be aware that when compared to standard access of
>>>> aligned memory, using these macros to access unaligned memory can be costly in
>>>> terms of performance.
>>>>
>>>> So IMO it makes sense to use get_unaligned() only when needed.
>>>> There are several cases of users doing this, e.g. siphash.
>>>>
>>>
>>> For ARMv7 code, using the unaligned accessors unconditionally is fine,
>>> and it will not affect performance.
>>>
>>> In general, when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined,
>>> you can use the unaligned accessors. If it is not, it helps to have
>>> different code paths.
>>>
>> arch/arm/include/asm/unaligned.h doesn't make use of
>> linux/unaligned/access_ok.h, even if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>> is set.
>>
>> I understand the comment in the file, however using get_unaligned()
>> unconditionally takes away the opportunity to generate optimized code
>> (using ldrd/ldm) when data is aligned.
>>
>
> But the minimal optimization that is possible here (one ldrd/ldm
> instruction vs two ldr instructions) is defeated by the fact that you
> are using a conditional branch to select between the two. And this is
> not even a hot path to begin with,
>
This is actually on the hot path (encrypt/decrypt callbacks),
but you're probably right that the conditional branching is going to offset
the optimized code.

To avoid branching, code could be rewritten as:

#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
size = *(u64 *)(req->iv + (ivsize / 2));
#else
size = get_unaligned((u64 *)(req->iv + (ivsize / 2)));
#endif

however in this case ARMv7 would suffer since
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y and
ldrd/ldm for accesses not word-aligned are inefficient - lead to traps.

Would it be ok to use:
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && !defined(CONFIG_ARM)
to workaround the ARMv7 inconsistency?

Thanks,
Horia

2020-09-15 10:27:20

by Ard Biesheuvel

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On Tue, 15 Sep 2020 at 13:02, Horia Geantă <[email protected]> wrote:
>
> On 9/14/2020 9:20 PM, Ard Biesheuvel wrote:
> > On Mon, 14 Sep 2020 at 20:12, Horia Geantă <[email protected]> wrote:
> >>
> >> On 9/14/2020 7:28 PM, Ard Biesheuvel wrote:
> >>> On Mon, 14 Sep 2020 at 19:24, Horia Geantă <[email protected]> wrote:
> >>>>
> >>>> On 9/9/2020 1:10 AM, Herbert Xu wrote:
> >>>>> On Tue, Sep 08, 2020 at 01:35:04PM +0300, Horia Geantă wrote:
> >>>>>>
> >>>>>>> Just go with the get_unaligned unconditionally.
> >>>>>>
> >>>>>> Won't this lead to sub-optimal code for ARMv7
> >>>>>> in case the IV is aligned?
> >>>>>
> >>>>> If this should be optimised in ARMv7 then that should be done
> >>>>> in get_unaligned itself and not open-coded.
> >>>>>
> >>>> I am not sure what's wrong with avoiding using the unaligned accessors
> >>>> in case data is aligned.
> >>>>
> >>>> Documentation/core-api/unaligned-memory-access.rst clearly states:
> >>>> These macros work for memory accesses of any length (not just 32 bits as
> >>>> in the examples above). Be aware that when compared to standard access of
> >>>> aligned memory, using these macros to access unaligned memory can be costly in
> >>>> terms of performance.
> >>>>
> >>>> So IMO it makes sense to use get_unaligned() only when needed.
> >>>> There are several cases of users doing this, e.g. siphash.
> >>>>
> >>>
> >>> For ARMv7 code, using the unaligned accessors unconditionally is fine,
> >>> and it will not affect performance.
> >>>
> >>> In general, when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined,
> >>> you can use the unaligned accessors. If it is not, it helps to have
> >>> different code paths.
> >>>
> >> arch/arm/include/asm/unaligned.h doesn't make use of
> >> linux/unaligned/access_ok.h, even if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> >> is set.
> >>
> >> I understand the comment in the file, however using get_unaligned()
> >> unconditionally takes away the opportunity to generate optimized code
> >> (using ldrd/ldm) when data is aligned.
> >>
> >
> > But the minimal optimization that is possible here (one ldrd/ldm
> > instruction vs two ldr instructions) is defeated by the fact that you
> > are using a conditional branch to select between the two. And this is
> > not even a hot path to begin with,
> >
> This is actually on the hot path (encrypt/decrypt callbacks),
> but you're probably right that the conditional branching is going to offset
> the optimized code.
>

This is called once per XTS request, right? And you are saying the
extra cycle makes a difference?

> To avoid branching, code could be rewritten as:
>
> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> size = *(u64 *)(req->iv + (ivsize / 2));
> #else
> size = get_unaligned((u64 *)(req->iv + (ivsize / 2)));
> #endif
>
> however in this case ARMv7 would suffer since
> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y and
> ldrd/ldm for accesses not word-aligned are inefficient - lead to traps.
>

CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS means 'just use the unaligned
accessors as they are basically free'. Casting a potentially
misaligned u8* to a u64* is not permitted by the C standard.

> Would it be ok to use:
> #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && !defined(CONFIG_ARM)
> to workaround the ARMv7 inconsistency?
>

No, please just use the get_unaligned() accessor.

2020-09-15 12:50:19

by Horia Geanta

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On 9/15/2020 1:26 PM, Ard Biesheuvel wrote:
> On Tue, 15 Sep 2020 at 13:02, Horia Geantă <[email protected]> wrote:
>>
>> On 9/14/2020 9:20 PM, Ard Biesheuvel wrote:
>>> On Mon, 14 Sep 2020 at 20:12, Horia Geantă <[email protected]> wrote:
>>>>
>>>> On 9/14/2020 7:28 PM, Ard Biesheuvel wrote:
>>>>> On Mon, 14 Sep 2020 at 19:24, Horia Geantă <[email protected]> wrote:
>>>>>>
>>>>>> On 9/9/2020 1:10 AM, Herbert Xu wrote:
>>>>>>> On Tue, Sep 08, 2020 at 01:35:04PM +0300, Horia Geantă wrote:
>>>>>>>>
>>>>>>>>> Just go with the get_unaligned unconditionally.
>>>>>>>>
>>>>>>>> Won't this lead to sub-optimal code for ARMv7
>>>>>>>> in case the IV is aligned?
>>>>>>>
>>>>>>> If this should be optimised in ARMv7 then that should be done
>>>>>>> in get_unaligned itself and not open-coded.
>>>>>>>
>>>>>> I am not sure what's wrong with avoiding using the unaligned accessors
>>>>>> in case data is aligned.
>>>>>>
>>>>>> Documentation/core-api/unaligned-memory-access.rst clearly states:
>>>>>> These macros work for memory accesses of any length (not just 32 bits as
>>>>>> in the examples above). Be aware that when compared to standard access of
>>>>>> aligned memory, using these macros to access unaligned memory can be costly in
>>>>>> terms of performance.
>>>>>>
>>>>>> So IMO it makes sense to use get_unaligned() only when needed.
>>>>>> There are several cases of users doing this, e.g. siphash.
>>>>>>
>>>>>
>>>>> For ARMv7 code, using the unaligned accessors unconditionally is fine,
>>>>> and it will not affect performance.
>>>>>
>>>>> In general, when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined,
>>>>> you can use the unaligned accessors. If it is not, it helps to have
>>>>> different code paths.
>>>>>
>>>> arch/arm/include/asm/unaligned.h doesn't make use of
>>>> linux/unaligned/access_ok.h, even if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>>>> is set.
>>>>
>>>> I understand the comment in the file, however using get_unaligned()
>>>> unconditionally takes away the opportunity to generate optimized code
>>>> (using ldrd/ldm) when data is aligned.
>>>>
>>>
>>> But the minimal optimization that is possible here (one ldrd/ldm
>>> instruction vs two ldr instructions) is defeated by the fact that you
>>> are using a conditional branch to select between the two. And this is
>>> not even a hot path to begin with,
>>>
>> This is actually on the hot path (encrypt/decrypt callbacks),
>> but you're probably right that the conditional branching is going to offset
>> the optimized code.
>>
>
> This is called once per XTS request, right? And you are saying the
> extra cycle makes a difference?
>
Yes, once per request and no, not super-important.

>> To avoid branching, code could be rewritten as:
>>
>> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>> size = *(u64 *)(req->iv + (ivsize / 2));
>> #else
>> size = get_unaligned((u64 *)(req->iv + (ivsize / 2)));
>> #endif
>>
>> however in this case ARMv7 would suffer since
>> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y and
>> ldrd/ldm for accesses not word-aligned are inefficient - lead to traps.
>>
>
> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS means 'just use the unaligned
> accessors as they are basically free'. Casting a potentially
> misaligned u8* to a u64* is not permitted by the C standard.
>
Seems that I misunderstood CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.

Looking at its usage, e.g. ether_addr_equal() or __crypto_memneq_*(),
I see similar casts of pointers possibly misaligned.

>> Would it be ok to use:
>> #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && !defined(CONFIG_ARM)
>> to workaround the ARMv7 inconsistency?
>>
>
> No, please just use the get_unaligned() accessor.
>
Ok.
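
For reference, dropping the conditional means the helper collapses to
something like this (sketch):

static bool xts_skcipher_ivsize(struct skcipher_request *req)
{
	struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
	unsigned int ivsize = crypto_skcipher_ivsize(skcipher);

	/* fallback is needed when the upper 8 bytes of the IV are non-zero */
	return !!get_unaligned((u64 *)(req->iv + (ivsize / 2)));
}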

Thanks,
Horia

2020-09-15 12:53:50

by Ard Biesheuvel

Subject: Re: [PATCH RESEND 1/9] crypto: caam/jr - add fallback for XTS with more than 8B IV

On Tue, 15 Sep 2020 at 15:45, Horia Geantă <[email protected]> wrote:
>
> On 9/15/2020 1:26 PM, Ard Biesheuvel wrote:
> > On Tue, 15 Sep 2020 at 13:02, Horia Geantă <[email protected]> wrote:
> >>
> >> On 9/14/2020 9:20 PM, Ard Biesheuvel wrote:
> >>> On Mon, 14 Sep 2020 at 20:12, Horia Geantă <[email protected]> wrote:
> >>>>
> >>>> On 9/14/2020 7:28 PM, Ard Biesheuvel wrote:
> >>>>> On Mon, 14 Sep 2020 at 19:24, Horia Geantă <[email protected]> wrote:
> >>>>>>
> >>>>>> On 9/9/2020 1:10 AM, Herbert Xu wrote:
> >>>>>>> On Tue, Sep 08, 2020 at 01:35:04PM +0300, Horia Geantă wrote:
> >>>>>>>>
> >>>>>>>>> Just go with the get_unaligned unconditionally.
> >>>>>>>>
> >>>>>>>> Won't this lead to sub-optimal code for ARMv7
> >>>>>>>> in case the IV is aligned?
> >>>>>>>
> >>>>>>> If this should be optimised in ARMv7 then that should be done
> >>>>>>> in get_unaligned itself and not open-coded.
> >>>>>>>
> >>>>>> I am not sure what's wrong with avoiding using the unaligned accessors
> >>>>>> in case data is aligned.
> >>>>>>
> >>>>>> Documentation/core-api/unaligned-memory-access.rst clearly states:
> >>>>>> These macros work for memory accesses of any length (not just 32 bits as
> >>>>>> in the examples above). Be aware that when compared to standard access of
> >>>>>> aligned memory, using these macros to access unaligned memory can be costly in
> >>>>>> terms of performance.
> >>>>>>
> >>>>>> So IMO it makes sense to use get_unaligned() only when needed.
> >>>>>> There are several cases of users doing this, e.g. siphash.
> >>>>>>
> >>>>>
> >>>>> For ARMv7 code, using the unaligned accessors unconditionally is fine,
> >>>>> and it will not affect performance.
> >>>>>
> >>>>> In general, when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined,
> >>>>> you can use the unaligned accessors. If it is not, it helps to have
> >>>>> different code paths.
> >>>>>
> >>>> arch/arm/include/asm/unaligned.h doesn't make use of
> >>>> linux/unaligned/access_ok.h, even if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> >>>> is set.
> >>>>
> >>>> I understand the comment in the file, however using get_unaligned()
> >>>> unconditionally takes away the opportunity to generate optimized code
> >>>> (using ldrd/ldm) when data is aligned.
> >>>>
> >>>
> >>> But the minimal optimization that is possible here (one ldrd/ldm
> >>> instruction vs two ldr instructions) is defeated by the fact that you
> >>> are using a conditional branch to select between the two. And this is
> >>> not even a hot path to begin with,
> >>>
> >> This is actually on the hot path (encrypt/decrypt callbacks),
> >> but you're probably right that the conditional branching is going to offset
> >> the optimized code.
> >>
> >
> > This is called once per XTS request, right? And you are saying the
> > extra cycle makes a difference?
> >
> Yes, once per request and no, not super-important.
>
> >> To avoid branching, code could be rewritten as:
> >>
> >> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> >> size = *(u64 *)(req->iv + (ivsize / 2));
> >> #else
> >> size = get_unaligned((u64 *)(req->iv + (ivsize / 2)));
> >> #endif
> >>
> >> however in this case ARMv7 would suffer since
> >> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y and
> >> ldrd/ldm for accesses not word-aligned are inefficient - lead to traps.
> >>
> >
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS means 'just use the unaligned
> > accessors as they are basically free'. Casting a potentially
> > misaligned u8* to a u64* is not permitted by the C standard.
> >
> Seems that I misunderstood CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
>

You're not the only one :-) I have been intending to get the
discussion going with the networking folks, who rely heavily on this
as well.

> Looking at its usage, e.g. ether_addr_equal() or __crypto_memneq_*(),
> I see similar casts of pointers possibly misaligned.
>

Yes, that is the confusion. CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
should indicate whether using the unaligned accessors is fine in all
cases, or whether you should find other ways to load the data more
efficiently (compare NET_IP_ALIGN, which shifts the entire IP header
so the 32-bit address fields appear aligned in memory).

CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS does *not* mean that you can
simply cast any pointer to any type and dereference it, but the
meaning appears to have shifted this way over the years (and the
Documentation/ was even updated to this effect).

Pre-v6 ARM (and MIPS as well, IIRC) requires byte-sized accesses and
shift/or sequences to do unaligned accesses, whereas v6 and up simply
allow ldr from a misaligned address. So in the former case, you could
use cra_alignmask to align the data in memory, while the latter case
can ignore it. (arch/arm/crypto/aes-cipher-glue.c uses this as well)
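
Purely as an illustration of the difference (hypothetical pointer p,
not code from this patch):

	u32 v;

	/* plain dereference: only valid when p is known to be 4-byte aligned */
	v = *(u32 *)p;

	/* works for any p; where CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
	 * set this typically compiles down to a normal load anyway
	 */
	v = get_unaligned((u32 *)p);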

> >> Would it be ok to use:
> >> #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && !defined(CONFIG_ARM)
> >> to workaround the ARMv7 inconsistency?
> >>
> >
> > No, please just use the get_unaligned() accessor.
> >
> Ok.
>
> Thanks,
> Horia