2021-05-26 10:08:15

by Ard Biesheuvel

Subject: [PATCH v6 0/6] running kernel mode SIMD with softirqs disabled

This is a follow-up to [0], but given that the arm64 architectural
pieces have been merged, the only remaining changes are crypto-specific.
Therefore, the audience has been narrowed to those who are more likely
to care about these specifics.

The AEAD and skcipher APIs may only be called from task or softirq
context. On arm64, softirqs are now no longer served while the SIMD unit
is being used in kernel mode, so the SIMD unit is always available in
those contexts and the scalar fallbacks in the arm64 AEAD and skcipher
code are never needed. This series removes them.
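
To illustrate what this buys the glue code, here is a minimal conceptual
sketch (not code from any of the patches; do_thing_neon() and
do_thing_scalar() are made-up stand-ins for the real routines):

#include <linux/types.h>
#include <asm/neon.h>
#include <crypto/internal/simd.h>

void do_thing_neon(u8 *dst, const u8 *src, int len);   /* e.g. a NEON asm routine */
void do_thing_scalar(u8 *dst, const u8 *src, int len); /* e.g. an aes_encrypt() based fallback */

/* Before: the SIMD unit might be unavailable, so a scalar fallback was needed. */
static void do_thing_old(u8 *dst, const u8 *src, int len)
{
        if (crypto_simd_usable()) {
                kernel_neon_begin();
                do_thing_neon(dst, src, len);
                kernel_neon_end();
        } else {
                do_thing_scalar(dst, src, len);
        }
}

/* After: task/softirq context now implies the SIMD unit can be claimed. */
static void do_thing_new(u8 *dst, const u8 *src, int len)
{
        kernel_neon_begin();
        do_thing_neon(dst, src, len);
        kernel_neon_end();
}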

Changes since v5:
- add Eric's R-b to patches #1 to #3
- split CCM changes into 3 separate patches

Changes since v4:
- drop skcipher_walk layer change to deal with zero sized walks
- drop aead/skcipher layer sanity checks on invocations from hardirq
context
- add patch to clean up CCM a bit more after removing the SIMD code path

Changes since v3:
- clarify the nature of the issue addressed by patch #1, and apply the
same fix to the skcipher walker
- update patches #2 and #3 so that the failures can be observed by the
crypto stats code

[0] https://lore.kernel.org/linux-arm-kernel/[email protected]/

Ard Biesheuvel (6):
crypto: arm64/gcm-aes-ce - remove non-SIMD fallback path
crypto: arm64/aes-neonbs - stop using SIMD helper for skciphers
crypto: arm64/aes-ce - stop using SIMD helper for skciphers
crypto: arm64/aes-ccm - remove non-SIMD fallback path
crypto: arm64/aes-ccm - reduce NEON begin/end calls for common case
crypto: arm64/aes-ccm - avoid by-ref argument for ce_aes_ccm_auth_data

arch/arm64/crypto/Kconfig | 6 -
arch/arm64/crypto/aes-ce-ccm-core.S | 24 +--
arch/arm64/crypto/aes-ce-ccm-glue.c | 196 ++++++------------
arch/arm64/crypto/aes-glue.c | 102 ++--------
arch/arm64/crypto/aes-neonbs-glue.c | 122 +-----------
arch/arm64/crypto/ghash-ce-glue.c | 209 +++++---------------
6 files changed, 141 insertions(+), 518 deletions(-)

--
2.20.1


2021-05-26 10:08:30

by Ard Biesheuvel

Subject: [PATCH v6 2/6] crypto: arm64/aes-neonbs - stop using SIMD helper for skciphers

Calls into the skcipher API can only occur from contexts where the SIMD
unit is available, so there is no need for the SIMD helper.
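
As an illustration only (not part of this patch): consumers of the
skcipher API are unaffected. "ctr(aes)" used to resolve to a crypto_simd
wrapper around the internal "__ctr-aes-neonbs" algorithm; it now resolves
to the NEON implementation directly, under the same "ctr-aes-neonbs"
driver name. A hypothetical test function:

#include <crypto/skcipher.h>
#include <linux/err.h>
#include <linux/printk.h>

static int ctr_aes_resolution_sketch(void)
{
        struct crypto_skcipher *tfm = crypto_alloc_skcipher("ctr(aes)", 0, 0);

        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /*
         * On a system where the neonbs driver wins the priority based
         * selection, this prints "ctr-aes-neonbs" both before and after
         * this patch.
         */
        pr_info("ctr(aes) driver: %s\n",
                crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)));

        crypto_free_skcipher(tfm);
        return 0;
}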

Reviewed-by: Eric Biggers <[email protected]>
Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/Kconfig | 2 -
arch/arm64/crypto/aes-neonbs-glue.c | 122 ++------------------
2 files changed, 9 insertions(+), 115 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index b8eb0453123d..ed1e8cadeb3a 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -122,8 +122,6 @@ config CRYPTO_AES_ARM64_BS
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_NEON_BLK
- select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
- select CRYPTO_SIMD

endif
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index fb507d569922..8df6ad8cb09d 100644
--- a/arch/arm64/crypto/aes-neonbs-glue.c
+++ b/arch/arm64/crypto/aes-neonbs-glue.c
@@ -63,11 +63,6 @@ struct aesbs_cbc_ctx {
u32 enc[AES_MAX_KEYLENGTH_U32];
};

-struct aesbs_ctr_ctx {
- struct aesbs_ctx key; /* must be first member */
- struct crypto_aes_ctx fallback;
-};
-
struct aesbs_xts_ctx {
struct aesbs_ctx key;
u32 twkey[AES_MAX_KEYLENGTH_U32];
@@ -207,25 +202,6 @@ static int cbc_decrypt(struct skcipher_request *req)
return err;
}

-static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key,
- unsigned int key_len)
-{
- struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = aes_expandkey(&ctx->fallback, in_key, key_len);
- if (err)
- return err;
-
- ctx->key.rounds = 6 + key_len / 4;
-
- kernel_neon_begin();
- aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds);
- kernel_neon_end();
-
- return 0;
-}
-
static int ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -292,29 +268,6 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return aesbs_setkey(tfm, in_key, key_len);
}

-static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
-{
- struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
- unsigned long flags;
-
- /*
- * Temporarily disable interrupts to avoid races where
- * cachelines are evicted when the CPU is interrupted
- * to do something else.
- */
- local_irq_save(flags);
- aes_encrypt(&ctx->fallback, dst, src);
- local_irq_restore(flags);
-}
-
-static int ctr_encrypt_sync(struct skcipher_request *req)
-{
- if (!crypto_simd_usable())
- return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
-
- return ctr_encrypt(req);
-}
-
static int __xts_crypt(struct skcipher_request *req, bool encrypt,
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]))
@@ -431,13 +384,12 @@ static int xts_decrypt(struct skcipher_request *req)
}

static struct skcipher_alg aes_algs[] = { {
- .base.cra_name = "__ecb(aes)",
- .base.cra_driver_name = "__ecb-aes-neonbs",
+ .base.cra_name = "ecb(aes)",
+ .base.cra_driver_name = "ecb-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aesbs_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,

.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
@@ -446,13 +398,12 @@ static struct skcipher_alg aes_algs[] = { {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(aes)",
- .base.cra_driver_name = "__cbc-aes-neonbs",
+ .base.cra_name = "cbc(aes)",
+ .base.cra_driver_name = "cbc-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aesbs_cbc_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,

.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
@@ -462,13 +413,12 @@ static struct skcipher_alg aes_algs[] = { {
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
}, {
- .base.cra_name = "__ctr(aes)",
- .base.cra_driver_name = "__ctr-aes-neonbs",
+ .base.cra_name = "ctr(aes)",
+ .base.cra_driver_name = "ctr-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct aesbs_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,

.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
@@ -479,29 +429,12 @@ static struct skcipher_alg aes_algs[] = { {
.encrypt = ctr_encrypt,
.decrypt = ctr_encrypt,
}, {
- .base.cra_name = "ctr(aes)",
- .base.cra_driver_name = "ctr-aes-neonbs",
- .base.cra_priority = 250 - 1,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .chunksize = AES_BLOCK_SIZE,
- .walksize = 8 * AES_BLOCK_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = aesbs_ctr_setkey_sync,
- .encrypt = ctr_encrypt_sync,
- .decrypt = ctr_encrypt_sync,
-}, {
- .base.cra_name = "__xts(aes)",
- .base.cra_driver_name = "__xts-aes-neonbs",
+ .base.cra_name = "xts(aes)",
+ .base.cra_driver_name = "xts-aes-neonbs",
.base.cra_priority = 250,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aesbs_xts_ctx),
.base.cra_module = THIS_MODULE,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,

.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
@@ -512,54 +445,17 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = xts_decrypt,
} };

-static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
-
static void aes_exit(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
- if (aes_simd_algs[i])
- simd_skcipher_free(aes_simd_algs[i]);
-
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}

static int __init aes_init(void)
{
- struct simd_skcipher_alg *simd;
- const char *basename;
- const char *algname;
- const char *drvname;
- int err;
- int i;
-
if (!cpu_have_named_feature(ASIMD))
return -ENODEV;

- err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
- if (err)
- return err;
-
- for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
- if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
- continue;
-
- algname = aes_algs[i].base.cra_name + 2;
- drvname = aes_algs[i].base.cra_driver_name + 2;
- basename = aes_algs[i].base.cra_driver_name;
- simd = simd_skcipher_create_compat(algname, drvname, basename);
- err = PTR_ERR(simd);
- if (IS_ERR(simd))
- goto unregister_simds;
-
- aes_simd_algs[i] = simd;
- }
- return 0;
-
-unregister_simds:
- aes_exit();
- return err;
+ return crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}

module_init(aes_init);
--
2.20.1

2021-05-26 10:08:31

by Ard Biesheuvel

Subject: [PATCH v6 1/6] crypto: arm64/gcm-aes-ce - remove non-SIMD fallback path

Now that kernel mode SIMD is guaranteed to be available when executing
in task or softirq context, we no longer need scalar fallbacks to use
when the NEON is unavailable. So get rid of them.

Reviewed-by: Eric Biggers <[email protected]>
Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/ghash-ce-glue.c | 209 +++++---------------
1 file changed, 51 insertions(+), 158 deletions(-)

diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c
index 720cd3a58da3..15794fe21a0b 100644
--- a/arch/arm64/crypto/ghash-ce-glue.c
+++ b/arch/arm64/crypto/ghash-ce-glue.c
@@ -362,84 +362,36 @@ static int gcm_encrypt(struct aead_request *req)

err = skcipher_walk_aead_encrypt(&walk, req, false);

- if (likely(crypto_simd_usable())) {
- do {
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
- int nbytes = walk.nbytes;
-
- tag = (u8 *)&lengths;
-
- if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
- src = dst = memcpy(buf + sizeof(buf) - nbytes,
- src, nbytes);
- } else if (nbytes < walk.total) {
- nbytes &= ~(AES_BLOCK_SIZE - 1);
- tag = NULL;
- }
-
- kernel_neon_begin();
- pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
- dg, iv, ctx->aes_key.key_enc, nrounds,
- tag);
- kernel_neon_end();
-
- if (unlikely(!nbytes))
- break;
-
- if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
- memcpy(walk.dst.virt.addr,
- buf + sizeof(buf) - nbytes, nbytes);
-
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- } while (walk.nbytes);
- } else {
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
- int remaining = blocks;
-
- do {
- aes_encrypt(&ctx->aes_key, buf, iv);
- crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
- crypto_inc(iv, AES_BLOCK_SIZE);
-
- dst += AES_BLOCK_SIZE;
- src += AES_BLOCK_SIZE;
- } while (--remaining > 0);
-
- ghash_do_update(blocks, dg, walk.dst.virt.addr,
- &ctx->ghash_key, NULL);
-
- err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
- }
-
- /* handle the tail */
- if (walk.nbytes) {
- aes_encrypt(&ctx->aes_key, buf, iv);
+ do {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int nbytes = walk.nbytes;

- crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
- buf, walk.nbytes);
+ tag = (u8 *)&lengths;

- memcpy(buf, walk.dst.virt.addr, walk.nbytes);
- memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ } else if (nbytes < walk.total) {
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+ tag = NULL;
}

- tag = (u8 *)&lengths;
- ghash_do_update(1, dg, tag, &ctx->ghash_key,
- walk.nbytes ? buf : NULL);
+ kernel_neon_begin();
+ pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
+ dg, iv, ctx->aes_key.key_enc, nrounds,
+ tag);
+ kernel_neon_end();

- if (walk.nbytes)
- err = skcipher_walk_done(&walk, 0);
+ if (unlikely(!nbytes))
+ break;

- put_unaligned_be64(dg[1], tag);
- put_unaligned_be64(dg[0], tag + 8);
- put_unaligned_be32(1, iv + GCM_IV_SIZE);
- aes_encrypt(&ctx->aes_key, iv, iv);
- crypto_xor(tag, iv, AES_BLOCK_SIZE);
- }
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ } while (walk.nbytes);

if (err)
return err;
@@ -464,6 +416,7 @@ static int gcm_decrypt(struct aead_request *req)
u64 dg[2] = {};
be128 lengths;
u8 *tag;
+ int ret;
int err;

lengths.a = cpu_to_be64(req->assoclen * 8);
@@ -481,101 +434,41 @@ static int gcm_decrypt(struct aead_request *req)

err = skcipher_walk_aead_decrypt(&walk, req, false);

- if (likely(crypto_simd_usable())) {
- int ret;
-
- do {
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
- int nbytes = walk.nbytes;
-
- tag = (u8 *)&lengths;
-
- if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
- src = dst = memcpy(buf + sizeof(buf) - nbytes,
- src, nbytes);
- } else if (nbytes < walk.total) {
- nbytes &= ~(AES_BLOCK_SIZE - 1);
- tag = NULL;
- }
-
- kernel_neon_begin();
- ret = pmull_gcm_decrypt(nbytes, dst, src,
- ctx->ghash_key.h,
- dg, iv, ctx->aes_key.key_enc,
- nrounds, tag, otag, authsize);
- kernel_neon_end();
-
- if (unlikely(!nbytes))
- break;
-
- if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
- memcpy(walk.dst.virt.addr,
- buf + sizeof(buf) - nbytes, nbytes);
-
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- } while (walk.nbytes);
-
- if (err)
- return err;
- if (ret)
- return -EBADMSG;
- } else {
- while (walk.nbytes >= AES_BLOCK_SIZE) {
- int blocks = walk.nbytes / AES_BLOCK_SIZE;
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
-
- ghash_do_update(blocks, dg, walk.src.virt.addr,
- &ctx->ghash_key, NULL);
-
- do {
- aes_encrypt(&ctx->aes_key, buf, iv);
- crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
- crypto_inc(iv, AES_BLOCK_SIZE);
-
- dst += AES_BLOCK_SIZE;
- src += AES_BLOCK_SIZE;
- } while (--blocks > 0);
+ do {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ int nbytes = walk.nbytes;

- err = skcipher_walk_done(&walk,
- walk.nbytes % AES_BLOCK_SIZE);
- }
+ tag = (u8 *)&lengths;

- /* handle the tail */
- if (walk.nbytes) {
- memcpy(buf, walk.src.virt.addr, walk.nbytes);
- memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
+ src = dst = memcpy(buf + sizeof(buf) - nbytes,
+ src, nbytes);
+ } else if (nbytes < walk.total) {
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+ tag = NULL;
}

- tag = (u8 *)&lengths;
- ghash_do_update(1, dg, tag, &ctx->ghash_key,
- walk.nbytes ? buf : NULL);
-
- if (walk.nbytes) {
- aes_encrypt(&ctx->aes_key, buf, iv);
+ kernel_neon_begin();
+ ret = pmull_gcm_decrypt(nbytes, dst, src, ctx->ghash_key.h,
+ dg, iv, ctx->aes_key.key_enc,
+ nrounds, tag, otag, authsize);
+ kernel_neon_end();

- crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
- buf, walk.nbytes);
+ if (unlikely(!nbytes))
+ break;

- err = skcipher_walk_done(&walk, 0);
- }
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr,
+ buf + sizeof(buf) - nbytes, nbytes);

- if (err)
- return err;
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ } while (walk.nbytes);

- put_unaligned_be64(dg[1], tag);
- put_unaligned_be64(dg[0], tag + 8);
- put_unaligned_be32(1, iv + GCM_IV_SIZE);
- aes_encrypt(&ctx->aes_key, iv, iv);
- crypto_xor(tag, iv, AES_BLOCK_SIZE);
+ if (err)
+ return err;

- if (crypto_memneq(tag, otag, authsize)) {
- memzero_explicit(tag, AES_BLOCK_SIZE);
- return -EBADMSG;
- }
- }
- return 0;
+ return ret ? -EBADMSG : 0;
}

static struct aead_alg gcm_aes_alg = {
--
2.20.1

2021-05-26 10:08:33

by Ard Biesheuvel

Subject: [PATCH v6 3/6] crypto: arm64/aes-ce - stop using SIMD helper for skciphers

Calls into the skcipher API can only occur from contexts where the SIMD
unit is available, so there is no need for the SIMD helper.

Reviewed-by: Eric Biggers <[email protected]>
Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/Kconfig | 4 -
arch/arm64/crypto/aes-glue.c | 102 +++-----------------
2 files changed, 13 insertions(+), 93 deletions(-)

diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index ed1e8cadeb3a..454621a20eaa 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -88,16 +88,12 @@ config CRYPTO_AES_ARM64_CE_BLK
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_AES_ARM64_CE
- select CRYPTO_AES_ARM64
- select CRYPTO_SIMD

config CRYPTO_AES_ARM64_NEON_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
- select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
- select CRYPTO_SIMD

config CRYPTO_CHACHA20_NEON
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 17e735931a0c..30b7cc6a7079 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -444,7 +444,7 @@ static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req)
return err ?: cbc_decrypt_walk(req, &walk);
}

-static int ctr_encrypt(struct skcipher_request *req)
+static int __maybe_unused ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -485,29 +485,6 @@ static int ctr_encrypt(struct skcipher_request *req)
return err;
}

-static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
-{
- const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
- unsigned long flags;
-
- /*
- * Temporarily disable interrupts to avoid races where
- * cachelines are evicted when the CPU is interrupted
- * to do something else.
- */
- local_irq_save(flags);
- aes_encrypt(ctx, dst, src);
- local_irq_restore(flags);
-}
-
-static int __maybe_unused ctr_encrypt_sync(struct skcipher_request *req)
-{
- if (!crypto_simd_usable())
- return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
-
- return ctr_encrypt(req);
-}
-
static int __maybe_unused xts_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -656,10 +633,9 @@ static int __maybe_unused xts_decrypt(struct skcipher_request *req)
static struct skcipher_alg aes_algs[] = { {
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
.base = {
- .cra_name = "__ecb(aes)",
- .cra_driver_name = "__ecb-aes-" MODE,
+ .cra_name = "ecb(aes)",
+ .cra_driver_name = "ecb-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -671,10 +647,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = ecb_decrypt,
}, {
.base = {
- .cra_name = "__cbc(aes)",
- .cra_driver_name = "__cbc-aes-" MODE,
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -687,10 +662,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__ctr(aes)",
- .cra_driver_name = "__ctr-aes-" MODE,
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -704,26 +678,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = ctr_encrypt,
}, {
.base = {
- .cra_name = "ctr(aes)",
- .cra_driver_name = "ctr-aes-" MODE,
- .cra_priority = PRIO - 1,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_module = THIS_MODULE,
- },
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .chunksize = AES_BLOCK_SIZE,
- .setkey = skcipher_aes_setkey,
- .encrypt = ctr_encrypt_sync,
- .decrypt = ctr_encrypt_sync,
-}, {
- .base = {
- .cra_name = "__xts(aes)",
- .cra_driver_name = "__xts-aes-" MODE,
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
.cra_module = THIS_MODULE,
@@ -738,10 +695,9 @@ static struct skcipher_alg aes_algs[] = { {
}, {
#endif
.base = {
- .cra_name = "__cts(cbc(aes))",
- .cra_driver_name = "__cts-cbc-aes-" MODE,
+ .cra_name = "cts(cbc(aes))",
+ .cra_driver_name = "cts-cbc-aes-" MODE,
.cra_priority = PRIO,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
@@ -755,10 +711,9 @@ static struct skcipher_alg aes_algs[] = { {
.decrypt = cts_cbc_decrypt,
}, {
.base = {
- .cra_name = "__essiv(cbc(aes),sha256)",
- .cra_driver_name = "__essiv-cbc-aes-sha256-" MODE,
+ .cra_name = "essiv(cbc(aes),sha256)",
+ .cra_driver_name = "essiv-cbc-aes-sha256-" MODE,
.cra_priority = PRIO + 1,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_essiv_cbc_ctx),
.cra_module = THIS_MODULE,
@@ -997,28 +952,15 @@ static struct shash_alg mac_algs[] = { {
.descsize = sizeof(struct mac_desc_ctx),
} };

-static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
-
static void aes_exit(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
- if (aes_simd_algs[i])
- simd_skcipher_free(aes_simd_algs[i]);
-
crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs));
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
}

static int __init aes_init(void)
{
- struct simd_skcipher_alg *simd;
- const char *basename;
- const char *algname;
- const char *drvname;
int err;
- int i;

err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
if (err)
@@ -1028,26 +970,8 @@ static int __init aes_init(void)
if (err)
goto unregister_ciphers;

- for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
- if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
- continue;
-
- algname = aes_algs[i].base.cra_name + 2;
- drvname = aes_algs[i].base.cra_driver_name + 2;
- basename = aes_algs[i].base.cra_driver_name;
- simd = simd_skcipher_create_compat(algname, drvname, basename);
- err = PTR_ERR(simd);
- if (IS_ERR(simd))
- goto unregister_simds;
-
- aes_simd_algs[i] = simd;
- }
-
return 0;

-unregister_simds:
- aes_exit();
- return err;
unregister_ciphers:
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
return err;
--
2.20.1

2021-05-26 10:08:35

by Ard Biesheuvel

Subject: [PATCH v6 4/6] crypto: arm64/aes-ccm - remove non-SIMD fallback path

AES/CCM on arm64 is implemented as a synchronous AEAD, and so it is
guaranteed by the API that it is only invoked in task or softirq
context. Since softirqs are now only handled when the SIMD is not
being used in the task context that was interrupted to service the
softirq, we no longer need a fallback path. Let's remove it.

Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/aes-ce-ccm-glue.c | 153 ++++----------------
1 file changed, 32 insertions(+), 121 deletions(-)

diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index f6d19b0dc893..54bd2494a000 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -6,12 +6,10 @@
*/

#include <asm/neon.h>
-#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
-#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/module.h>

@@ -99,36 +97,10 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
u32 abytes, u32 *macp)
{
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
- num_rounds(key));
- kernel_neon_end();
- } else {
- if (*macp > 0 && *macp < AES_BLOCK_SIZE) {
- int added = min(abytes, AES_BLOCK_SIZE - *macp);
-
- crypto_xor(&mac[*macp], in, added);
-
- *macp += added;
- in += added;
- abytes -= added;
- }
-
- while (abytes >= AES_BLOCK_SIZE) {
- aes_encrypt(key, mac, mac);
- crypto_xor(mac, in, AES_BLOCK_SIZE);
-
- in += AES_BLOCK_SIZE;
- abytes -= AES_BLOCK_SIZE;
- }
-
- if (abytes > 0) {
- aes_encrypt(key, mac, mac);
- crypto_xor(mac, in, abytes);
- *macp = abytes;
- }
- }
+ kernel_neon_begin();
+ ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
+ num_rounds(key));
+ kernel_neon_end();
}

static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
@@ -171,54 +143,6 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
} while (len);
}

-static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[],
- struct crypto_aes_ctx *ctx, bool enc)
-{
- u8 buf[AES_BLOCK_SIZE];
- int err = 0;
-
- while (walk->nbytes) {
- int blocks = walk->nbytes / AES_BLOCK_SIZE;
- u32 tail = walk->nbytes % AES_BLOCK_SIZE;
- u8 *dst = walk->dst.virt.addr;
- u8 *src = walk->src.virt.addr;
- u32 nbytes = walk->nbytes;
-
- if (nbytes == walk->total && tail > 0) {
- blocks++;
- tail = 0;
- }
-
- do {
- u32 bsize = AES_BLOCK_SIZE;
-
- if (nbytes < AES_BLOCK_SIZE)
- bsize = nbytes;
-
- crypto_inc(walk->iv, AES_BLOCK_SIZE);
- aes_encrypt(ctx, buf, walk->iv);
- aes_encrypt(ctx, mac, mac);
- if (enc)
- crypto_xor(mac, src, bsize);
- crypto_xor_cpy(dst, src, buf, bsize);
- if (!enc)
- crypto_xor(mac, dst, bsize);
- dst += bsize;
- src += bsize;
- nbytes -= bsize;
- } while (--blocks);
-
- err = skcipher_walk_done(walk, tail);
- }
-
- if (!err) {
- aes_encrypt(ctx, buf, iv0);
- aes_encrypt(ctx, mac, mac);
- crypto_xor(mac, buf, AES_BLOCK_SIZE);
- }
- return err;
-}
-
static int ccm_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
@@ -241,30 +165,24 @@ static int ccm_encrypt(struct aead_request *req)

err = skcipher_walk_aead_encrypt(&walk, req, false);

- if (crypto_simd_usable()) {
- while (walk.nbytes) {
- u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ while (walk.nbytes) {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;

- if (walk.nbytes == walk.total)
- tail = 0;
+ if (walk.nbytes == walk.total)
+ tail = 0;

- kernel_neon_begin();
- ce_aes_ccm_encrypt(walk.dst.virt.addr,
- walk.src.virt.addr,
- walk.nbytes - tail, ctx->key_enc,
- num_rounds(ctx), mac, walk.iv);
- kernel_neon_end();
+ kernel_neon_begin();
+ ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes - tail, ctx->key_enc,
+ num_rounds(ctx), mac, walk.iv);
+ kernel_neon_end();

- err = skcipher_walk_done(&walk, tail);
- }
- if (!err) {
- kernel_neon_begin();
- ce_aes_ccm_final(mac, buf, ctx->key_enc,
- num_rounds(ctx));
- kernel_neon_end();
- }
- } else {
- err = ccm_crypt_fallback(&walk, mac, buf, ctx, true);
+ err = skcipher_walk_done(&walk, tail);
+ }
+ if (!err) {
+ kernel_neon_begin();
+ ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+ kernel_neon_end();
}
if (err)
return err;
@@ -299,32 +217,25 @@ static int ccm_decrypt(struct aead_request *req)

err = skcipher_walk_aead_decrypt(&walk, req, false);

- if (crypto_simd_usable()) {
- while (walk.nbytes) {
- u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+ while (walk.nbytes) {
+ u32 tail = walk.nbytes % AES_BLOCK_SIZE;

- if (walk.nbytes == walk.total)
- tail = 0;
+ if (walk.nbytes == walk.total)
+ tail = 0;

- kernel_neon_begin();
- ce_aes_ccm_decrypt(walk.dst.virt.addr,
- walk.src.virt.addr,
+ kernel_neon_begin();
+ ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
walk.nbytes - tail, ctx->key_enc,
num_rounds(ctx), mac, walk.iv);
- kernel_neon_end();
+ kernel_neon_end();

- err = skcipher_walk_done(&walk, tail);
- }
- if (!err) {
- kernel_neon_begin();
- ce_aes_ccm_final(mac, buf, ctx->key_enc,
- num_rounds(ctx));
- kernel_neon_end();
- }
- } else {
- err = ccm_crypt_fallback(&walk, mac, buf, ctx, false);
+ err = skcipher_walk_done(&walk, tail);
+ }
+ if (!err) {
+ kernel_neon_begin();
+ ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+ kernel_neon_end();
}
-
if (err)
return err;

--
2.20.1

2021-05-26 10:08:35

by Ard Biesheuvel

Subject: [PATCH v6 5/6] crypto: arm64/aes-ccm - reduce NEON begin/end calls for common case

AES-CCM (as used in WPA2 CCMP, for instance) typically involves
authenticate-only data, and operates on a single network packet, and so
the common case is for the authenticate, en/decrypt and finalize SIMD
helpers to all be called exactly once in sequence. Since
kernel_neon_end() now involves manipulation of the preemption state as
well as the softirq mask state, let's reduce the number of times we are
forced to call it to only once if we are handling this common case.
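
Conceptually (a sketch only; step_a/b/c are made-up stand-ins for the
CCM auth/crypt/final SIMD helpers), the common case goes from three
begin/end pairs per request to one:

#include <asm/neon.h>

void step_a(void);      /* stands in for ce_aes_ccm_auth_data() */
void step_b(void);      /* stands in for ce_aes_ccm_encrypt()/_decrypt() */
void step_c(void);      /* stands in for ce_aes_ccm_final() */

static void ccm_common_case_sketch(void)
{
        /*
         * One kernel_neon_begin()/kernel_neon_end() pair now covers the
         * whole request, i.e. one preemption/softirq-mask toggle instead
         * of three.
         */
        kernel_neon_begin();
        step_a();
        step_b();
        step_c();
        kernel_neon_end();
}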

Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/aes-ce-ccm-core.S | 1 +
arch/arm64/crypto/aes-ce-ccm-glue.c | 74 +++++++++++---------
2 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 99a028e298ed..8adff299fcd3 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -124,6 +124,7 @@ SYM_FUNC_START(ce_aes_ccm_final)
SYM_FUNC_END(ce_aes_ccm_final)

.macro aes_ccm_do_crypt,enc
+ cbz x2, 5f
ldr x8, [x6, #8] /* load lower ctr */
ld1 {v0.16b}, [x5] /* load mac */
CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index 54bd2494a000..98159f2c49ae 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -97,10 +97,8 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
u32 abytes, u32 *macp)
{
- kernel_neon_begin();
ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
num_rounds(key));
- kernel_neon_end();
}

static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
@@ -157,35 +155,41 @@ static int ccm_encrypt(struct aead_request *req)
if (err)
return err;

- if (req->assoclen)
- ccm_calculate_auth_mac(req, mac);
-
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);

err = skcipher_walk_aead_encrypt(&walk, req, false);
+ if (unlikely(err))
+ return err;

- while (walk.nbytes) {
+ kernel_neon_begin();
+
+ if (req->assoclen)
+ ccm_calculate_auth_mac(req, mac);
+
+ do {
u32 tail = walk.nbytes % AES_BLOCK_SIZE;

if (walk.nbytes == walk.total)
tail = 0;

- kernel_neon_begin();
ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
walk.nbytes - tail, ctx->key_enc,
num_rounds(ctx), mac, walk.iv);
- kernel_neon_end();

- err = skcipher_walk_done(&walk, tail);
- }
- if (!err) {
- kernel_neon_begin();
- ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+ if (walk.nbytes == walk.total)
+ ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+
kernel_neon_end();
- }
- if (err)
- return err;
+
+ if (walk.nbytes) {
+ err = skcipher_walk_done(&walk, tail);
+ if (unlikely(err))
+ return err;
+ if (unlikely(walk.nbytes))
+ kernel_neon_begin();
+ }
+ } while (walk.nbytes);

/* copy authtag to end of dst */
scatterwalk_map_and_copy(mac, req->dst, req->assoclen + req->cryptlen,
@@ -209,35 +213,41 @@ static int ccm_decrypt(struct aead_request *req)
if (err)
return err;

- if (req->assoclen)
- ccm_calculate_auth_mac(req, mac);
-
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);

err = skcipher_walk_aead_decrypt(&walk, req, false);
+ if (unlikely(err))
+ return err;
+
+ kernel_neon_begin();

- while (walk.nbytes) {
+ if (req->assoclen)
+ ccm_calculate_auth_mac(req, mac);
+
+ do {
u32 tail = walk.nbytes % AES_BLOCK_SIZE;

if (walk.nbytes == walk.total)
tail = 0;

- kernel_neon_begin();
ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes - tail, ctx->key_enc,
- num_rounds(ctx), mac, walk.iv);
- kernel_neon_end();
+ walk.nbytes - tail, ctx->key_enc,
+ num_rounds(ctx), mac, walk.iv);
+
+ if (walk.nbytes == walk.total)
+ ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));

- err = skcipher_walk_done(&walk, tail);
- }
- if (!err) {
- kernel_neon_begin();
- ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
kernel_neon_end();
- }
- if (err)
- return err;
+
+ if (walk.nbytes) {
+ err = skcipher_walk_done(&walk, tail);
+ if (unlikely(err))
+ return err;
+ if (unlikely(walk.nbytes))
+ kernel_neon_begin();
+ }
+ } while (walk.nbytes);

/* compare calculated auth tag with the stored one */
scatterwalk_map_and_copy(buf, req->src,
--
2.20.1

2021-05-26 10:08:45

by Ard Biesheuvel

Subject: [PATCH v6 6/6] crypto: arm64/aes-ccm - avoid by-ref argument for ce_aes_ccm_auth_data

With the SIMD code path removed, we can clean up the CCM auth-only path
a bit further, by passing the 'macp' in/out argument by value, rather
than by reference, and taking the updated value from the function's
return value.

This way, the compiler is no longer forced to allocate macp on the
stack. This is not expected to make any difference in practice; it just
makes for slightly cleaner code.
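
As a sketch of the calling-pattern change (illustrative only; auth_step()
is a hypothetical stand-in for ce_aes_ccm_auth_data()):

#include <linux/types.h>

u32 auth_step(u8 mac[], const u8 *in, u32 len, u32 macp);  /* new-style prototype */

static u32 caller_sketch(u8 mac[], const u8 *in, u32 len)
{
        u32 macp = 0;

        /* old style: auth_step(mac, in, len, &macp); -- &macp needs a stack slot */
        macp = auth_step(mac, in, len, macp);   /* new: value in, updated value out */

        return macp;
}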

Signed-off-by: Ard Biesheuvel <[email protected]>
---
arch/arm64/crypto/aes-ce-ccm-core.S | 23 ++++++++++----------
arch/arm64/crypto/aes-ce-ccm-glue.c | 17 +++++----------
2 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 8adff299fcd3..b03f7f71f893 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -12,22 +12,21 @@
.arch armv8-a+crypto

/*
- * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
- * u32 *macp, u8 const rk[], u32 rounds);
+ * u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
+ * u32 macp, u8 const rk[], u32 rounds);
*/
SYM_FUNC_START(ce_aes_ccm_auth_data)
- ldr w8, [x3] /* leftover from prev round? */
ld1 {v0.16b}, [x0] /* load mac */
- cbz w8, 1f
- sub w8, w8, #16
+ cbz w3, 1f
+ sub w3, w3, #16
eor v1.16b, v1.16b, v1.16b
0: ldrb w7, [x1], #1 /* get 1 byte of input */
subs w2, w2, #1
- add w8, w8, #1
+ add w3, w3, #1
ins v1.b[0], w7
ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */
beq 8f /* out of input? */
- cbnz w8, 0b
+ cbnz w3, 0b
eor v0.16b, v0.16b, v1.16b
1: ld1 {v3.4s}, [x4] /* load first round key */
prfm pldl1strm, [x1]
@@ -62,7 +61,7 @@ SYM_FUNC_START(ce_aes_ccm_auth_data)
beq 10f
adds w2, w2, #16
beq 10f
- mov w8, w2
+ mov w3, w2
7: ldrb w7, [x1], #1
umov w6, v0.b[0]
eor w6, w6, w7
@@ -71,15 +70,15 @@ SYM_FUNC_START(ce_aes_ccm_auth_data)
beq 10f
ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */
b 7b
-8: cbz w8, 91f
- mov w7, w8
- add w8, w8, #16
+8: cbz w3, 91f
+ mov w7, w3
+ add w3, w3, #16
9: ext v1.16b, v1.16b, v1.16b, #1
adds w7, w7, #1
bne 9b
91: eor v0.16b, v0.16b, v1.16b
st1 {v0.16b}, [x0]
-10: str w8, [x3]
+10: mov w0, w3
ret
SYM_FUNC_END(ce_aes_ccm_auth_data)

diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index 98159f2c49ae..e8c04512bff7 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -27,8 +27,8 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
return 6 + ctx->key_length / 4;
}

-asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
- u32 *macp, u32 const rk[], u32 rounds);
+asmlinkage u32 ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
+ u32 macp, u32 const rk[], u32 rounds);

asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
u32 const rk[], u32 rounds, u8 mac[],
@@ -94,13 +94,6 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
return 0;
}

-static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
- u32 abytes, u32 *macp)
-{
- ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
- num_rounds(key));
-}
-
static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
@@ -120,7 +113,8 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
ltag.len = 6;
}

- ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp);
+ macp = ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, macp,
+ ctx->key_enc, num_rounds(ctx));
scatterwalk_start(&walk, req->src);

do {
@@ -132,7 +126,8 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
n = scatterwalk_clamp(&walk, len);
}
p = scatterwalk_map(&walk);
- ccm_update_mac(ctx, mac, p, n, &macp);
+ macp = ce_aes_ccm_auth_data(mac, p, n, macp, ctx->key_enc,
+ num_rounds(ctx));
len -= n;

scatterwalk_unmap(p);
--
2.20.1

2021-05-26 19:08:35

by Eric Biggers

Subject: Re: [PATCH v6 4/6] crypto: arm64/aes-ccm - remove non-SIMD fallback path

On Wed, May 26, 2021 at 12:07:27PM +0200, Ard Biesheuvel wrote:
> AES/CCM on arm64 is implemented as a synchronous AEAD, and so it is
> guaranteed by the API that it is only invoked in task or softirq
> context. Since softirqs are now only handled when the SIMD is not
> being used in the task context that was interrupted to service the
> softirq, we no longer need a fallback path. Let's remove it.
>
> Signed-off-by: Ard Biesheuvel <[email protected]>
> ---
> arch/arm64/crypto/aes-ce-ccm-glue.c | 153 ++++----------------
> 1 file changed, 32 insertions(+), 121 deletions(-)

Reviewed-by: Eric Biggers <[email protected]>

2021-05-26 19:12:17

by Eric Biggers

Subject: Re: [PATCH v6 5/6] crypto: arm64/aes-ccm - reduce NEON begin/end calls for common case

On Wed, May 26, 2021 at 12:07:28PM +0200, Ard Biesheuvel wrote:
> AES-CCM (as used in WPA2 CCMP, for instance) typically involves
> authenticate-only data, and operates on a single network packet, and so
> the common case is for the authenticate, en/decrypt and finalize SIMD
> helpers to all be called exactly once in sequence. Since
> kernel_neon_end() now involves manipulation of the preemption state as
> well as the softirq mask state, let's reduce the number of times we are
> forced to call it to only once if we are handling this common case.
>
> Signed-off-by: Ard Biesheuvel <[email protected]>
> ---
> arch/arm64/crypto/aes-ce-ccm-core.S | 1 +
> arch/arm64/crypto/aes-ce-ccm-glue.c | 74 +++++++++++---------
> 2 files changed, 43 insertions(+), 32 deletions(-)
>
> diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
> index 99a028e298ed..8adff299fcd3 100644
> --- a/arch/arm64/crypto/aes-ce-ccm-core.S
> +++ b/arch/arm64/crypto/aes-ce-ccm-core.S
> @@ -124,6 +124,7 @@ SYM_FUNC_START(ce_aes_ccm_final)
> SYM_FUNC_END(ce_aes_ccm_final)
>
> .macro aes_ccm_do_crypt,enc
> + cbz x2, 5f
> ldr x8, [x6, #8] /* load lower ctr */
> ld1 {v0.16b}, [x5] /* load mac */
> CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
> diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
> index 54bd2494a000..98159f2c49ae 100644
> --- a/arch/arm64/crypto/aes-ce-ccm-glue.c
> +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
> @@ -97,10 +97,8 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
> static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
> u32 abytes, u32 *macp)
> {
> - kernel_neon_begin();
> ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
> num_rounds(key));
> - kernel_neon_end();
> }
[...]
> + if (req->assoclen)
> + ccm_calculate_auth_mac(req, mac);
> +

This still makes all the associated data be processed under a single
kernel_neon_begin() / kernel_neon_end() pair, even if there is a large amount of
it. Shouldn't it be limited to a reasonable amount at a time, like 4K?
This sort of thing has been considered a bug before, e.g. see
commit 706024a52c6 ("crypto: arch/lib - limit simd usage to 4k chunks").

You could do the entire CCM operation under a single pair as long as there isn't
more than 4K of associated data.

- Eric

2021-05-26 19:12:56

by Eric Biggers

Subject: Re: [PATCH v6 6/6] crypto: arm64/aes-ccm - avoid by-ref argument for ce_aes_ccm_auth_data

On Wed, May 26, 2021 at 12:07:29PM +0200, Ard Biesheuvel wrote:
> With the SIMD code path removed, we can clean up the CCM auth-only path
> a bit further, by passing the 'macp' in/out argument by value, rather
> than by reference, and taking the updated value from the function's
> return value.
>
> This way, the compiler is no longer forced to allocate macp on the
> stack. This is not expected to make any difference in practice; it just
> makes for slightly cleaner code.
>
> Signed-off-by: Ard Biesheuvel <[email protected]>
> ---
> arch/arm64/crypto/aes-ce-ccm-core.S | 23 ++++++++++----------
> arch/arm64/crypto/aes-ce-ccm-glue.c | 17 +++++----------
> 2 files changed, 17 insertions(+), 23 deletions(-)

Reviewed-by: Eric Biggers <[email protected]>

2021-05-26 19:36:14

by Ard Biesheuvel

Subject: Re: [PATCH v6 5/6] crypto: arm64/aes-ccm - reduce NEON begin/end calls for common case

On Wed, 26 May 2021 at 19:14, Eric Biggers <[email protected]> wrote:
>
> On Wed, May 26, 2021 at 12:07:28PM +0200, Ard Biesheuvel wrote:
> > AES-CCM (as used in WPA2 CCMP, for instance) typically involves
> > authenticate-only data, and operates on a single network packet, and so
> > the common case is for the authenticate, en/decrypt and finalize SIMD
> > helpers to all be called exactly once in sequence. Since
> > kernel_neon_end() now involves manipulation of the preemption state as
> > well as the softirq mask state, let's reduce the number of times we are
> > forced to call it to only once if we are handling this common case.
> >
> > Signed-off-by: Ard Biesheuvel <[email protected]>
> > ---
> > arch/arm64/crypto/aes-ce-ccm-core.S | 1 +
> > arch/arm64/crypto/aes-ce-ccm-glue.c | 74 +++++++++++---------
> > 2 files changed, 43 insertions(+), 32 deletions(-)
> >
> > diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
> > index 99a028e298ed..8adff299fcd3 100644
> > --- a/arch/arm64/crypto/aes-ce-ccm-core.S
> > +++ b/arch/arm64/crypto/aes-ce-ccm-core.S
> > @@ -124,6 +124,7 @@ SYM_FUNC_START(ce_aes_ccm_final)
> > SYM_FUNC_END(ce_aes_ccm_final)
> >
> > .macro aes_ccm_do_crypt,enc
> > + cbz x2, 5f
> > ldr x8, [x6, #8] /* load lower ctr */
> > ld1 {v0.16b}, [x5] /* load mac */
> > CPU_LE( rev x8, x8 ) /* keep swabbed ctr in reg */
> > diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
> > index 54bd2494a000..98159f2c49ae 100644
> > --- a/arch/arm64/crypto/aes-ce-ccm-glue.c
> > +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
> > @@ -97,10 +97,8 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
> > static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
> > u32 abytes, u32 *macp)
> > {
> > - kernel_neon_begin();
> > ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
> > num_rounds(key));
> > - kernel_neon_end();
> > }
> [...]
> > + if (req->assoclen)
> > + ccm_calculate_auth_mac(req, mac);
> > +
>
> This still makes all the associated data be processed under a single
> kernel_neon_begin() / kernel_neon_end() pair, even if there is a large amount of
> it. Shouldn't it be limited to a reasonable amount at a time, like 4K?
> This sort of thing has been considered a bug before, e.g. see
> commit 706024a52c6 ("crypto: arch/lib - limit simd usage to 4k chunks").
>
> You could do the entire CCM operation under a single pair as long as there isn't
> more than 4K of associated data.
>

Good point. I'll add a separate patch for that.
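
For illustration, a minimal sketch of the kind of chunking being
discussed, assuming the by-value ce_aes_ccm_auth_data() prototype from
patch 6/6 and the num_rounds() helper already present in
aes-ce-ccm-glue.c (hypothetical code; the eventual follow-up patch may
look different, e.g. in how it interacts with the single NEON section
introduced in patch 5/6):

#include <linux/minmax.h>
#include <linux/sizes.h>
#include <crypto/aes.h>
#include <asm/neon.h>

static u32 ccm_auth_chunked(struct crypto_aes_ctx *ctx, u8 mac[],
                            const u8 *in, u32 len, u32 macp)
{
        while (len) {
                u32 n = min_t(u32, len, SZ_4K);

                /* never keep the NEON unit claimed for more than ~4K of input */
                kernel_neon_begin();
                macp = ce_aes_ccm_auth_data(mac, in, n, macp, ctx->key_enc,
                                            num_rounds(ctx));
                kernel_neon_end();

                in += n;
                len -= n;
        }

        return macp;
}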