2020-04-22 23:20:39

by Jason A. Donenfeld

Subject: [PATCH crypto-stable v3 1/2] crypto: arch/lib - limit simd usage to 4k chunks

The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that the SIMD context enabled by kernel_fpu_begin() would
not be held for more than a 4k chunk at a time, since holding it disables
preemption. The choice of 4k isn't totally scientific, but it's not a bad
guess either, and it's what the x86 poly1305, blake2s, and nhpoly1305
code already uses (in the form of PAGE_SIZE, which this commit corrects
to be explicitly 4k for the former two).

Ard did some back-of-the-envelope calculations and found that at
5 cycles/byte (an overestimate) on a 1 GHz processor (pretty slow), a 4k
chunk means preemption is disabled for at most 20us, which Sebastian
confirmed was probably a good limit.
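
(As a sanity check on that figure: 4096 bytes * 5 cycles/byte = 20480
cycles, and at 1 GHz one cycle takes 1ns, so the worst case is roughly
20.5us per chunk.)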

Unfortunately, the chunking appears to have been left out of the final
patchset that added the glue code, so this commit adds it back in.
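
In the abstract, the pattern being restored looks like the following
minimal sketch, using the names from the arm chacha glue diff below
(each architecture substitutes its own begin/end pair and block
function):

	do {
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		kernel_neon_begin();	/* disables preemption */
		chacha_doneon(state, dst, src, todo, nrounds);
		kernel_neon_end();	/* preemption possible between chunks */

		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);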

Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <[email protected]>
Cc: Ard Biesheuvel <[email protected]>
Cc: Sebastian Andrzej Siewior <[email protected]>
Cc: [email protected]
Signed-off-by: Jason A. Donenfeld <[email protected]>
---
Changes v2->v3:
- [Eric] Split the nhpoly1305 changes into a separate commit, since they
  aren't related to the library interface.

Changes v1->v2:
- [Ard] Use explicit 4k chunks instead of PAGE_SIZE.
- [Eric] Prefer do-while over for (;;).

 arch/arm/crypto/chacha-glue.c        | 14 +++++++++++---
 arch/arm/crypto/poly1305-glue.c      | 15 +++++++++++----
 arch/arm64/crypto/chacha-neon-glue.c | 14 +++++++++++---
 arch/arm64/crypto/poly1305-glue.c    | 15 +++++++++++----
 arch/x86/crypto/blake2s-glue.c       | 10 ++++------
 arch/x86/crypto/chacha_glue.c        | 14 +++++++++++---
 arch/x86/crypto/poly1305_glue.c      | 13 ++++++-------
 7 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c
index 6fdb0ac62b3d..59da6c0b63b6 100644
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -91,9 +91,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
 		return;
 	}
 
-	kernel_neon_begin();
-	chacha_doneon(state, dst, src, bytes, nrounds);
-	kernel_neon_end();
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_neon_begin();
+		chacha_doneon(state, dst, src, todo, nrounds);
+		kernel_neon_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
 }
 EXPORT_SYMBOL(chacha_crypt_arch);

diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c
index ceec04ec2f40..13cfef4ae22e 100644
--- a/arch/arm/crypto/poly1305-glue.c
+++ b/arch/arm/crypto/poly1305-glue.c
@@ -160,13 +160,20 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
 		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
 
 		if (static_branch_likely(&have_neon) && do_neon) {
-			kernel_neon_begin();
-			poly1305_blocks_neon(&dctx->h, src, len, 1);
-			kernel_neon_end();
+			do {
+				unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+				kernel_neon_begin();
+				poly1305_blocks_neon(&dctx->h, src, todo, 1);
+				kernel_neon_end();
+
+				len -= todo;
+				src += todo;
+			} while (len);
 		} else {
 			poly1305_blocks_arm(&dctx->h, src, len, 1);
+			src += len;
 		}
-		src += len;
 		nbytes %= POLY1305_BLOCK_SIZE;
 	}

diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 37ca3e889848..af2bbca38e70 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -87,9 +87,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
 	    !crypto_simd_usable())
 		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
 
-	kernel_neon_begin();
-	chacha_doneon(state, dst, src, bytes, nrounds);
-	kernel_neon_end();
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_neon_begin();
+		chacha_doneon(state, dst, src, todo, nrounds);
+		kernel_neon_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
 }
 EXPORT_SYMBOL(chacha_crypt_arch);

diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c
index e97b092f56b8..f33ada70c4ed 100644
--- a/arch/arm64/crypto/poly1305-glue.c
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -143,13 +143,20 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
 		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
 
 		if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
-			kernel_neon_begin();
-			poly1305_blocks_neon(&dctx->h, src, len, 1);
-			kernel_neon_end();
+			do {
+				unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+				kernel_neon_begin();
+				poly1305_blocks_neon(&dctx->h, src, todo, 1);
+				kernel_neon_end();
+
+				len -= todo;
+				src += todo;
+			} while (len);
 		} else {
 			poly1305_blocks(&dctx->h, src, len, 1);
+			src += len;
 		}
-		src += len;
 		nbytes %= POLY1305_BLOCK_SIZE;
 	}

diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index 06ef2d4a4701..6737bcea1fa1 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -32,16 +32,16 @@ void blake2s_compress_arch(struct blake2s_state *state,
 			   const u32 inc)
 {
 	/* SIMD disables preemption, so relax after processing each page. */
-	BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
+	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
 
 	if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
 		blake2s_compress_generic(state, block, nblocks, inc);
 		return;
 	}
 
-	for (;;) {
+	do {
 		const size_t blocks = min_t(size_t, nblocks,
-					    PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
+					    SZ_4K / BLAKE2S_BLOCK_SIZE);
 
 		kernel_fpu_begin();
 		if (IS_ENABLED(CONFIG_AS_AVX512) &&
@@ -52,10 +52,8 @@ void blake2s_compress_arch(struct blake2s_state *state,
 		kernel_fpu_end();
 
 		nblocks -= blocks;
-		if (!nblocks)
-			break;
 		block += blocks * BLAKE2S_BLOCK_SIZE;
-	}
+	} while (nblocks);
 }
 EXPORT_SYMBOL(blake2s_compress_arch);

diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
index b412c21ee06e..22250091cdbe 100644
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -153,9 +153,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
 	    bytes <= CHACHA_BLOCK_SIZE)
 		return chacha_crypt_generic(state, dst, src, bytes, nrounds);
 
-	kernel_fpu_begin();
-	chacha_dosimd(state, dst, src, bytes, nrounds);
-	kernel_fpu_end();
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_fpu_begin();
+		chacha_dosimd(state, dst, src, todo, nrounds);
+		kernel_fpu_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
 }
 EXPORT_SYMBOL(chacha_crypt_arch);

diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 6dfec19f7d57..dfe921efa9b2 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -91,8 +91,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
 	struct poly1305_arch_internal *state = ctx;
 
 	/* SIMD disables preemption, so relax after processing each page. */
-	BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
-		     PAGE_SIZE % POLY1305_BLOCK_SIZE);
+	BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+		     SZ_4K % POLY1305_BLOCK_SIZE);
 
 	if (!static_branch_likely(&poly1305_use_avx) ||
 	    (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
@@ -102,8 +102,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
 		return;
 	}
 
-	for (;;) {
-		const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+	do {
+		const size_t bytes = min_t(size_t, len, SZ_4K);
 
 		kernel_fpu_begin();
 		if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
@@ -113,11 +113,10 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
 		else
 			poly1305_blocks_avx(ctx, inp, bytes, padbit);
 		kernel_fpu_end();
+
 		len -= bytes;
-		if (!len)
-			break;
 		inp += bytes;
-	}
+	} while (len);
 }
 
 static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
--
2.26.2


2020-04-23 07:43:17

by Christophe Leroy

Subject: Re: [PATCH crypto-stable v3 1/2] crypto: arch/lib - limit simd usage to 4k chunks



On 23/04/2020 at 09:18, Ard Biesheuvel wrote:
> FYI: you shouldn't cc [email protected] directly on your patches,
> or add the cc: line. Only patches that are already in Linus' tree
> should be sent there.
>
> Also, the fixes tags are really quite sufficient. In fact, it is
> actually rather difficult these days to prevent something from being
> taken into -stable if the bots notice that it applies cleanly.

According to the kernel documentation
(https://www.kernel.org/doc/html/latest/process/submitting-patches.html):


Patches that fix a severe bug in a released kernel should be directed
toward the stable maintainers by putting a line like this:

Cc: [email protected]

into the sign-off area of your patch (note, NOT an email recipient). You
should also read Documentation/process/stable-kernel-rules.rst in
addition to this file.


Is that no longer correct?

Christophe


>
> On Thu, 23 Apr 2020 at 01:19, Jason A. Donenfeld <[email protected]> wrote:
>>
>> [...]
>> Signed-off-by: Jason A. Donenfeld <[email protected]>
>
> Reviewed-by: Ard Biesheuvel <[email protected]>
>
> Thanks for cleaning this up
>
>> [...]