2024-06-03 18:39:26

by Eric Biggers

Subject: [PATCH v4 5/8] crypto: arm64/sha256-ce - add support for finup_mb

From: Eric Biggers <[email protected]>

Add an implementation of finup_mb to sha256-ce, using an interleaving
factor of 2. It interleaves a finup operation for two equal-length
messages that share a common prefix. dm-verity and fs-verity will take
advantage of this for greatly improved performance on capable CPUs.

On an ARM Cortex-X1, this increases the throughput of SHA-256 hashing
4096-byte messages by 70%.

Signed-off-by: Eric Biggers <[email protected]>
---
arch/arm64/crypto/sha2-ce-core.S | 281 ++++++++++++++++++++++++++++++-
arch/arm64/crypto/sha2-ce-glue.c | 40 +++++
2 files changed, 315 insertions(+), 6 deletions(-)
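
For context, a minimal sketch of how a caller might drive this interface, assuming the crypto_shash_finup_mb() helper introduced earlier in this series takes the same (data[], len, outs[], num_msgs) arguments as the ->finup_mb method in this patch; hash_two_blocks() is a hypothetical illustration, not code from the patch:

#include <crypto/hash.h>
#include <crypto/sha2.h>

static int hash_two_blocks(struct crypto_shash *tfm,
			   const u8 *blk1, const u8 *blk2, unsigned int len,
			   u8 out1[SHA256_DIGEST_SIZE],
			   u8 out2[SHA256_DIGEST_SIZE])
{
	SHASH_DESC_ON_STACK(desc, tfm);
	const u8 *data[2] = { blk1, blk2 };
	u8 *outs[2] = { out1, out2 };
	int err;

	desc->tfm = tfm;
	err = crypto_shash_init(desc);
	if (err)
		return err;
	/*
	 * Hash both messages in one call; the API layer is expected to fall
	 * back to hashing them sequentially when ->finup_mb is absent or
	 * returns -EOPNOTSUPP (e.g. when NEON is not usable).
	 */
	return crypto_shash_finup_mb(desc, data, len, outs, 2);
}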

diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index fce84d88ddb2..fb5d5227e585 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -68,22 +68,26 @@
.word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

+ .macro load_round_constants tmp
+ adr_l \tmp, .Lsha2_rcon
+ ld1 { v0.4s- v3.4s}, [\tmp], #64
+ ld1 { v4.4s- v7.4s}, [\tmp], #64
+ ld1 { v8.4s-v11.4s}, [\tmp], #64
+ ld1 {v12.4s-v15.4s}, [\tmp]
+ .endm
+
/*
* int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
* int blocks)
*/
.text
SYM_FUNC_START(__sha256_ce_transform)
- /* load round constants */
- adr_l x8, .Lsha2_rcon
- ld1 { v0.4s- v3.4s}, [x8], #64
- ld1 { v4.4s- v7.4s}, [x8], #64
- ld1 { v8.4s-v11.4s}, [x8], #64
- ld1 {v12.4s-v15.4s}, [x8]
+
+ load_round_constants x8

/* load state */
ld1 {dgav.4s, dgbv.4s}, [x0]

/* load sha256_ce_state::finalize */
@@ -153,5 +157,270 @@ CPU_LE( rev32 v19.16b, v19.16b )
/* store new state */
3: st1 {dgav.4s, dgbv.4s}, [x0]
mov w0, w2
ret
SYM_FUNC_END(__sha256_ce_transform)
+
+ .unreq dga
+ .unreq dgav
+ .unreq dgb
+ .unreq dgbv
+ .unreq t0
+ .unreq t1
+ .unreq dg0q
+ .unreq dg0v
+ .unreq dg1q
+ .unreq dg1v
+ .unreq dg2q
+ .unreq dg2v
+
+ // parameters for __sha256_ce_finup2x()
+ sctx .req x0
+ data1 .req x1
+ data2 .req x2
+ len .req w3
+ out1 .req x4
+ out2 .req x5
+
+ // other scalar variables
+ count .req x6
+ final_step .req w7
+
+ // x8-x9 are used as temporaries.
+
+ // v0-v15 are used to cache the SHA-256 round constants.
+ // v16-v19 are used for the message schedule for the first message.
+ // v20-v23 are used for the message schedule for the second message.
+ // v24-v31 are used for the state and temporaries as given below.
+ // *_a are for the first message and *_b for the second.
+ state0_a_q .req q24
+ state0_a .req v24
+ state1_a_q .req q25
+ state1_a .req v25
+ state0_b_q .req q26
+ state0_b .req v26
+ state1_b_q .req q27
+ state1_b .req v27
+ t0_a .req v28
+ t0_b .req v29
+ t1_a_q .req q30
+ t1_a .req v30
+ t1_b_q .req q31
+ t1_b .req v31
+
+#define OFFSETOF_COUNT 32 // offsetof(struct sha256_state, count)
+#define OFFSETOF_BUF 40 // offsetof(struct sha256_state, buf)
+// offsetof(struct sha256_state, state) is assumed to be 0.
+
+ // Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
+ // and m0_b contain the current 4 message schedule words for the first
+ // and second message respectively.
+ //
+ // If not all the message schedule words have been computed yet, then
+ // this also computes 4 more message schedule words for each message.
+ // m1_a-m3_a contain the next 3 groups of 4 message schedule words for
+ // the first message, and likewise m1_b-m3_b for the second. After
+ // consuming the current value of m0_a, this macro computes the group
+ // after m3_a and writes it to m0_a, and likewise for *_b. This means
+ // that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
+ // m3_a, m0_a), and likewise for *_b, so the caller must cycle through
+ // the registers accordingly.
+ .macro do_4rounds_2x i, k, m0_a, m1_a, m2_a, m3_a, \
+ m0_b, m1_b, m2_b, m3_b
+ add t0_a\().4s, \m0_a\().4s, \k\().4s
+ add t0_b\().4s, \m0_b\().4s, \k\().4s
+ .if \i < 48
+ sha256su0 \m0_a\().4s, \m1_a\().4s
+ sha256su0 \m0_b\().4s, \m1_b\().4s
+ sha256su1 \m0_a\().4s, \m2_a\().4s, \m3_a\().4s
+ sha256su1 \m0_b\().4s, \m2_b\().4s, \m3_b\().4s
+ .endif
+ mov t1_a.16b, state0_a.16b
+ mov t1_b.16b, state0_b.16b
+ sha256h state0_a_q, state1_a_q, t0_a\().4s
+ sha256h state0_b_q, state1_b_q, t0_b\().4s
+ sha256h2 state1_a_q, t1_a_q, t0_a\().4s
+ sha256h2 state1_b_q, t1_b_q, t0_b\().4s
+ .endm
+
+ .macro do_16rounds_2x i, k0, k1, k2, k3
+ do_4rounds_2x \i + 0, \k0, v16, v17, v18, v19, v20, v21, v22, v23
+ do_4rounds_2x \i + 4, \k1, v17, v18, v19, v16, v21, v22, v23, v20
+ do_4rounds_2x \i + 8, \k2, v18, v19, v16, v17, v22, v23, v20, v21
+ do_4rounds_2x \i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
+ .endm
+
+//
+// void __sha256_ce_finup2x(const struct sha256_state *sctx,
+// const u8 *data1, const u8 *data2, int len,
+// u8 out1[SHA256_DIGEST_SIZE],
+// u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial state
+// |sctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(__sha256_ce_finup2x)
+ sub sp, sp, #128
+ mov final_step, #0
+ load_round_constants x8
+
+ // Load the initial state from sctx->state.
+ ld1 {state0_a.4s-state1_a.4s}, [sctx]
+
+ // Load sctx->count. Take the mod 64 of it to get the number of bytes
+ // that are buffered in sctx->buf. Also save it in a register with len
+ // added to it.
+ ldr x8, [sctx, #OFFSETOF_COUNT]
+ add count, x8, len, sxtw
+ and x8, x8, #63
+ cbz x8, .Lfinup2x_enter_loop // No bytes buffered?
+
+ // x8 bytes (1 to 63) are currently buffered in sctx->buf. Load them
+ // followed by the first 64 - x8 bytes of data. Since len >= 64, we
+ // just load 64 bytes from each of sctx->buf, data1, and data2
+ // unconditionally and rearrange the data as needed.
+ add x9, sctx, #OFFSETOF_BUF
+ ld1 {v16.16b-v19.16b}, [x9]
+ st1 {v16.16b-v19.16b}, [sp]
+
+ ld1 {v16.16b-v19.16b}, [data1], #64
+ add x9, sp, x8
+ st1 {v16.16b-v19.16b}, [x9]
+ ld1 {v16.4s-v19.4s}, [sp]
+
+ ld1 {v20.16b-v23.16b}, [data2], #64
+ st1 {v20.16b-v23.16b}, [x9]
+ ld1 {v20.4s-v23.4s}, [sp]
+
+ sub len, len, #64
+ sub data1, data1, x8
+ sub data2, data2, x8
+ add len, len, w8
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+ b .Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+ sub len, len, #64
+ mov state0_b.16b, state0_a.16b
+ mov state1_b.16b, state1_a.16b
+.Lfinup2x_loop:
+ // Load the next two data blocks.
+ ld1 {v16.4s-v19.4s}, [data1], #64
+ ld1 {v20.4s-v23.4s}, [data2], #64
+.Lfinup2x_loop_have_data:
+ // Convert the words of the data blocks from big endian.
+CPU_LE( rev32 v16.16b, v16.16b )
+CPU_LE( rev32 v17.16b, v17.16b )
+CPU_LE( rev32 v18.16b, v18.16b )
+CPU_LE( rev32 v19.16b, v19.16b )
+CPU_LE( rev32 v20.16b, v20.16b )
+CPU_LE( rev32 v21.16b, v21.16b )
+CPU_LE( rev32 v22.16b, v22.16b )
+CPU_LE( rev32 v23.16b, v23.16b )
+.Lfinup2x_loop_have_bswapped_data:
+
+ // Save the original state for each block.
+ st1 {state0_a.4s-state1_b.4s}, [sp]
+
+ // Do the SHA-256 rounds on each block.
+ do_16rounds_2x 0, v0, v1, v2, v3
+ do_16rounds_2x 16, v4, v5, v6, v7
+ do_16rounds_2x 32, v8, v9, v10, v11
+ do_16rounds_2x 48, v12, v13, v14, v15
+
+ // Add the original state for each block.
+ ld1 {v16.4s-v19.4s}, [sp]
+ add state0_a.4s, state0_a.4s, v16.4s
+ add state1_a.4s, state1_a.4s, v17.4s
+ add state0_b.4s, state0_b.4s, v18.4s
+ add state1_b.4s, state1_b.4s, v19.4s
+
+ // Update len and loop back if more blocks remain.
+ sub len, len, #64
+ tbz len, #31, .Lfinup2x_loop // len >= 0?
+
+ // Check if any final blocks need to be handled.
+ // final_step = 2: all done
+ // final_step = 1: need to do count-only padding block
+ // final_step = 0: need to do the block with 0x80 padding byte
+ tbnz final_step, #1, .Lfinup2x_done
+ tbnz final_step, #0, .Lfinup2x_finalize_countonly
+ add len, len, #64
+ cbz len, .Lfinup2x_finalize_blockaligned
+
+ // Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
+ // To do this, write the padding starting with the 0x80 byte to
+ // &sp[64]. Then for each message, copy the last 64 data bytes to sp
+ // and load from &sp[64 - len] to get the needed padding block. This
+ // code relies on the data buffers being >= 64 bytes in length.
+ sub w8, len, #64 // w8 = len - 64
+ add data1, data1, w8, sxtw // data1 += len - 64
+ add data2, data2, w8, sxtw // data2 += len - 64
+ mov x9, 0x80
+ fmov d16, x9
+ movi v17.16b, #0
+ stp q16, q17, [sp, #64]
+ stp q17, q17, [sp, #96]
+ sub x9, sp, w8, sxtw // x9 = &sp[64 - len]
+ cmp len, #56
+ b.ge 1f // will count spill into its own block?
+ lsl count, count, #3
+ rev count, count
+ str count, [x9, #56]
+ mov final_step, #2 // won't need count-only block
+ b 2f
+1:
+ mov final_step, #1 // will need count-only block
+2:
+ ld1 {v16.16b-v19.16b}, [data1]
+ st1 {v16.16b-v19.16b}, [sp]
+ ld1 {v16.4s-v19.4s}, [x9]
+ ld1 {v20.16b-v23.16b}, [data2]
+ st1 {v20.16b-v23.16b}, [sp]
+ ld1 {v20.4s-v23.4s}, [x9]
+ b .Lfinup2x_loop_have_data
+
+ // Prepare a padding block, either:
+ //
+ // {0x80, 0, 0, 0, ..., count (as __be64)}
+ // This is for a block aligned message.
+ //
+ // { 0, 0, 0, 0, ..., count (as __be64)}
+ // This is for a message whose length mod 64 is >= 56.
+ //
+ // Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+ movi v16.2d, #0
+ b 1f
+.Lfinup2x_finalize_blockaligned:
+ mov x8, #0x80000000
+ fmov d16, x8
+1:
+ movi v17.2d, #0
+ movi v18.2d, #0
+ ror count, count, #29 // ror(lsl(count, 3), 32)
+ mov v19.d[0], xzr
+ mov v19.d[1], count
+ mov v20.16b, v16.16b
+ movi v21.2d, #0
+ movi v22.2d, #0
+ mov v23.16b, v19.16b
+ mov final_step, #2
+ b .Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+ // Write the two digests with all bytes in the correct order.
+CPU_LE( rev32 state0_a.16b, state0_a.16b )
+CPU_LE( rev32 state1_a.16b, state1_a.16b )
+CPU_LE( rev32 state0_b.16b, state0_b.16b )
+CPU_LE( rev32 state1_b.16b, state1_b.16b )
+ st1 {state0_a.4s-state1_a.4s}, [out1]
+ st1 {state0_b.4s-state1_b.4s}, [out2]
+ add sp, sp, #128
+ ret
+SYM_FUNC_END(__sha256_ce_finup2x)
diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
index 0a44d2e7ee1f..b37cffc4191f 100644
--- a/arch/arm64/crypto/sha2-ce-glue.c
+++ b/arch/arm64/crypto/sha2-ce-glue.c
@@ -31,10 +31,15 @@ extern const u32 sha256_ce_offsetof_count;
extern const u32 sha256_ce_offsetof_finalize;

asmlinkage int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
int blocks);

+asmlinkage void __sha256_ce_finup2x(const struct sha256_state *sctx,
+ const u8 *data1, const u8 *data2, int len,
+ u8 out1[SHA256_DIGEST_SIZE],
+ u8 out2[SHA256_DIGEST_SIZE]);
+
static void sha256_ce_transform(struct sha256_state *sst, u8 const *src,
int blocks)
{
while (blocks) {
int rem;
@@ -122,10 +127,43 @@ static int sha256_ce_digest(struct shash_desc *desc, const u8 *data,
{
sha256_base_init(desc);
return sha256_ce_finup(desc, data, len, out);
}

+static int sha256_ce_finup_mb(struct shash_desc *desc,
+ const u8 * const data[], unsigned int len,
+ u8 * const outs[], unsigned int num_msgs)
+{
+ struct sha256_ce_state *sctx = shash_desc_ctx(desc);
+
+ /*
+ * num_msgs != 2 should not happen here, since this algorithm sets
+ * mb_max_msgs=2, and the crypto API handles num_msgs <= 1 before
+ * calling into the algorithm's finup_mb method.
+ */
+ if (WARN_ON_ONCE(num_msgs != 2))
+ return -EOPNOTSUPP;
+
+ if (unlikely(!crypto_simd_usable()))
+ return -EOPNOTSUPP;
+
+ /* __sha256_ce_finup2x() assumes SHA256_BLOCK_SIZE <= len <= INT_MAX. */
+ if (unlikely(len < SHA256_BLOCK_SIZE || len > INT_MAX))
+ return -EOPNOTSUPP;
+
+ /* __sha256_ce_finup2x() assumes the following offsets. */
+ BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
+ BUILD_BUG_ON(offsetof(struct sha256_state, count) != 32);
+ BUILD_BUG_ON(offsetof(struct sha256_state, buf) != 40);
+
+ kernel_neon_begin();
+ __sha256_ce_finup2x(&sctx->sst, data[0], data[1], len, outs[0],
+ outs[1]);
+ kernel_neon_end();
+ return 0;
+}
+
static int sha256_ce_export(struct shash_desc *desc, void *out)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);

memcpy(out, &sctx->sst, sizeof(struct sha256_state));
@@ -162,13 +200,15 @@ static struct shash_alg algs[] = { {
.init = sha256_base_init,
.update = sha256_ce_update,
.final = sha256_ce_final,
.finup = sha256_ce_finup,
.digest = sha256_ce_digest,
+ .finup_mb = sha256_ce_finup_mb,
.export = sha256_ce_export,
.import = sha256_ce_import,
.descsize = sizeof(struct sha256_ce_state),
+ .mb_max_msgs = 2,
.statesize = sizeof(struct sha256_state),
.digestsize = SHA256_DIGEST_SIZE,
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-ce",
--
2.45.1
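
For readers who want the padding logic above spelled out in C: the final_step state machine in __sha256_ce_finup2x implements standard SHA-256 finalization, roughly modeled by the sketch below. This is illustrative only, with a hypothetical compress() callback standing in for the block function; it is not part of the patch.

#include <linux/types.h>
#include <linux/string.h>
#include <asm/byteorder.h>

static void sha256_finalize_model(const u8 *tail, unsigned int rem,
				  u64 total_bytes,
				  void (*compress)(const u8 block[64]))
{
	u8 block[64] = {};
	__be64 bitcount = cpu_to_be64(total_bytes << 3);

	memcpy(block, tail, rem);	/* 0 <= rem <= 63 leftover data bytes */
	block[rem] = 0x80;		/* mandatory 0x80 padding byte */
	if (rem >= 56) {		/* bit count does not fit here, so it */
		compress(block);	/* spills into its own block (final_step = 1) */
		memset(block, 0, sizeof(block));
	}
	memcpy(&block[56], &bitcount, sizeof(bitcount));
	compress(block);		/* last block; final_step becomes 2 */
}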



2024-06-04 19:01:11

by Ard Biesheuvel

Subject: Re: [PATCH v4 5/8] crypto: arm64/sha256-ce - add support for finup_mb

On Mon, 3 Jun 2024 at 20:39, Eric Biggers <[email protected]> wrote:
>
> From: Eric Biggers <[email protected]>
>
> Add an implementation of finup_mb to sha256-ce, using an interleaving
> factor of 2. It interleaves a finup operation for two equal-length
> messages that share a common prefix. dm-verity and fs-verity will take
> advantage of this for greatly improved performance on capable CPUs.
>
> On an ARM Cortex-X1, this increases the throughput of SHA-256 hashing
> 4096-byte messages by 70%.
>
> Signed-off-by: Eric Biggers <[email protected]>

Reviewed-by: Ard Biesheuvel <[email protected]>
