2024-04-11 17:49:31

by Eric Biggers

Subject: [PATCH v2 0/4] crypto: x86/sha256-ni - cleanup and optimization

This patchset reduces the amount of source code duplication in
sha256_ni_asm.S and also reduces the binary code size slightly.

Changes in v2:
- Take advantage of .irp in patch 1
- Added two additional cleanup patches

Eric Biggers (4):
crypto: x86/sha256-ni - convert to use rounds macros
crypto: x86/sha256-ni - rename some register aliases
crypto: x86/sha256-ni - optimize code size
crypto: x86/sha256-ni - simplify do_4rounds

arch/x86/crypto/sha256_ni_asm.S | 253 +++++++-------------------------
1 file changed, 49 insertions(+), 204 deletions(-)


base-commit: 20cb38a7af88dc40095da7c2c9094da3873fea23
--
2.44.0



2024-04-11 17:49:39

by Eric Biggers

Subject: [PATCH v2 2/4] crypto: x86/sha256-ni - rename some register aliases

From: Eric Biggers <[email protected]>

MSGTMP[0-3] are used to hold the message schedule and are not temporary
registers per se. MSGTMP4 is used as a temporary register for several
different purposes and isn't really related to MSGTMP[0-3]. Rename them
to MSG[0-3] and TMP accordingly.

Also add a comment that clarifies what MSG is.

Suggested-by: Stefan Kanthak <[email protected]>
Signed-off-by: Eric Biggers <[email protected]>
---
arch/x86/crypto/sha256_ni_asm.S | 34 ++++++++++++++++-----------------
1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 498f67727b94..b7e7001dafdf 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -60,18 +60,18 @@
#define DATA_PTR %rsi /* 2nd arg */
#define NUM_BLKS %rdx /* 3rd arg */

#define SHA256CONSTANTS %rax

-#define MSG %xmm0
+#define MSG %xmm0 /* sha256rnds2 implicit operand */
#define STATE0 %xmm1
#define STATE1 %xmm2
-#define MSGTMP0 %xmm3
-#define MSGTMP1 %xmm4
-#define MSGTMP2 %xmm5
-#define MSGTMP3 %xmm6
-#define MSGTMP4 %xmm7
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define TMP %xmm7

#define SHUF_MASK %xmm8

#define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10
@@ -85,13 +85,13 @@
movdqa \m0, MSG
.endif
paddd \i*4(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
.if \i >= 12 && \i < 60
- movdqa \m0, MSGTMP4
- palignr $4, \m3, MSGTMP4
- paddd MSGTMP4, \m1
+ movdqa \m0, TMP
+ palignr $4, \m3, TMP
+ paddd TMP, \m1
sha256msg2 \m0, \m1
.endif
pshufd $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
.if \i >= 4 && \i < 52
@@ -131,27 +131,27 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
movdqu 0*16(DIGEST_PTR), STATE0
movdqu 1*16(DIGEST_PTR), STATE1

pshufd $0xB1, STATE0, STATE0 /* CDAB */
pshufd $0x1B, STATE1, STATE1 /* EFGH */
- movdqa STATE0, MSGTMP4
+ movdqa STATE0, TMP
palignr $8, STATE1, STATE0 /* ABEF */
- pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+ pblendw $0xF0, TMP, STATE1 /* CDGH */

movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
lea K256(%rip), SHA256CONSTANTS

.Lloop0:
/* Save hash values for addition after rounds */
movdqa STATE0, ABEF_SAVE
movdqa STATE1, CDGH_SAVE

.irp i, 0, 16, 32, 48
- do_4rounds (\i + 0), MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
- do_4rounds (\i + 4), MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
- do_4rounds (\i + 8), MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
- do_4rounds (\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
+ do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
+ do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
+ do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
+ do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
.endr

/* Add current hash values with previously saved */
paddd ABEF_SAVE, STATE0
paddd CDGH_SAVE, STATE1
@@ -162,13 +162,13 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
jne .Lloop0

/* Write hash values back in the correct order */
pshufd $0x1B, STATE0, STATE0 /* FEBA */
pshufd $0xB1, STATE1, STATE1 /* DCHG */
- movdqa STATE0, MSGTMP4
+ movdqa STATE0, TMP
pblendw $0xF0, STATE1, STATE0 /* DCBA */
- palignr $8, MSGTMP4, STATE1 /* HGFE */
+ palignr $8, TMP, STATE1 /* HGFE */

movdqu STATE0, 0*16(DIGEST_PTR)
movdqu STATE1, 1*16(DIGEST_PTR)

.Ldone_hash:
--
2.44.0


2024-04-11 17:49:53

by Eric Biggers

Subject: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds

From: Eric Biggers <[email protected]>

Instead of loading the message words into both MSG and \m0 and then
adding the round constants to MSG, load the message words into \m0 and
the round constants into MSG and then add \m0 to MSG. This shortens the
source code slightly. It changes the instructions slightly, but it
doesn't affect binary code size and doesn't seem to affect performance.

Suggested-by: Stefan Kanthak <[email protected]>
Signed-off-by: Eric Biggers <[email protected]>
---
arch/x86/crypto/sha256_ni_asm.S | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index ffc9f1c75c15..d515a55a3bc1 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -76,17 +76,15 @@
#define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10

.macro do_4rounds i, m0, m1, m2, m3
.if \i < 16
- movdqu \i*4(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, \m0
-.else
- movdqa \m0, MSG
+ movdqu \i*4(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0
.endif
- paddd (\i-32)*4(SHA256CONSTANTS), MSG
+ movdqa (\i-32)*4(SHA256CONSTANTS), MSG
+ paddd \m0, MSG
sha256rnds2 STATE0, STATE1
.if \i >= 12 && \i < 60
movdqa \m0, TMP
palignr $4, \m3, TMP
paddd TMP, \m1
--
2.44.0


2024-04-11 18:11:54

by Eric Biggers

Subject: [PATCH v2 1/4] crypto: x86/sha256-ni - convert to use rounds macros

From: Eric Biggers <[email protected]>

To avoid source code duplication, do the SHA-256 rounds using macros.
This reduces the length of sha256_ni_asm.S by 153 lines while still
producing the exact same object file.

Signed-off-by: Eric Biggers <[email protected]>
---
arch/x86/crypto/sha256_ni_asm.S | 211 +++++---------------------------
1 file changed, 29 insertions(+), 182 deletions(-)

diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 537b6dcd7ed8..498f67727b94 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -74,23 +74,43 @@
#define SHUF_MASK %xmm8

#define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10

+.macro do_4rounds i, m0, m1, m2, m3
+.if \i < 16
+ movdqu \i*4(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, \m0
+.else
+ movdqa \m0, MSG
+.endif
+ paddd \i*4(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+.if \i >= 12 && \i < 60
+ movdqa \m0, MSGTMP4
+ palignr $4, \m3, MSGTMP4
+ paddd MSGTMP4, \m1
+ sha256msg2 \m0, \m1
+.endif
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+.if \i >= 4 && \i < 52
+ sha256msg1 \m0, \m3
+.endif
+.endm
+
/*
* Intel SHA Extensions optimized implementation of a SHA-256 update function
*
* The function takes a pointer to the current hash values, a pointer to the
* input data, and a number of 64 byte blocks to process. Once all blocks have
* been processed, the digest pointer is updated with the resulting hash value.
* The function only processes complete blocks, there is no functionality to
* store partial blocks. All message padding and hash value initialization must
* be done outside the update function.
*
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
* void sha256_ni_transform(uint32_t *digest, const void *data,
uint32_t numBlocks);
* digest : pointer to digest
* data: pointer to input data
* numBlocks: Number of blocks to process
@@ -123,189 +143,16 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
.Lloop0:
/* Save hash values for addition after rounds */
movdqa STATE0, ABEF_SAVE
movdqa STATE1, CDGH_SAVE

- /* Rounds 0-3 */
- movdqu 0*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP0
- paddd 0*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 4-7 */
- movdqu 1*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP1
- paddd 1*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 8-11 */
- movdqu 2*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP2
- paddd 2*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 12-15 */
- movdqu 3*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP3
- paddd 3*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 16-19 */
- movdqa MSGTMP0, MSG
- paddd 4*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 20-23 */
- movdqa MSGTMP1, MSG
- paddd 5*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 24-27 */
- movdqa MSGTMP2, MSG
- paddd 6*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 28-31 */
- movdqa MSGTMP3, MSG
- paddd 7*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 32-35 */
- movdqa MSGTMP0, MSG
- paddd 8*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 36-39 */
- movdqa MSGTMP1, MSG
- paddd 9*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 40-43 */
- movdqa MSGTMP2, MSG
- paddd 10*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 44-47 */
- movdqa MSGTMP3, MSG
- paddd 11*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 48-51 */
- movdqa MSGTMP0, MSG
- paddd 12*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 52-55 */
- movdqa MSGTMP1, MSG
- paddd 13*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 56-59 */
- movdqa MSGTMP2, MSG
- paddd 14*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 60-63 */
- movdqa MSGTMP3, MSG
- paddd 15*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
+.irp i, 0, 16, 32, 48
+ do_4rounds (\i + 0), MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
+ do_4rounds (\i + 4), MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
+ do_4rounds (\i + 8), MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
+ do_4rounds (\i + 12), MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
+.endr

/* Add current hash values with previously saved */
paddd ABEF_SAVE, STATE0
paddd CDGH_SAVE, STATE1

--
2.44.0


2024-04-15 20:50:14

by Stefan Kanthak

Subject: Re: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds

"Eric Biggers" <[email protected]> wrote:

> Instead of loading the message words into both MSG and \m0 and then
> adding the round constants to MSG, load the message words into \m0 and
> the round constants into MSG and then add \m0 to MSG. This shortens the
> source code slightly. It changes the instructions slightly, but it
> doesn't affect binary code size and doesn't seem to affect performance.

At last the final change: write the macro straightforward and SIMPLE,
closely matching NIST.FIPS.180-4.pdf and their order of operations.

@@ ...
+.macro sha256 m0 :req, m1 :req, m2 :req, m3 :req
+.if \@ < 4
+ movdqu \@*16(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0 # \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
+.else
+ # \m0 = {w(\@*16-16), w(\@*16-15), w(\@*16-14), w(\@*16-13)}
+ # \m1 = {w(\@*16-12), w(\@*16-11), w(\@*16-10), w(\@*16-9)}
+ # \m2 = {w(\@*16-8), w(\@*16-7), w(\@*16-6), w(\@*16-5)}
+ # \m3 = {w(\@*16-4), w(\@*16-3), w(\@*16-2), w(\@*16-1)}
+ sha256msg1 \m1, \m0
+ movdqa \m3, TMP
+ palignr $4, \m2, TMP
+ paddd TMP, \m0
+ sha256msg2 \m3, \m0 # \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
+.endif
+ movdqa (\@-8)*16(SHA256CONSTANTS), MSG
+ paddd \m0, MSG
+ sha256rnds2 STATE0, STATE1 # STATE1 = {f', e', b', a'}
+ punpckhqdq MSG, MSG
+ sha256rnds2 STATE1, STATE0 # STATE0 = {f", e", b", a"},
+ # STATE1 = {h", g", d", c"}
+.endm

JFTR: you may simplify this further using .altmacro and generate \m0 to \m3
as MSG%(4-\@&3), MSG%(5-\@&3), MSG%(6-\@&3) and MSG%(7-\@&3) within
the macro, thus getting rid of its 4 arguments.

@@ ...
+.rept 4 # 4*4*4 rounds
+ sha256 MSG0, MSG1, MSG2, MSG3
+ sha256 MSG1, MSG2, MSG3, MSG0
+ sha256 MSG2, MSG3, MSG0, MSG1
+ sha256 MSG3, MSG0, MSG1, MSG2
+.endr
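
For a flavor of what that could look like, here is a rough, untested sketch
that generates the four register-name arguments at the invocation site from an
explicit counter symbol (the symbol "i" is ours, not from any patch; we assume
alternate macro mode expands an argument written as MSG%(expr) into the
matching alias MSG0..MSG3, as the JFTR above suggests — Stefan's fuller idea
derives the names inside the macro from \@ itself):

.altmacro
i = 0                           # round-group counter (our symbol), 0..15
.rept 16                        # 16 groups of 4 rounds = 64 rounds
 sha256 MSG%(i&3), MSG%((i+1)&3), MSG%((i+2)&3), MSG%((i+3)&3)
 i = i + 1
.endr                           # \@ inside sha256 still advances 0..15 here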

Now that all code written by Tim Chen and Sean Gulley is gone,
remove their copyright notice and insert your and my name instead.

regards
Stefan

PS: see <https://skanthak.homepage.t-online.de/fips-180.html>
(which I am still polishing), not just for this implementation.

PPS: if MASM had a counter like \@, I'd have used it there.

2024-04-15 21:21:29

by Eric Biggers

Subject: Re: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds

On Mon, Apr 15, 2024 at 10:41:07PM +0200, Stefan Kanthak wrote:
> "Eric Biggers" <[email protected]> wrote:
>
> > Instead of loading the message words into both MSG and \m0 and then
> > adding the round constants to MSG, load the message words into \m0 and
> > the round constants into MSG and then add \m0 to MSG. This shortens the
> > source code slightly. It changes the instructions slightly, but it
> > doesn't affect binary code size and doesn't seem to affect performance.
>
> At last the final change: write the macro straightforward and SIMPLE,
> closely matching NIST.FIPS.180-4.pdf and their order of operations.
>
> @@ ...
> +.macro sha256 m0 :req, m1 :req, m2 :req, m3 :req
> +.if \@ < 4
> + movdqu \@*16(DATA_PTR), \m0
> + pshufb SHUF_MASK, \m0 # \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
> +.else
> + # \m0 = {w(\@*16-16), w(\@*16-15), w(\@*16-14), w(\@*16-13)}
> + # \m1 = {w(\@*16-12), w(\@*16-11), w(\@*16-10), w(\@*16-9)}
> + # \m2 = {w(\@*16-8), w(\@*16-7), w(\@*16-6), w(\@*16-5)}
> + # \m3 = {w(\@*16-4), w(\@*16-3), w(\@*16-2), w(\@*16-1)}
> + sha256msg1 \m1, \m0
> + movdqa \m3, TMP
> + palignr $4, \m2, TMP
> + paddd TMP, \m0
> + sha256msg2 \m3, \m0 # \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
> +.endif
> + movdqa (\@-8)*16(SHA256CONSTANTS), MSG
> + paddd \m0, MSG
> + sha256rnds2 STATE0, STATE1 # STATE1 = {f', e', b', a'}
> + punpckhqdq MSG, MSG
> + sha256rnds2 STATE1, STATE0 # STATE0 = {f", e", b", a"},
> + # STATE1 = {h", g", d", c"}
> +.endm
>
> JFTR: you may simplify this further using .altmacro and generate \m0 to \m3
> as MSG%(4-\@&3), MSG%(5-\@&3), MSG%(6-\@&3) and MSG%(7-\@&3) within
> the macro, thus getting rid of its 4 arguments.
>
> @@ ...
> +.rept 4 # 4*4*4 rounds
> + sha256 MSG0, MSG1, MSG2, MSG3
> + sha256 MSG1, MSG2, MSG3, MSG0
> + sha256 MSG2, MSG3, MSG0, MSG1
> + sha256 MSG3, MSG0, MSG1, MSG2
> +.endr

Could you please send a real patch, following
Documentation/process/submitting-patches.rst? It's hard to understand what
you're proposing here.

> Now that all code written by Tim Chen and Sean Gulley is gone,
> remove their copyright notice and insert your and my name instead.

Well, their code has been cleaned up. We have to keep copyright notices around
unless we're certain they can go.

>
> regards
> Stefan
>
> PS: see <https://skanthak.homepage.t-online.de/fips-180.html>
> (which I am still polishing), not just for this implementation.
>
> PPS: if MASM had a counter like \@, I'd have used it there.

Thanks,

- Eric

2024-04-15 22:09:30

by Stefan Kanthak

Subject: Re: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds

"Eric Biggers" <[email protected]> wrote:

> On Mon, Apr 15, 2024 at 10:41:07PM +0200, Stefan Kanthak wrote:
[...]
>> At last the final change: write the macro straightforward and SIMPLE,
>> closely matching NIST.FIPS.180-4.pdf and their order of operations.
>>
>> @@ ...
>> +.macro sha256 m0 :req, m1 :req, m2 :req, m3 :req
>> +.if \@ < 4
>> + movdqu \@*16(DATA_PTR), \m0
>> + pshufb SHUF_MASK, \m0 # \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
>> +.else
>> + # \m0 = {w(\@*16-16), w(\@*16-15), w(\@*16-14), w(\@*16-13)}
>> + # \m1 = {w(\@*16-12), w(\@*16-11), w(\@*16-10), w(\@*16-9)}
>> + # \m2 = {w(\@*16-8), w(\@*16-7), w(\@*16-6), w(\@*16-5)}
>> + # \m3 = {w(\@*16-4), w(\@*16-3), w(\@*16-2), w(\@*16-1)}
>> + sha256msg1 \m1, \m0
>> + movdqa \m3, TMP
>> + palignr $4, \m2, TMP
>> + paddd TMP, \m0
>> + sha256msg2 \m3, \m0 # \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
>> +.endif
>> + movdqa (\@-8)*16(SHA256CONSTANTS), MSG
>> + paddd \m0, MSG
>> + sha256rnds2 STATE0, STATE1 # STATE1 = {f', e', b', a'}
>> + punpckhqdq MSG, MSG
>> + sha256rnds2 STATE1, STATE0 # STATE0 = {f", e", b", a"},
>> + # STATE1 = {h", g", d", c"}
>> +.endm
>>
>> JFTR: you may simplify this further using .altmacro and generate \m0 to \m3
>> as MSG%(4-\@&3), MSG%(5-\@&3), MSG%(6-\@&3) and MSG%(7-\@&3) within
>> the macro, thus getting rid of its 4 arguments.
>>
>> @@ ...
>> +.rept 4 # 4*4*4 rounds
>> + sha256 MSG0, MSG1, MSG2, MSG3
>> + sha256 MSG1, MSG2, MSG3, MSG0
>> + sha256 MSG2, MSG3, MSG0, MSG1
>> + sha256 MSG3, MSG0, MSG1, MSG2
>> +.endr
>
> Could you please send a real patch, following
> Documentation/process/submitting-patches.rst? It's hard to understand what
> you're proposing here.

1) I replace your macro (which unfortunately follows Tim Chen's twisted code)
COMPLETELY with a clean and simple implementation: message schedule first,
update of state variables last.
You don't need ".if \i >= 12 && \i < 60"/".if \i >= 4 && \i < 52" at all!

2) I replace the .irp which invokes your macro with a .rept: my macro uses \@
instead of an argument for the round number.

<https://sourceware.org/binutils/docs/as.html#Macro>

| \@
| as maintains a counter of how many macros it has executed in this pseudo-
| variable; you can copy that number to your output with '\@', but only
| within a macro definition.
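
A tiny, untested illustration of that counter (not from any patch in this
thread; note that \@ counts macro executions globally, so the values below
hold only if no other macro has run first):

.macro emit_count
 .byte \@            # \@ is textually replaced with the current execution count
.endm
.rept 3
 emit_count          # assembles .byte 0, then .byte 1, then .byte 2
.endr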

That's all.

regards
Stefan

2024-04-16 00:20:12

by Stefan Kanthak

Subject: Re: [PATCH v2 4/4] crypto: x86/sha256-ni - simplify do_4rounds

"Eric Biggers" <[email protected]> wrote:

> On Tue, Apr 16, 2024 at 12:04:56AM +0200, Stefan Kanthak wrote:
>> "Eric Biggers" <[email protected]> wrote:
>>
>> > On Mon, Apr 15, 2024 at 10:41:07PM +0200, Stefan Kanthak wrote:
>> [...]
>> >> At last the final change: write the macro straightforward and SIMPLE,
>> >> closely matching NIST.FIPS.180-4.pdf and their order of operations.
>> >>
>> >> @@ ...
>> >> +.macro sha256 m0 :req, m1 :req, m2 :req, m3 :req
>> >> +.if \@ < 4
>> >> + movdqu \@*16(DATA_PTR), \m0
>> >> + pshufb SHUF_MASK, \m0 # \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
>> >> +.else
>> >> + # \m0 = {w(\@*16-16), w(\@*16-15), w(\@*16-14), w(\@*16-13)}
>> >> + # \m1 = {w(\@*16-12), w(\@*16-11), w(\@*16-10), w(\@*16-9)}
>> >> + # \m2 = {w(\@*16-8), w(\@*16-7), w(\@*16-6), w(\@*16-5)}
>> >> + # \m3 = {w(\@*16-4), w(\@*16-3), w(\@*16-2), w(\@*16-1)}
>> >> + sha256msg1 \m1, \m0
>> >> + movdqa \m3, TMP
>> >> + palignr $4, \m2, TMP
>> >> + paddd TMP, \m0
>> >> + sha256msg2 \m3, \m0 # \m0 = {w(\@*16), w(\@*16+1), w(\@*16+2), w(\@*16+3)}
>> >> +.endif
>> >> + movdqa (\@-8)*16(SHA256CONSTANTS), MSG
>> >> + paddd \m0, MSG
>> >> + sha256rnds2 STATE0, STATE1 # STATE1 = {f', e', b', a'}
>> >> + punpckhqdq MSG, MSG
>> >> + sha256rnds2 STATE1, STATE0 # STATE0 = {f", e", b", a"},
>> >> + # STATE1 = {h", g", d", c"}
>> >> +.endm
>> >>
>> >> JFTR: you may simplify this further using .altmacro and generate \m0 to \m3
>> >> as MSG%(4-\@&3), MSG%(5-\@&3), MSG%(6-\@&3) and MSG%(7-\@&3) within
>> >> the macro, thus getting rid of its 4 arguments.
>> >>
>> >> @@ ...
>> >> +.rept 4 # 4*4*4 rounds
>> >> + sha256 MSG0, MSG1, MSG2, MSG3
>> >> + sha256 MSG1, MSG2, MSG3, MSG0
>> >> + sha256 MSG2, MSG3, MSG0, MSG1
>> >> + sha256 MSG3, MSG0, MSG1, MSG2
>> >> +.endr
>> >
>> > Could you please send a real patch, following
>> > Documentation/process/submitting-patches.rst? It's hard to understand what
>> > you're proposing here.
>>
>> 1) I replace your macro (which unfortunately follows Tim Chen's twisted code)
>> COMPLETELY with a clean and simple implementation: message schedule first,
>> update of state variables last.
>> You don't need ".if \i >= 12 && \i < 60"/".if \i >= 4 && \i < 52" at all!
>
> It's probably intentional that the code does the message schedule computations a
> bit ahead of time. This might improve performance by reducing the time spent
> waiting for the message schedule.

While this is a valid point, Intel states in
<https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html>
only for SHA-1:

| In other words, the rounds processing is the critical path and the latency of
| sha1rnds4 determines the performance of SHA-1 calculations.

For SHA-256 no such explicit statement is given: did the authors consider it not
worthwhile?

JFTR: while Tim Chen's code (following the paper) executes 3 instructions and 1
sha256msg2 between every pair of sha256rnds2, my macro executes the two
sha256rnds2 back to back, so my code would be slower if their latency
determines performance.

> It would be worth trying a few different variants on different CPUs and seeing
> how they actually perform in practice, though.

Right. I noticed no difference on Zen2+ and Zen3; Intel CPUs with SHA-NI are not
available to me (I didn't bother to use __asm__ on Compiler Explorer).

Stefan