2024-04-08 09:45:15

by Stefan Kanthak

Subject: [PATCH 1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S

Use shorter SSE2 instructions instead of some SSE4.1 instructions, and use
short (one-byte) displacements into K256.
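
K256 is 16 rows of 16 bytes, so the second half of the table (offsets 128..240)
needs 4-byte displacements; biasing SHA256CONSTANTS by +8*16 keeps every offset
within the signed-byte range -128..+112. A rough sketch of the encoding
difference (byte counts are my reading of the manuals, assuming SHA256CONSTANTS
stays a legacy register such as %rax, i.e. no REX prefix):

        paddd    8*16(%rax), %xmm0   /* 66 0f fe 80 80 00 00 00 (8 bytes, disp32) */
        paddd   -8*16(%rax), %xmm0   /* 66 0f fe 40 80          (5 bytes, disp8)  */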

--- -/arch/x86/crypto/sha256_ni_asm.S
+++ +/arch/x86/crypto/sha256_ni_asm.S
@@ -108,17 +108,17 @@
* Need to reorder these appropriately
* DCBA, HGFE -> ABEF, CDGH
*/
- movdqu 0*16(DIGEST_PTR), STATE0
- movdqu 1*16(DIGEST_PTR), STATE1
+ movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */
+ movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */

- pshufd $0xB1, STATE0, STATE0 /* CDAB */
- pshufd $0x1B, STATE1, STATE1 /* EFGH */
movdqa STATE0, MSGTMP4
- palignr $8, STATE1, STATE0 /* ABEF */
- pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+ punpcklqdq STATE1, STATE0 /* FEBA */
+ punpckhqdq MSGTMP4, STATE1 /* DCHG */
+ pshufd $0x1B, STATE0, STATE0 /* ABEF */
+ pshufd $0xB1, STATE1, STATE1 /* CDGH */

movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
- lea K256(%rip), SHA256CONSTANTS
+ lea K256+8*16(%rip), SHA256CONSTANTS

.Lloop0:
/* Save hash values for addition after rounds */
@@ -129,18 +129,18 @@
movdqu 0*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG
movdqa MSG, MSGTMP0
- paddd 0*16(SHA256CONSTANTS), MSG
+ paddd -8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0

/* Rounds 4-7 */
movdqu 1*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG
movdqa MSG, MSGTMP1
- paddd 1*16(SHA256CONSTANTS), MSG
+ paddd -7*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP1, MSGTMP0

@@ -148,9 +148,9 @@
movdqu 2*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG
movdqa MSG, MSGTMP2
- paddd 2*16(SHA256CONSTANTS), MSG
+ paddd -6*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP2, MSGTMP1

@@ -158,151 +158,151 @@
movdqu 3*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG
movdqa MSG, MSGTMP3
- paddd 3*16(SHA256CONSTANTS), MSG
+ paddd -5*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP3, MSGTMP4
palignr $4, MSGTMP2, MSGTMP4
paddd MSGTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP3, MSGTMP2

/* Rounds 16-19 */
movdqa MSGTMP0, MSG
- paddd 4*16(SHA256CONSTANTS), MSG
+ paddd -4*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP0, MSGTMP4
palignr $4, MSGTMP3, MSGTMP4
paddd MSGTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP0, MSGTMP3

/* Rounds 20-23 */
movdqa MSGTMP1, MSG
- paddd 5*16(SHA256CONSTANTS), MSG
+ paddd -3*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP1, MSGTMP4
palignr $4, MSGTMP0, MSGTMP4
paddd MSGTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP1, MSGTMP0

/* Rounds 24-27 */
movdqa MSGTMP2, MSG
- paddd 6*16(SHA256CONSTANTS), MSG
+ paddd -2*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP2, MSGTMP4
palignr $4, MSGTMP1, MSGTMP4
paddd MSGTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP2, MSGTMP1

/* Rounds 28-31 */
movdqa MSGTMP3, MSG
- paddd 7*16(SHA256CONSTANTS), MSG
+ paddd -1*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP3, MSGTMP4
palignr $4, MSGTMP2, MSGTMP4
paddd MSGTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP3, MSGTMP2

/* Rounds 32-35 */
movdqa MSGTMP0, MSG
- paddd 8*16(SHA256CONSTANTS), MSG
+ paddd 0*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP0, MSGTMP4
palignr $4, MSGTMP3, MSGTMP4
paddd MSGTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP0, MSGTMP3

/* Rounds 36-39 */
movdqa MSGTMP1, MSG
- paddd 9*16(SHA256CONSTANTS), MSG
+ paddd 1*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP1, MSGTMP4
palignr $4, MSGTMP0, MSGTMP4
paddd MSGTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP1, MSGTMP0

/* Rounds 40-43 */
movdqa MSGTMP2, MSG
- paddd 10*16(SHA256CONSTANTS), MSG
+ paddd 2*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP2, MSGTMP4
palignr $4, MSGTMP1, MSGTMP4
paddd MSGTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP2, MSGTMP1

/* Rounds 44-47 */
movdqa MSGTMP3, MSG
- paddd 11*16(SHA256CONSTANTS), MSG
+ paddd 3*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP3, MSGTMP4
palignr $4, MSGTMP2, MSGTMP4
paddd MSGTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP3, MSGTMP2

/* Rounds 48-51 */
movdqa MSGTMP0, MSG
- paddd 12*16(SHA256CONSTANTS), MSG
+ paddd 4*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP0, MSGTMP4
palignr $4, MSGTMP3, MSGTMP4
paddd MSGTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP0, MSGTMP3

/* Rounds 52-55 */
movdqa MSGTMP1, MSG
- paddd 13*16(SHA256CONSTANTS), MSG
+ paddd 5*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP1, MSGTMP4
palignr $4, MSGTMP0, MSGTMP4
paddd MSGTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0

/* Rounds 56-59 */
movdqa MSGTMP2, MSG
- paddd 14*16(SHA256CONSTANTS), MSG
+ paddd 6*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
movdqa MSGTMP2, MSGTMP4
palignr $4, MSGTMP1, MSGTMP4
paddd MSGTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0

/* Rounds 60-63 */
movdqa MSGTMP3, MSG
- paddd 15*16(SHA256CONSTANTS), MSG
+ paddd 7*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
+ punpckhqdq MSG, MSG
sha256rnds2 STATE1, STATE0

/* Add current hash values with previously saved */
@@ -315,11 +315,11 @@
jne .Lloop0

/* Write hash values back in the correct order */
- pshufd $0x1B, STATE0, STATE0 /* FEBA */
- pshufd $0xB1, STATE1, STATE1 /* DCHG */
movdqa STATE0, MSGTMP4
- pblendw $0xF0, STATE1, STATE0 /* DCBA */
- palignr $8, MSGTMP4, STATE1 /* HGFE */
+ punpcklqdq STATE1, STATE0 /* GHEF */
+ punpckhqdq MSGTMP4, STATE1 /* ABCD */
+ pshufd $0xB1, STATE0, STATE0 /* HGFE */
+ pshufd $0x1B, STATE1, STATE1 /* DCBA */

movdqu STATE0, 0*16(DIGEST_PTR)
movdqu STATE1, 1*16(DIGEST_PTR)



2024-04-08 12:37:51

by Eric Biggers

Subject: Re: [PATCH 1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S

On Mon, Apr 08, 2024 at 11:26:52AM +0200, Stefan Kanthak wrote:
> Use shorter SSE2 instructions instead of some SSE4.1
> use short displacements into K256
>
> --- -/arch/x86/crypto/sha256_ni_asm.S
> +++ +/arch/x86/crypto/sha256_ni_asm.S

Thanks! I'd like to benchmark this to see how it affects performance, but
unfortunately this patch doesn't apply. It looks like your email client corrupted
your patch by replacing tabs with spaces. Can you please use 'git send-email'
to send patches?

- Eric

2024-04-08 13:17:41

by Eric Biggers

Subject: Re: [PATCH 1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S

On Mon, Apr 08, 2024 at 11:26:52AM +0200, Stefan Kanthak wrote:
> @@ -315,11 +315,11 @@
> jne .Lloop0
>
> /* Write hash values back in the correct order */
> - pshufd $0x1B, STATE0, STATE0 /* FEBA */
> - pshufd $0xB1, STATE1, STATE1 /* DCHG */
> movdqa STATE0, MSGTMP4
> - pblendw $0xF0, STATE1, STATE0 /* DCBA */
> - palignr $8, MSGTMP4, STATE1 /* HGFE */
> + punpcklqdq STATE1, STATE0 /* GHEF */
> + punpckhqdq MSGTMP4, STATE1 /* ABCD */
> + pshufd $0xB1, STATE0, STATE0 /* HGFE */
> + pshufd $0x1B, STATE1, STATE1 /* DCBA */
>
> movdqu STATE0, 0*16(DIGEST_PTR)
> movdqu STATE1, 1*16(DIGEST_PTR)

Please make sure to run the crypto self-tests too. The above is storing the two
halves of the state in the wrong order.
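
If I'm reading the new epilogue correctly, STATE0 ends up holding HGFE and
STATE1 DCBA (as your own comments say), so the two stores would have to swap
destinations, roughly (untested sketch):

        movdqu STATE1, 0*16(DIGEST_PTR)   /* DCBA */
        movdqu STATE0, 1*16(DIGEST_PTR)   /* HGFE */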

Thanks,

- Eric

2024-04-08 15:18:54

by Eric Biggers

Subject: Re: [PATCH 1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S

[+Cc linux-crypto]

Please use reply-all so that the list gets included.

On Mon, Apr 08, 2024 at 04:15:32PM +0200, Stefan Kanthak wrote:
> Hi Eric,
>
> > On Mon, Apr 08, 2024 at 11:26:52AM +0200, Stefan Kanthak wrote:
> >> Use shorter SSE2 instructions instead of some SSE4.1
> >> use short displacements into K256
> >>
> >> --- -/arch/x86/crypto/sha256_ni_asm.S
> >> +++ +/arch/x86/crypto/sha256_ni_asm.S
> >
> > Thanks! I'd like to benchmark this to see how it affects performance,
>
> Performance is NOT affected: if CPUs weren't superscalar, the patch might
> save 2 to 4 processor cycles as it replaces palignr/pblendw (slow) with
> punpck*qdq (fast and shorter)
>
> > but unfortunately this patch doesn't apply. It looks like your email client
> > corrupted your patch by replacing tabs with spaces. Can you please use
> > 'git send-email' to send patches?
>
> I don't use git at all; I'll use cURL instead.
> Since the information on vger.kernel.org states "text/plain", no multipart,
> I assume that attachments are also not accepted?

Please read Documentation/process/submitting-patches.rst, which explains how to
submit Linux kernel patches.

> >> + pshufd $0xB1, STATE0, STATE0 /* HGFE */
> >> + pshufd $0x1B, STATE1, STATE1 /* DCBA */
> >>
> >> movdqu STATE0, 0*16(DIGEST_PTR)
> >> movdqu STATE1, 1*16(DIGEST_PTR)
> >
> > Please make sure to run the crypto self-tests too.
>
> I can't, I don't use Linux at all; I just noticed that this function uses
> 4-byte displacements and palignr/pblendw instead of punpck?qdq after pshufd
>
> > The above is storing the two halves of the state in the wrong order.
>
> ARGH, you are right; I recognized it too, wanted to correct it, but was
> interrupted and forgot it after returning to the patch. Sorry.

I'm afraid that if you don't submit a properly formatted and tested patch, your
patch can't be accepted. We can treat it as a suggestion, though; since you're
sending actual code, it would really help if it had your Signed-off-by.

- Eric

2024-04-09 12:51:57

by Eric Biggers

Subject: Re: [PATCH 1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S

On Tue, Apr 09, 2024 at 12:23:13PM +0200, Stefan Kanthak wrote:
> "Eric Biggers" <[email protected]> wrote:
>
> > [+Cc linux-crypto]
> >
> > Please use reply-all so that the list gets included.
> >
> > On Mon, Apr 08, 2024 at 04:15:32PM +0200, Stefan Kanthak wrote:
> >> Hi Eric,
> >>
> >> > On Mon, Apr 08, 2024 at 11:26:52AM +0200, Stefan Kanthak wrote:
> >> >> Use shorter SSE2 instructions instead of some SSE4.1
> >> >> use short displacements into K256
> >> >>
> >> >> --- -/arch/x86/crypto/sha256_ni_asm.S
> >> >> +++ +/arch/x86/crypto/sha256_ni_asm.S
> >> >
> >> > Thanks! I'd like to benchmark this to see how it affects performance,
> >>
> >> Performance is NOT affected: if CPUs weren't superscalar, the patch might
> >> save 2 to 4 processor cycles as it replaces palignr/pblendw (slow) with
> >> punpck*qdq (fast and shorter)
> >>
> >> > but unfortunately this patch doesn't apply. It looks like your email client
> >> > corrupted your patch by replacing tabs with spaces. Can you please use
> >> > 'git send-email' to send patches?
> >>
> >> I don't use git at all; I'll use cURL instead.
>
> [...]
>
> >> > Please make sure to run the crypto self-tests too.
> >>
> >> I can't, I don't use Linux at all; I just noticed that this function uses
> >> 4-byte displacements and palignr/pblendw instead of punpck?qdq after pshufd
> >>
> >> > The above is storing the two halves of the state in the wrong order.
> >>
> >> ARGH, you are right; I recognized it too, wanted to correct it, but was
> >> interrupted and forgot it after returning to the patch. Sorry.
> >
> > I'm afraid that if you don't submit a properly formatted and tested patch, your
> > patch can't be accepted. We can treat it as a suggestion, though; since you're
> > sending actual code, it would really help if it had your Signed-off-by.
>
> Treat it as a suggestion.

All right. I'll send out a properly formatted and tested patch then. I'd also
like to convert the SHA-256 rounds to use macros, which would make the source
150 lines shorter (without changing the output). I'll probably do that first.
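
Something along these lines, as an untested sketch (i is the first round of each
group of four; m2 is only passed along to keep the register rotation regular):

.macro	do_4rounds	i, m0, m1, m2, m3
.if \i < 16
	movdqu		\i*4(DATA_PTR), MSG
	pshufb		SHUF_MASK, MSG
	movdqa		MSG, \m0
.else
	movdqa		\m0, MSG
.endif
	paddd		\i*4(SHA256CONSTANTS), MSG
	sha256rnds2	STATE0, STATE1
.if \i >= 12 && \i < 60
	movdqa		\m0, MSGTMP4
	palignr		$4, \m3, MSGTMP4
	paddd		MSGTMP4, \m1
	sha256msg2	\m0, \m1
.endif
	pshufd		$0x0E, MSG, MSG
	sha256rnds2	STATE1, STATE0
.if \i >= 4 && \i < 52
	sha256msg1	\m0, \m3
.endif
.endm

	do_4rounds	 0, MSGTMP0, MSGTMP1, MSGTMP2, MSGTMP3
	do_4rounds	 4, MSGTMP1, MSGTMP2, MSGTMP3, MSGTMP0
	do_4rounds	 8, MSGTMP2, MSGTMP3, MSGTMP0, MSGTMP1
	do_4rounds	12, MSGTMP3, MSGTMP0, MSGTMP1, MSGTMP2
	/* ... and so on, rotating the MSGTMP registers, through do_4rounds 60 */

Invoked 16 times like that, it should expand to the same instruction sequence as
the current unrolled code.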

> I do wonder, though, that in the past 9 years since Tim Chen submitted the
> SHA-NI code (which has been copied umpteen times by others into their own
> code bases) nobody noticed or questioned (or, if they did, bothered to
> submit a patch like mine, which reduces the code size by 5%, upstream) why
> he used 16x "pshufd $14, %xmm0, %xmm0" instead of "punpckhqdq %xmm0, %xmm0",
> which is 1 byte shorter, or "psrldq $8, %xmm0" (both of which MAY execute on
> more ports or faster than the shuffle instructions, depending on the
> micro-architecture), why he used a 4-byte instead of a 1-byte displacement
> 8 times, or why he used "palignr/pblendw" instead of the shorter
> "punpck?qdq".

Not many people work on crypto assembly code, and x86 SIMD is especially
difficult because there are often multiple ways to do things that differ in
subtle ways such as instruction length and the execution ports used on different
models of CPU. I think your suggestions are good, so thanks for them.
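
For instance, each of these moves the upper two WK values of %xmm0 down into the
low 64 bits, which is all the second sha256rnds2 reads, yet they differ in length
and may compete for different execution ports depending on the CPU (byte counts
are my reading of the encodings for xmm0..xmm7 operands):

	/* any one of the following does the job: */
	pshufd		$0x0E, %xmm0, %xmm0	/* 5 bytes: 66 0f 70 c0 0e */
	punpckhqdq	%xmm0, %xmm0		/* 4 bytes: 66 0f 6d c0    */
	psrldq		$8, %xmm0		/* 5 bytes: 66 0f 73 d8 08 */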

>
> regards
> Stefan
>
> PS: aaaahhhh, you picked my suggestion up and applied it to the AES routine.

Yes, I realized that a similar optimization can apply to AES round keys, as they
can be indexed from -112 through 112 instead of 0 through 224.
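
Roughly like this (an untested sketch with made-up names KEYS/STATE; AES-256 has
15 round keys, i.e. 240 bytes of key schedule):

	lea		7*16(KEYS), KEYS	/* bias the pointer by 112 bytes   */
	pxor		-7*16(KEYS), STATE	/* whitening key,  was  0*16(KEYS) */
	aesenc		-6*16(KEYS), STATE	/* round  1,       was  1*16(KEYS) */
	/* ... rounds 2-13 at -5*16(KEYS) through 6*16(KEYS), all disp8 ...      */
	aesenclast	 7*16(KEYS), STATE	/* round 14, was 14*16(KEYS): disp8 instead of disp32 */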

- Eric

2024-04-09 13:05:32

by Stefan Kanthak

Subject: Re: [PATCH 1/2] crypto: s(h)aving 40+ bytes off arch/x86/crypto/sha256_ni_asm.S

"Eric Biggers" <[email protected]> wrote:

> [+Cc linux-crypto]
>
> Please use reply-all so that the list gets included.
>
> On Mon, Apr 08, 2024 at 04:15:32PM +0200, Stefan Kanthak wrote:
>> Hi Eric,
>>
>> > On Mon, Apr 08, 2024 at 11:26:52AM +0200, Stefan Kanthak wrote:
>> >> Use shorter SSE2 instructions instead of some SSE4.1
>> >> use short displacements into K256
>> >>
>> >> --- -/arch/x86/crypto/sha256_ni_asm.S
>> >> +++ +/arch/x86/crypto/sha256_ni_asm.S
>> >
>> > Thanks! I'd like to benchmark this to see how it affects performance,
>>
>> Performance is NOT affected: if CPUs weren't superscalar, the patch might
>> save 2 to 4 processor cycles as it replaces palignr/pblendw (slow) with
>> punpck*qdq (fast and shorter)
>>
>> > but unfortunately this patch doesn't apply. It looks like your email client
>> > corrupted your patch by replacing tabs with spaces. Can you please use
>> > 'git send-email' to send patches?
>>
>> I don't use git at all; I'll use cURL instead.

[...]

>> > Please make sure to run the crypto self-tests too.
>>
>> I can't, I don't use Linux at all; I just noticed that this function uses
>> 4-byte displacements and palignr/pblendw instead of punpck?qdq after pshufd
>>
>> > The above is storing the two halves of the state in the wrong order.
>>
>> ARGH, you are right; I recognized it too, wanted to correct it, but was
>> interrupted and forgot it after returning to the patch. Sorry.
>
> I'm afraid that if you don't submit a properly formatted and tested patch, your
> patch can't be accepted. We can treat it as a suggestion, though; since you're
> sending actual code, it would really help if it had your Signed-off-by.

Treat it as a suggestion.
I do wonder, though, that in the past 9 years since Tim Chen submitted the
SHA-NI code (which has been copied umpteen times by others into their own
code bases) nobody noticed or questioned (or, if they did, bothered to
submit a patch like mine, which reduces the code size by 5%, upstream) why
he used 16x "pshufd $14, %xmm0, %xmm0" instead of "punpckhqdq %xmm0, %xmm0",
which is 1 byte shorter, or "psrldq $8, %xmm0" (both of which MAY execute on
more ports or faster than the shuffle instructions, depending on the
micro-architecture), why he used a 4-byte instead of a 1-byte displacement
8 times, or why he used "palignr/pblendw" instead of the shorter
"punpck?qdq".

regards
Stefan

PS: aaaahhhh, you picked my suggestion up and applied it to the AES routine.