2024-04-09 00:02:20

by Eric Biggers

Subject: [PATCH] crypto: x86/aes-xts - access round keys using single-byte offsets

From: Eric Biggers <[email protected]>

Access the AES round keys using offsets -7*16 through 7*16, instead of
0*16 through 14*16. This allows VEX-encoded instructions to address all
round keys using 1-byte offsets, whereas before some needed 4-byte
offsets. This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
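
To illustrate (the instructions below are hypothetical, not taken from the
diff): in 64-bit code, a nonzero displacement off a base register is encoded
in either 1 byte or 4 bytes, and the 1-byte form requires the displacement to
fit in a signed byte, i.e. in [-128, 127]. With KEY pointing at round key 0,
round keys 8 through 14 sit at offsets 128 through 224 and force the 4-byte
form; with KEY pointing at round key 7, every round key falls within
[-112, 112], so each such access shrinks by 3 bytes:

	vaesenc		8*16(KEY), %xmm0, %xmm0	// disp = 128: 4-byte form
	vaesenc		1*16(KEY), %xmm0, %xmm0	// disp = 16: 1-byte form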

Signed-off-by: Eric Biggers <[email protected]>
---
arch/x86/crypto/aes-xts-avx-x86_64.S | 81 +++++++++++++++-------------
1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index fcaf64a2f8c6..95e412e7601d 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -80,11 +80,11 @@
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.text

// Function parameters
.set KEY, %rdi // Initially points to crypto_aes_ctx, then is
- // advanced to point directly to the round keys
+ // advanced to point directly to the 7th round key
.set SRC, %rsi // Pointer to next source data
.set DST, %rdx // Pointer to next destination data
.set LEN, %rcx // Remaining length in bytes
.set TWEAK, %r8 // Pointer to next tweak

@@ -406,28 +406,28 @@
.endif
.endm

// Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
.macro _load_round_keys
- _vbroadcast128 0*16(KEY), KEY0
+ _vbroadcast128 -7*16(KEY), KEY0
.if USE_AVX10
- _vbroadcast128 1*16(KEY), KEY1
- _vbroadcast128 2*16(KEY), KEY2
- _vbroadcast128 3*16(KEY), KEY3
- _vbroadcast128 4*16(KEY), KEY4
- _vbroadcast128 5*16(KEY), KEY5
- _vbroadcast128 6*16(KEY), KEY6
- _vbroadcast128 7*16(KEY), KEY7
- _vbroadcast128 8*16(KEY), KEY8
- _vbroadcast128 9*16(KEY), KEY9
- _vbroadcast128 10*16(KEY), KEY10
+ _vbroadcast128 -6*16(KEY), KEY1
+ _vbroadcast128 -5*16(KEY), KEY2
+ _vbroadcast128 -4*16(KEY), KEY3
+ _vbroadcast128 -3*16(KEY), KEY4
+ _vbroadcast128 -2*16(KEY), KEY5
+ _vbroadcast128 -1*16(KEY), KEY6
+ _vbroadcast128 0*16(KEY), KEY7
+ _vbroadcast128 1*16(KEY), KEY8
+ _vbroadcast128 2*16(KEY), KEY9
+ _vbroadcast128 3*16(KEY), KEY10
// Note: if it's AES-128 or AES-192, the last several round keys won't
// be used. We do the loads anyway to save a conditional jump.
- _vbroadcast128 11*16(KEY), KEY11
- _vbroadcast128 12*16(KEY), KEY12
- _vbroadcast128 13*16(KEY), KEY13
- _vbroadcast128 14*16(KEY), KEY14
+ _vbroadcast128 4*16(KEY), KEY11
+ _vbroadcast128 5*16(KEY), KEY12
+ _vbroadcast128 6*16(KEY), KEY13
+ _vbroadcast128 7*16(KEY), KEY14
.endif
.endm

// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
// on the block(s) in \data using the round key(s) in \key. The register length
@@ -454,13 +454,13 @@
.macro _vaes_1x enc, last, i, xmm_suffix, data
.if USE_AVX10
_vaes \enc, \last, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
- _vaes \enc, \last, \i*16(KEY), \data
+ _vaes \enc, \last, (\i-7)*16(KEY), \data
.else
- _vbroadcast128 \i*16(KEY), V4
+ _vbroadcast128 (\i-7)*16(KEY), V4
_vaes \enc, \last, V4, \data
.endif
.endif
.endm

@@ -475,11 +475,11 @@
_vaes \enc, \last, KEY\i, V1
_tweak_step (2*(\i-1) + 1)
_vaes \enc, \last, KEY\i, V2
_vaes \enc, \last, KEY\i, V3
.else
- _vbroadcast128 \i*16(KEY), V4
+ _vbroadcast128 (\i-7)*16(KEY), V4
_tweak_step (2*(\i-1))
_vaes \enc, \last, V4, V0
_vaes \enc, \last, V4, V1
_tweak_step (2*(\i-1) + 1)
_vaes \enc, \last, V4, V2
@@ -526,13 +526,19 @@
_define_aliases

// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
movl 480(KEY), KEYLEN

- // If decrypting, advance KEY to the decryption round keys.
-.if !\enc
- add $240, KEY
+ // Advance KEY to point to the 7th encryption round key (if encrypting)
+ // or the 7th decryption round key (if decrypting). This puts the
+ // offset to any round key in the range [-112, 112], which fits in a
+ // signed byte. This shortens VEX-encoded instructions that access the
+ // 8th and later round keys, which would otherwise need 4-byte offsets.
+.if \enc
+ add $7*16, KEY
+.else
+ add $(15+7)*16, KEY
.endif

// Check whether the data length is a multiple of the AES block length.
test $15, LEN
jnz .Lneed_cts\@
@@ -751,40 +757,41 @@

// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
// u8 iv[AES_BLOCK_SIZE]);
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
vmovdqu (%rsi), %xmm0
- vpxor 0*16(%rdi), %xmm0, %xmm0
+ add $7*16, %rdi
+ vpxor -7*16(%rdi), %xmm0, %xmm0
+ vaesenc -6*16(%rdi), %xmm0, %xmm0
+ vaesenc -5*16(%rdi), %xmm0, %xmm0
+ vaesenc -4*16(%rdi), %xmm0, %xmm0
+ vaesenc -3*16(%rdi), %xmm0, %xmm0
+ vaesenc -2*16(%rdi), %xmm0, %xmm0
+ vaesenc -1*16(%rdi), %xmm0, %xmm0
+ vaesenc 0*16(%rdi), %xmm0, %xmm0
vaesenc 1*16(%rdi), %xmm0, %xmm0
vaesenc 2*16(%rdi), %xmm0, %xmm0
+ cmpl $24, 480-(7*16)(%rdi)
+ jle .Lencrypt_iv_aes_128_or_192
vaesenc 3*16(%rdi), %xmm0, %xmm0
vaesenc 4*16(%rdi), %xmm0, %xmm0
vaesenc 5*16(%rdi), %xmm0, %xmm0
vaesenc 6*16(%rdi), %xmm0, %xmm0
- vaesenc 7*16(%rdi), %xmm0, %xmm0
- vaesenc 8*16(%rdi), %xmm0, %xmm0
- vaesenc 9*16(%rdi), %xmm0, %xmm0
- cmpl $24, 480(%rdi)
- jle .Lencrypt_iv_aes_128_or_192
- vaesenc 10*16(%rdi), %xmm0, %xmm0
- vaesenc 11*16(%rdi), %xmm0, %xmm0
- vaesenc 12*16(%rdi), %xmm0, %xmm0
- vaesenc 13*16(%rdi), %xmm0, %xmm0
- vaesenclast 14*16(%rdi), %xmm0, %xmm0
+ vaesenclast 7*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_done:
vmovdqu %xmm0, (%rsi)
RET

// Out-of-line handling of AES-128 and AES-192
.Lencrypt_iv_aes_128_or_192:
jz .Lencrypt_iv_aes_192
- vaesenclast 10*16(%rdi), %xmm0, %xmm0
+ vaesenclast 3*16(%rdi), %xmm0, %xmm0
jmp .Lencrypt_iv_done
.Lencrypt_iv_aes_192:
- vaesenc 10*16(%rdi), %xmm0, %xmm0
- vaesenc 11*16(%rdi), %xmm0, %xmm0
- vaesenclast 12*16(%rdi), %xmm0, %xmm0
+ vaesenc 3*16(%rdi), %xmm0, %xmm0
+ vaesenc 4*16(%rdi), %xmm0, %xmm0
+ vaesenclast 5*16(%rdi), %xmm0, %xmm0
jmp .Lencrypt_iv_done
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro. They all have the following prototype:

base-commit: 4ad27a8be9dbefd4820da0f60da879d512b2f659
prerequisite-patch-id: 8d09ed747039f5e718ac7267e2a15e22504aa7f3
--
2.44.0



2024-04-09 12:11:50

by Eric Biggers

Subject: Re: [PATCH] crypto: x86/aes-xts - access round keys using single-byte offsets

On Tue, Apr 09, 2024 at 11:12:11AM +0200, Ard Biesheuvel wrote:
> On Tue, 9 Apr 2024 at 02:02, Eric Biggers <[email protected]> wrote:
> >
> > From: Eric Biggers <[email protected]>
> >
> > Access the AES round keys using offsets -7*16 through 7*16, instead of
> > 0*16 through 14*16. This allows VEX-encoded instructions to address all
> > round keys using 1-byte offsets, whereas before some needed 4-byte
> > offsets. This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
> >
> > Signed-off-by: Eric Biggers <[email protected]>
>
> Nice optimization!
>
> Do you think we might be able to macrofy this a bit so we can use zero
> based indexing for the round keys, and hide the arithmetic?
>
>

There are two alternatives I considered: defining variables KEYOFF0 through
KEYOFF14 and writing the offsets as KEYOFF\i(KEY), or defining one variable
KEYOFF and writing the offsets as \i*16-KEYOFF(KEY). I think I slightly prefer
the current patch where it's less abstracted out, though. It makes it clear the
offsets really are single-byte, and also index 7 is the exact mid-point so going
from -7 to 7 still feels fairly natural. If we wanted to do something more
complex like use different offsets for AVX vs. AVX512, then we'd need the
abstraction to handle that, but it doesn't seem useful to do that.
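
For concreteness, the two alternatives would look roughly like this (untested
sketch, not a real proposal):

	// Alternative 1: one offset variable per round key
	.set	KEYOFF0,	-7*16
	.set	KEYOFF1,	-6*16
	// ... and so on through KEYOFF14 = 7*16. Call sites in the
	// macros would then use:
	_vbroadcast128	KEYOFF\i(KEY), V4

	// Alternative 2: a single bias variable, zero-based call sites
	.set	KEYOFF,		7*16
	_vbroadcast128	\i*16-KEYOFF(KEY), V4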

- Eric

2024-04-09 12:45:06

by Ard Biesheuvel

Subject: Re: [PATCH] crypto: x86/aes-xts - access round keys using single-byte offsets

On Tue, 9 Apr 2024 at 14:11, Eric Biggers <[email protected]> wrote:
>
> On Tue, Apr 09, 2024 at 11:12:11AM +0200, Ard Biesheuvel wrote:
> > On Tue, 9 Apr 2024 at 02:02, Eric Biggers <[email protected]> wrote:
> > >
> > > From: Eric Biggers <[email protected]>
> > >
> > > Access the AES round keys using offsets -7*16 through 7*16, instead of
> > > 0*16 through 14*16. This allows VEX-encoded instructions to address all
> > > round keys using 1-byte offsets, whereas before some needed 4-byte
> > > offsets. This decreases the code size of aes-xts-avx-x86_64.o by 4.2%.
> > >
> > > Signed-off-by: Eric Biggers <[email protected]>
> >
> > Nice optimization!
> >
> > Do you think we might be able to macrofy this a bit so we can use zero
> > based indexing for the round keys, and hide the arithmetic?
> >
> >
>
> There are two alternatives I considered: defining variables KEYOFF0 through
> KEYOFF14 and writing the offsets as KEYOFF\i(KEY), or defining one variable
> KEYOFF and writing the offsets as \i*16-KEYOFF(KEY). I think I slightly prefer
> the current patch where it's less abstracted out, though. It makes it clear the
> offsets really are single-byte, and also index 7 is the exact mid-point so going
> from -7 to 7 still feels fairly natural. If we wanted to do something more
> complex like use different offsets for AVX vs. AVX512, then we'd need the
> abstraction to handle that, but it doesn't seem useful to do that.
>

Fair enough.