2021-01-16 17:17:12

by Ard Biesheuvel

Subject: Re: [RFC V1 5/7] crypto: aesni - AES CTR x86_64 "by16" AVX512 optimization

On Fri, 18 Dec 2020 at 22:08, Megha Dey <[email protected]> wrote:
>
> Introduce the "by16" implementation of the AES CTR mode using AVX512
> optimizations. "by16" means that 16 independent blocks (each block
> being 128 bits) can be ciphered simultaneously as opposed to the
> current 8 blocks.
>
> The glue code in the AESNI module overrides the existing "by8" CTR mode
> encryption/decryption routines with the "by16" ones when the following
> criteria are met:
> At compile time:
> 1. CONFIG_CRYPTO_AVX512 is enabled
> 2. The toolchain (assembler) supports VAES instructions
> At runtime:
> 1. VAES and AVX512VL features are supported on the platform (currently
> only Icelake)
> 2. The aesni_intel.use_avx512 module parameter is set at boot time. For this
> algorithm, switching away from the AVX512 optimized version is not possible
> once set at boot time because of how the code is structured today. (This can
> be changed later if required.)
>
> The functions aes_ctr_enc_128_avx512_by16(), aes_ctr_enc_192_avx512_by16()
> and aes_ctr_enc_256_avx512_by16() are adapted from the Intel Optimized IPSEC
> Cryptographic library.
>
> On an Icelake desktop, with turbo disabled and all CPUs running at maximum
> frequency, the "by16" CTR mode optimization shows better performance
> across data and key sizes as measured by tcrypt.
>
> The average performance improvement of the "by16" version over the "by8"
> version is as follows:
> For all key sizes (128/192/256 bits),
> for data sizes < 128 bytes/block, the change is negligible (~3% loss);
> for data sizes > 128 bytes/block, there is an average improvement of
> 48% for both encryption and decryption.
>
> A typical run of tcrypt with AES CTR mode encryption/decryption using the
> "by8" and "by16" optimizations on an Icelake desktop shows the following
> results:
>
> --------------------------------------------------------------
> | key | bytes | cycles/op (lower is better)| percentage |
> | length | per | encryption | decryption | loss/gain |
> | (bits) | block |-------------------------------------------|
> | | | by8 | by16 | by8 | by16 | enc | dec |
> |------------------------------------------------------------|
> | 128 | 16 | 156 | 168 | 164 | 168 | -7.7 | -2.5 |
> | 128 | 64 | 180 | 190 | 157 | 146 | -5.6 | 7.1 |
> | 128 | 256 | 248 | 158 | 251 | 161 | 36.3 | 35.9 |
> | 128 | 1024 | 633 | 316 | 642 | 319 | 50.1 | 50.4 |
> | 128 | 1472 | 853 | 411 | 877 | 407 | 51.9 | 53.6 |
> | 128 | 8192 | 4463 | 1959 | 4447 | 1940 | 56.2 | 56.4 |
> | 192 | 16 | 136 | 145 | 149 | 166 | -6.7 | -11.5 |
> | 192 | 64 | 159 | 154 | 157 | 160 | 3.2 | -2 |
> | 192 | 256 | 268 | 172 | 274 | 177 | 35.9 | 35.5 |
> | 192 | 1024 | 710 | 358 | 720 | 355 | 49.6 | 50.7 |
> | 192 | 1472 | 989 | 468 | 983 | 469 | 52.7 | 52.3 |
> | 192 | 8192 | 6326 | 3551 | 6301 | 3567 | 43.9 | 43.4 |
> | 256 | 16 | 153 | 165 | 139 | 156 | -7.9 | -12.3 |
> | 256 | 64 | 158 | 152 | 174 | 161 | 3.8 | 7.5 |
> | 256 | 256 | 283 | 176 | 287 | 202 | 37.9 | 29.7 |
> | 256 | 1024 | 797 | 393 | 807 | 395 | 50.7 | 51.1 |
> | 256 | 1472 | 1108 | 534 | 1107 | 527 | 51.9 | 52.4 |
> | 256 | 8192 | 5763 | 2616 | 5773 | 2617 | 54.7 | 54.7 |
> --------------------------------------------------------------
>
> This work was inspired by the AES CTR mode optimization published
> in the Intel Optimized IPSEC Cryptographic library:
> https://github.com/intel/intel-ipsec-mb/blob/master/lib/avx512/cntr_vaes_avx512.asm
>
> Co-developed-by: Tomasz Kantecki <[email protected]>
> Signed-off-by: Tomasz Kantecki <[email protected]>
> Signed-off-by: Megha Dey <[email protected]>
> ---
> arch/x86/crypto/Makefile | 1 +
> arch/x86/crypto/aes_ctrby16_avx512-x86_64.S | 856 ++++++++++++++++++++++++++++
> arch/x86/crypto/aesni-intel_glue.c | 57 +-
> arch/x86/crypto/avx512_vaes_common.S | 422 ++++++++++++++
> arch/x86/include/asm/disabled-features.h | 8 +-
> crypto/Kconfig | 12 +
> 6 files changed, 1354 insertions(+), 2 deletions(-)
> create mode 100644 arch/x86/crypto/aes_ctrby16_avx512-x86_64.S
>
...
> diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
> index ad8a718..f45059e 100644
> --- a/arch/x86/crypto/aesni-intel_glue.c
> +++ b/arch/x86/crypto/aesni-intel_glue.c
> @@ -46,6 +46,10 @@
> #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
> #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
>
> +static bool use_avx512;
> +module_param(use_avx512, bool, 0644);
> +MODULE_PARM_DESC(use_avx512, "Use AVX512 optimized algorithm, if available");
> +
> /* This data is stored at the end of the crypto_tfm struct.
> * It's a type of per "session" data storage location.
> * This needs to be 16 byte aligned.
> @@ -191,6 +195,35 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
> void *keys, u8 *out, unsigned int num_bytes);
> asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
> void *keys, u8 *out, unsigned int num_bytes);
> +
> +#ifdef CONFIG_CRYPTO_AES_CTR_AVX512
> +asmlinkage void aes_ctr_enc_128_avx512_by16(void *keys, u8 *out,
> + const u8 *in,
> + unsigned int num_bytes,
> + u8 *iv);
> +asmlinkage void aes_ctr_enc_192_avx512_by16(void *keys, u8 *out,
> + const u8 *in,
> + unsigned int num_bytes,
> + u8 *iv);
> +asmlinkage void aes_ctr_enc_256_avx512_by16(void *keys, u8 *out,
> + const u8 *in,
> + unsigned int num_bytes,
> + u8 *iv);
> +#else
> +static inline void aes_ctr_enc_128_avx512_by16(void *keys, u8 *out,
> + const u8 *in,
> + unsigned int num_bytes,
> + u8 *iv) {}
> +static inline void aes_ctr_enc_192_avx512_by16(void *keys, u8 *out,
> + const u8 *in,
> + unsigned int num_bytes,
> + u8 *iv) {}
> +static inline void aes_ctr_enc_256_avx512_by16(void *keys, u8 *out,
> + const u8 *in,
> + unsigned int num_bytes,
> + u8 *iv) {}
> +#endif
> +

Please drop these alternatives.
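
For illustration only (this is not code from the posted patch): since the only
caller sits behind an IS_ENABLED(CONFIG_CRYPTO_AES_CTR_AVX512) check, the
compiler discards the dead references when the option is off, so the bare
asmlinkage declarations should be enough on their own and the empty inline
stubs become unnecessary. A minimal sketch:

/*
 * Sketch only: safe without stubs because the sole caller is reached
 * through an IS_ENABLED(CONFIG_CRYPTO_AES_CTR_AVX512) guarded branch,
 * so the references are eliminated when the option is disabled.
 */
asmlinkage void aes_ctr_enc_128_avx512_by16(void *keys, u8 *out, const u8 *in,
					    unsigned int num_bytes, u8 *iv);
asmlinkage void aes_ctr_enc_192_avx512_by16(void *keys, u8 *out, const u8 *in,
					    unsigned int num_bytes, u8 *iv);
asmlinkage void aes_ctr_enc_256_avx512_by16(void *keys, u8 *out, const u8 *in,
					    unsigned int num_bytes, u8 *iv);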

> /*
> * asmlinkage void aesni_gcm_init_avx_gen2()
> * gcm_data *my_ctx_data, context data
> @@ -487,6 +520,23 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
> aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
> }
>
> +static void aesni_ctr_enc_avx512_tfm(struct crypto_aes_ctx *ctx, u8 *out,
> + const u8 *in, unsigned int len, u8 *iv)
> +{
> + /*
> + * based on key length, override with the by16 version
> + * of ctr mode encryption/decryption for improved performance.
> + * aes_set_key_common() ensures that key length is one of
> + * {128,192,256}
> + */
> + if (ctx->key_length == AES_KEYSIZE_128)
> + aes_ctr_enc_128_avx512_by16((void *)ctx, out, in, len, iv);
> + else if (ctx->key_length == AES_KEYSIZE_192)
> + aes_ctr_enc_192_avx512_by16((void *)ctx, out, in, len, iv);
> + else
> + aes_ctr_enc_256_avx512_by16((void *)ctx, out, in, len, iv);
> +}
> +
> static int ctr_crypt(struct skcipher_request *req)
> {
> struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
> @@ -1076,7 +1126,12 @@ static int __init aesni_init(void)
> aesni_gcm_tfm = &aesni_gcm_tfm_sse;
> }
> aesni_ctr_enc_tfm = aesni_ctr_enc;
> - if (boot_cpu_has(X86_FEATURE_AVX)) {
> + if (use_avx512 && IS_ENABLED(CONFIG_CRYPTO_AES_CTR_AVX512) &&
> + cpu_feature_enabled(X86_FEATURE_VAES)) {
> + /* Ctr mode performance optimization using AVX512 */
> + aesni_ctr_enc_tfm = aesni_ctr_enc_avx512_tfm;
> + pr_info("AES CTR mode by16 optimization enabled\n");

This will need to be changed to a static_call_update() once my
outstanding patch is merged.
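A rough sketch of what such a conversion might look like, assuming the
indirect call becomes a static call named aesni_ctr_enc_tfm (the exact
names and shape depend on that patch):

/* needs #include <linux/static_call.h> */
DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc);

	/* in aesni_init(), instead of assigning the function pointer */
	if (use_avx512 && IS_ENABLED(CONFIG_CRYPTO_AES_CTR_AVX512) &&
	    cpu_feature_enabled(X86_FEATURE_VAES))
		static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx512_tfm);

	/* and at the call site in ctr_crypt(), arguments illustrative */
	static_call(aesni_ctr_enc_tfm)(ctx, dst, src, len, iv);

The static call avoids the retpoline overhead of the indirect call on
affected CPUs while still allowing the boot-time selection.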

> + } else if (boot_cpu_has(X86_FEATURE_AVX)) {
> /* optimize performance of ctr mode encryption transform */
> aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
> pr_info("AES CTR mode by8 optimization enabled\n");


2021-01-21 02:10:50

by Megha Dey

Subject: Re: [RFC V1 5/7] crypto: aesni - AES CTR x86_64 "by16" AVX512 optimization

Hi Ard,

On 1/16/2021 9:03 AM, Ard Biesheuvel wrote:
> On Fri, 18 Dec 2020 at 22:08, Megha Dey <[email protected]> wrote:
>> Introduce the "by16" implementation of the AES CTR mode using AVX512
>> optimizations. "by16" means that 16 independent blocks (each block
>> being 128 bits) can be ciphered simultaneously as opposed to the
>> current 8 blocks.
>>
>> The glue code in the AESNI module overrides the existing "by8" CTR mode
>> encryption/decryption routines with the "by16" ones when the following
>> criteria are met:
>> At compile time:
>> 1. CONFIG_CRYPTO_AVX512 is enabled
>> 2. The toolchain (assembler) supports VAES instructions
>> At runtime:
>> 1. VAES and AVX512VL features are supported on the platform (currently
>> only Icelake)
>> 2. The aesni_intel.use_avx512 module parameter is set at boot time. For this
>> algorithm, switching away from the AVX512 optimized version is not possible
>> once set at boot time because of how the code is structured today. (This can
>> be changed later if required.)
>>
>> The functions aes_ctr_enc_128_avx512_by16(), aes_ctr_enc_192_avx512_by16()
>> and aes_ctr_enc_256_avx512_by16() are adapted from the Intel Optimized IPSEC
>> Cryptographic library.
>>
>> On an Icelake desktop, with turbo disabled and all CPUs running at maximum
>> frequency, the "by16" CTR mode optimization shows better performance
>> across data and key sizes as measured by tcrypt.
>>
>> The average performance improvement of the "by16" version over the "by8"
>> version is as follows:
>> For all key sizes (128/192/256 bits),
>> for data sizes < 128 bytes/block, the change is negligible (~3% loss);
>> for data sizes > 128 bytes/block, there is an average improvement of
>> 48% for both encryption and decryption.
>>
>> A typical run of tcrypt with AES CTR mode encryption/decryption using the
>> "by8" and "by16" optimizations on an Icelake desktop shows the following
>> results:
>>
>> --------------------------------------------------------------
>> | key | bytes | cycles/op (lower is better)| percentage |
>> | length | per | encryption | decryption | loss/gain |
>> | (bits) | block |-------------------------------------------|
>> | | | by8 | by16 | by8 | by16 | enc | dec |
>> |------------------------------------------------------------|
>> | 128 | 16 | 156 | 168 | 164 | 168 | -7.7 | -2.5 |
>> | 128 | 64 | 180 | 190 | 157 | 146 | -5.6 | 7.1 |
>> | 128 | 256 | 248 | 158 | 251 | 161 | 36.3 | 35.9 |
>> | 128 | 1024 | 633 | 316 | 642 | 319 | 50.1 | 50.4 |
>> | 128 | 1472 | 853 | 411 | 877 | 407 | 51.9 | 53.6 |
>> | 128 | 8192 | 4463 | 1959 | 4447 | 1940 | 56.2 | 56.4 |
>> | 192 | 16 | 136 | 145 | 149 | 166 | -6.7 | -11.5 |
>> | 192 | 64 | 159 | 154 | 157 | 160 | 3.2 | -2 |
>> | 192 | 256 | 268 | 172 | 274 | 177 | 35.9 | 35.5 |
>> | 192 | 1024 | 710 | 358 | 720 | 355 | 49.6 | 50.7 |
>> | 192 | 1472 | 989 | 468 | 983 | 469 | 52.7 | 52.3 |
>> | 192 | 8192 | 6326 | 3551 | 6301 | 3567 | 43.9 | 43.4 |
>> | 256 | 16 | 153 | 165 | 139 | 156 | -7.9 | -12.3 |
>> | 256 | 64 | 158 | 152 | 174 | 161 | 3.8 | 7.5 |
>> | 256 | 256 | 283 | 176 | 287 | 202 | 37.9 | 29.7 |
>> | 256 | 1024 | 797 | 393 | 807 | 395 | 50.7 | 51.1 |
>> | 256 | 1472 | 1108 | 534 | 1107 | 527 | 51.9 | 52.4 |
>> | 256 | 8192 | 5763 | 2616 | 5773 | 2617 | 54.7 | 54.7 |
>> --------------------------------------------------------------
>>
>> This work was inspired by the AES CTR mode optimization published
>> in the Intel Optimized IPSEC Cryptographic library:
>> https://github.com/intel/intel-ipsec-mb/blob/master/lib/avx512/cntr_vaes_avx512.asm
>>
>> Co-developed-by: Tomasz Kantecki <[email protected]>
>> Signed-off-by: Tomasz Kantecki <[email protected]>
>> Signed-off-by: Megha Dey <[email protected]>
>> ---
>> arch/x86/crypto/Makefile | 1 +
>> arch/x86/crypto/aes_ctrby16_avx512-x86_64.S | 856 ++++++++++++++++++++++++++++
>> arch/x86/crypto/aesni-intel_glue.c | 57 +-
>> arch/x86/crypto/avx512_vaes_common.S | 422 ++++++++++++++
>> arch/x86/include/asm/disabled-features.h | 8 +-
>> crypto/Kconfig | 12 +
>> 6 files changed, 1354 insertions(+), 2 deletions(-)
>> create mode 100644 arch/x86/crypto/aes_ctrby16_avx512-x86_64.S
>>
> ...
>> diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
>> index ad8a718..f45059e 100644
>> --- a/arch/x86/crypto/aesni-intel_glue.c
>> +++ b/arch/x86/crypto/aesni-intel_glue.c
>> @@ -46,6 +46,10 @@
>> #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
>> #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
>>
>> +static bool use_avx512;
>> +module_param(use_avx512, bool, 0644);
>> +MODULE_PARM_DESC(use_avx512, "Use AVX512 optimized algorithm, if available");
>> +
>> /* This data is stored at the end of the crypto_tfm struct.
>> * It's a type of per "session" data storage location.
>> * This needs to be 16 byte aligned.
>> @@ -191,6 +195,35 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
>> void *keys, u8 *out, unsigned int num_bytes);
>> asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
>> void *keys, u8 *out, unsigned int num_bytes);
>> +
>> +#ifdef CONFIG_CRYPTO_AES_CTR_AVX512
>> +asmlinkage void aes_ctr_enc_128_avx512_by16(void *keys, u8 *out,
>> + const u8 *in,
>> + unsigned int num_bytes,
>> + u8 *iv);
>> +asmlinkage void aes_ctr_enc_192_avx512_by16(void *keys, u8 *out,
>> + const u8 *in,
>> + unsigned int num_bytes,
>> + u8 *iv);
>> +asmlinkage void aes_ctr_enc_256_avx512_by16(void *keys, u8 *out,
>> + const u8 *in,
>> + unsigned int num_bytes,
>> + u8 *iv);
>> +#else
>> +static inline void aes_ctr_enc_128_avx512_by16(void *keys, u8 *out,
>> + const u8 *in,
>> + unsigned int num_bytes,
>> + u8 *iv) {}
>> +static inline void aes_ctr_enc_192_avx512_by16(void *keys, u8 *out,
>> + const u8 *in,
>> + unsigned int num_bytes,
>> + u8 *iv) {}
>> +static inline void aes_ctr_enc_256_avx512_by16(void *keys, u8 *out,
>> + const u8 *in,
>> + unsigned int num_bytes,
>> + u8 *iv) {}
>> +#endif
>> +
> Please drop these alternatives.
ok
>
>> /*
>> * asmlinkage void aesni_gcm_init_avx_gen2()
>> * gcm_data *my_ctx_data, context data
>> @@ -487,6 +520,23 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
>> aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
>> }
>>
>> +static void aesni_ctr_enc_avx512_tfm(struct crypto_aes_ctx *ctx, u8 *out,
>> + const u8 *in, unsigned int len, u8 *iv)
>> +{
>> + /*
>> + * based on key length, override with the by16 version
>> + * of ctr mode encryption/decryption for improved performance.
>> + * aes_set_key_common() ensures that key length is one of
>> + * {128,192,256}
>> + */
>> + if (ctx->key_length == AES_KEYSIZE_128)
>> + aes_ctr_enc_128_avx512_by16((void *)ctx, out, in, len, iv);
>> + else if (ctx->key_length == AES_KEYSIZE_192)
>> + aes_ctr_enc_192_avx512_by16((void *)ctx, out, in, len, iv);
>> + else
>> + aes_ctr_enc_256_avx512_by16((void *)ctx, out, in, len, iv);
>> +}
>> +
>> static int ctr_crypt(struct skcipher_request *req)
>> {
>> struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
>> @@ -1076,7 +1126,12 @@ static int __init aesni_init(void)
>> aesni_gcm_tfm = &aesni_gcm_tfm_sse;
>> }
>> aesni_ctr_enc_tfm = aesni_ctr_enc;
>> - if (boot_cpu_has(X86_FEATURE_AVX)) {
>> + if (use_avx512 && IS_ENABLED(CONFIG_CRYPTO_AES_CTR_AVX512) &&
>> + cpu_feature_enabled(X86_FEATURE_VAES)) {
>> + /* Ctr mode performance optimization using AVX512 */
>> + aesni_ctr_enc_tfm = aesni_ctr_enc_avx512_tfm;
>> + pr_info("AES CTR mode by16 optimization enabled\n");
> This will need to be changed to a static_call_update() once my
> outstanding patch is merged.
yeah will do!
>
>> + } else if (boot_cpu_has(X86_FEATURE_AVX)) {
>> /* optimize performance of ctr mode encryption transform */
>> aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
>> pr_info("AES CTR mode by8 optimization enabled\n");