2021-06-10 13:45:39

by Tianjia Zhang

[permalink] [raw]
Subject: [PATCH 3/3] crypto: x86/sm4 - add AES-NI/AVX/x86_64 assembler implementation

This patch adds AES-NI/AVX/x86_64 assembler implementation of SM4
block cipher. Through two affine transforms, we can use the AES
S-Box to simulate the SM4 S-Box to achieve the effect of instruction
acceleration.

Signed-off-by: Tianjia Zhang <[email protected]>
---
arch/x86/crypto/Makefile | 3 +
arch/x86/crypto/sm4-aesni-avx-asm_64.S | 339 +++++++++++++++++++++++++
arch/x86/crypto/sm4_aesni_avx_glue.c | 115 +++++++++
crypto/Kconfig | 29 +++
4 files changed, 486 insertions(+)
create mode 100644 arch/x86/crypto/sm4-aesni-avx-asm_64.S
create mode 100644 arch/x86/crypto/sm4_aesni_avx_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index d0959e7b809f..08f95d4e1e7c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -88,6 +88,9 @@ nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o

obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o

+obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX_X86_64) += sm4-aesni-avx-x86_64.o
+sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o
+
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..c7cbb7e90bec
--- /dev/null
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-04.html
+ *
+ * Copyright (C) 2018 Markku-Juhani O. Saarinen <[email protected]>
+ * Copyright (C) 2020 Jussi Kivilinna <[email protected]>
+ * Copyright (c) 2021 Tianjia Zhang <[email protected]>
+ */
+
+/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
+ * https://github.com/mjosaarinen/sm4ni
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define rRIP (%rip)
+
+#define RX0 %xmm0
+#define RX1 %xmm1
+#define MASK_4BIT %xmm2
+#define RTMP0 %xmm3
+#define RTMP1 %xmm4
+#define RTMP2 %xmm5
+#define RTMP3 %xmm6
+#define RTMP4 %xmm7
+
+#define RA0 %xmm8
+#define RA1 %xmm9
+#define RA2 %xmm10
+#define RA3 %xmm11
+
+#define RB0 %xmm12
+#define RB1 %xmm13
+#define RB2 %xmm14
+#define RB3 %xmm15
+
+#define RNOT %xmm0
+#define RBSWAP %xmm1
+
+
+/* Transpose four 32-bit words between 128-bit vectors. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/* pre-SubByte transform. */
+#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
+ * 'vaeslastenc' instruction.
+ */
+#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandn mask4bit, x, tmp0; \
+ vpsrld $4, x, x; \
+ vpand x, mask4bit, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/*
+ * Following four affine transform look-up tables are from work by
+ * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
+ *
+ * These allow exposing SM4 S-Box from AES SubByte.
+ */
+
+/* pre-SubByte affine transform, from SM4 field to AES field. */
+.Lpre_tf_lo_s:
+ .quad 0x9197E2E474720701, 0xC7C1B4B222245157
+.Lpre_tf_hi_s:
+ .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
+
+/* post-SubByte affine transform, from AES field to SM4 field. */
+.Lpost_tf_lo_s:
+ .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
+.Lpost_tf_hi_s:
+ .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_8:
+ .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
+ .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
+
+/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_16:
+ .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
+ .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
+
+/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
+.Linv_shift_row_rol_24:
+ .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
+ .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+
+.text
+.align 16
+
+/*
+ * void sm4_aesni_avx_expand_key(const u8 *key, u32 *rk_enc,
+ * u32 *rk_dec, const u32 *fk, const u32 *ck);
+ */
+SYM_FUNC_START(sm4_aesni_avx_expand_key)
+ /* input:
+ * %rdi: 128-bit key
+ * %rsi: rkey_enc
+ * %rdx: rkey_dec
+ * %rcx: fk array
+ * %r8: ck array
+ */
+ FRAME_BEGIN
+
+ vmovd 0*4(%rdi), RA0;
+ vmovd 1*4(%rdi), RA1;
+ vmovd 2*4(%rdi), RA2;
+ vmovd 3*4(%rdi), RA3;
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vmovd 0*4(%rcx), RB0;
+ vmovd 1*4(%rcx), RB1;
+ vmovd 2*4(%rcx), RB2;
+ vmovd 3*4(%rcx), RB3;
+ vpxor RB0, RA0, RA0;
+ vpxor RB1, RA1, RA1;
+ vpxor RB2, RA2, RA2;
+ vpxor RB3, RA3, RA3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
+ vmovdqa .Lpre_tf_hi_s rRIP, RB0;
+ vmovdqa .Lpost_tf_lo_s rRIP, RB1;
+ vmovdqa .Lpost_tf_hi_s rRIP, RB2;
+ vmovdqa .Linv_shift_row rRIP, RB3;
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vbroadcastss (4*(round))(%r8), RX0; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RB3, RX0, RX0; \
+ vpxor RX0, s0, s0; /* s0 ^ x */ \
+ vpslld $13, RX0, RTMP0; \
+ vpsrld $19, RX0, RTMP1; \
+ vpslld $23, RX0, RTMP2; \
+ vpsrld $9, RX0, RTMP3; \
+ vpxor RTMP0, RTMP1, RTMP1; \
+ vpxor RTMP2, RTMP3, RTMP3; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,13) */ \
+ vpxor RTMP3, s0, s0; /* s0 ^ x ^ rol(x,13) ^ rol(x,23) */
+
+ leaq (32*4)(%r8), %rax;
+ leaq (32*4)(%rdx), %rdx;
+.align 16
+.Lroundloop_expand_key:
+ leaq (-4*4)(%rdx), %rdx;
+ ROUND(0, RA0, RA1, RA2, RA3);
+ ROUND(1, RA1, RA2, RA3, RA0);
+ ROUND(2, RA2, RA3, RA0, RA1);
+ ROUND(3, RA3, RA0, RA1, RA2);
+ leaq (4*4)(%r8), %r8;
+ vmovd RA0, (0*4)(%rsi);
+ vmovd RA1, (1*4)(%rsi);
+ vmovd RA2, (2*4)(%rsi);
+ vmovd RA3, (3*4)(%rsi);
+ vmovd RA0, (3*4)(%rdx);
+ vmovd RA1, (2*4)(%rdx);
+ vmovd RA2, (1*4)(%rdx);
+ vmovd RA3, (0*4)(%rdx);
+ leaq (4*4)(%rsi), %rsi;
+ cmpq %rax, %r8;
+ jne .Lroundloop_expand_key;
+
+#undef ROUND
+
+ vzeroall;
+ FRAME_END
+ ret;
+SYM_FUNC_END(sm4_aesni_avx_expand_key)
+
+
+/*
+ * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ * const u8 *src, int nblocks)
+ */
+SYM_FUNC_START(sm4_aesni_avx_crypt4)
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..4 blocks)
+ * %rdx: src (1..4 blocks)
+ * %rcx: num blocks (1..4)
+ */
+ FRAME_BEGIN
+
+ vmovdqu 0*16(%rdx), RA0;
+ vmovdqa RA0, RA1;
+ vmovdqa RA0, RA2;
+ vmovdqa RA0, RA3;
+ cmpq $2, %rcx;
+ jb .Lblk4_load_input_done;
+ vmovdqu 1*16(%rdx), RA1;
+ je .Lblk4_load_input_done;
+ vmovdqu 2*16(%rdx), RA2;
+ cmpq $3, %rcx;
+ je .Lblk4_load_input_done;
+ vmovdqu 3*16(%rdx), RA3;
+
+.Lblk4_load_input_done:
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
+ vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
+ vmovdqa .Lpre_tf_hi_s rRIP, RB0;
+ vmovdqa .Lpost_tf_lo_s rRIP, RB1;
+ vmovdqa .Lpost_tf_hi_s rRIP, RB2;
+ vmovdqa .Linv_shift_row rRIP, RB3;
+ vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
+ vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vbroadcastss (4*(round))(%rdi), RX0; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0); \
+ vaesenclast MASK_4BIT, RX0, RX0; \
+ transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0); \
+ \
+ /* linear part */ \
+ vpshufb RB3, RX0, RTMP0; \
+ vpxor RTMP0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP2, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */ \
+ vpshufb RTMP3, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpxor RTMP1, s0, s0;
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk4:
+ ROUND(0, RA0, RA1, RA2, RA3);
+ ROUND(1, RA1, RA2, RA3, RA0);
+ ROUND(2, RA2, RA3, RA0, RA1);
+ ROUND(3, RA3, RA0, RA1, RA2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk4;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+
+ vmovdqu RA0, 0*16(%rsi);
+ cmpq $2, %rcx;
+ jb .Lblk4_store_output_done;
+ vmovdqu RA1, 1*16(%rsi);
+ je .Lblk4_store_output_done;
+ vmovdqu RA2, 2*16(%rsi);
+ cmpq $3, %rcx;
+ je .Lblk4_store_output_done;
+ vmovdqu RA3, 3*16(%rsi);
+
+.Lblk4_store_output_done:
+ vzeroall;
+ FRAME_END
+ ret;
+SYM_FUNC_END(sm4_aesni_avx_crypt4)
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
new file mode 100644
index 000000000000..3f822fa1070a
--- /dev/null
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SM4 Cipher Algorithm, AES-NI/AVX optimized.
+ * as specified in
+ * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-04.html
+ *
+ * Copyright (c) 2021 Tianjia Zhang <[email protected]>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <asm/simd.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sm4.h>
+
+asmlinkage void sm4_aesni_avx_expand_key(const u8 *key, u32 *rk_enc,
+ u32 *rk_dec, const u32 *fk, const u32 *ck);
+asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
+ const u8 *src, int nblocks);
+
+static int sm4_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (key_len != SM4_KEY_SIZE)
+ return -EINVAL;
+
+ if (crypto_simd_usable()) {
+ kernel_fpu_begin();
+ sm4_aesni_avx_expand_key(in_key, ctx->rkey_enc,
+ ctx->rkey_dec, crypto_sm4_fk, crypto_sm4_ck);
+ kernel_fpu_end();
+ } else
+ crypto_sm4_expand_key(ctx, in_key, key_len);
+
+ return 0;
+}
+
+static void sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (crypto_simd_usable()) {
+ kernel_fpu_begin();
+ sm4_aesni_avx_crypt4(ctx->rkey_enc, out, in, 1);
+ kernel_fpu_end();
+ } else
+ crypto_sm4_do_crypt(ctx->rkey_enc, out, in);
+}
+
+static void sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+ const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ if (crypto_simd_usable()) {
+ kernel_fpu_begin();
+ sm4_aesni_avx_crypt4(ctx->rkey_dec, out, in, 1);
+ kernel_fpu_end();
+ } else
+ crypto_sm4_do_crypt(ctx->rkey_dec, out, in);
+}
+
+static struct crypto_alg sm4_asm_alg = {
+ .cra_name = "sm4",
+ .cra_driver_name = "sm4-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = SM4_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_sm4_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = SM4_KEY_SIZE,
+ .cia_max_keysize = SM4_KEY_SIZE,
+ .cia_setkey = sm4_setkey,
+ .cia_encrypt = sm4_encrypt,
+ .cia_decrypt = sm4_decrypt
+ }
+ }
+};
+
+static int __init sm4_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return crypto_register_alg(&sm4_asm_alg);
+}
+
+static void __exit sm4_exit(void)
+{
+ crypto_unregister_alg(&sm4_asm_alg);
+}
+
+module_init(sm4_init);
+module_exit(sm4_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Tianjia Zhang <[email protected]>");
+MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS_CRYPTO("sm4");
+MODULE_ALIAS_CRYPTO("sm4-asm");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 4fbc9c080ca9..9f639395c667 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1570,6 +1570,35 @@ config CRYPTO_SM4

If unsure, say N.

+config CRYPTO_SM4_AESNI_AVX_X86_64
+ tristate "SM4 cipher algorithm (x86_64/AES-NI/AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_ALGAPI
+ select CRYPTO_LIB_SM4
+ help
+ SM4 cipher algorithms (OSCCA GB/T 32907-2016) (x86_64/AES-NI/AVX).
+
+ SM4 (GBT.32907-2016) is a cryptographic standard issued by the
+ Organization of State Commercial Administration of China (OSCCA)
+ as an authorized cryptographic algorithms for the use within China.
+
+ SMS4 was originally created for use in protecting wireless
+ networks, and is mandated in the Chinese National Standard for
+ Wireless LAN WAPI (Wired Authentication and Privacy Infrastructure)
+ (GB.15629.11-2003).
+
+ The latest SM4 standard (GBT.32907-2016) was proposed by OSCCA and
+ standardized through TC 260 of the Standardization Administration
+ of the People's Republic of China (SAC).
+
+ The input, output, and key of SMS4 are each 128 bits.
+
+ See also: <https://eprint.iacr.org/2008/329.pdf>
+
+ If unsure, say N.
+
config CRYPTO_TEA
tristate "TEA, XTEA and XETA cipher algorithms"
depends on CRYPTO_USER_API_ENABLE_OBSOLETE
--
2.19.1.3.ge56e4f7


2021-06-10 23:29:27

by Eric Biggers

[permalink] [raw]
Subject: Re: [PATCH 3/3] crypto: x86/sm4 - add AES-NI/AVX/x86_64 assembler implementation

On Thu, Jun 10, 2021 at 09:44:59PM +0800, Tianjia Zhang wrote:
> This patch adds AES-NI/AVX/x86_64 assembler implementation of SM4
> block cipher. Through two affine transforms, we can use the AES
> S-Box to simulate the SM4 S-Box to achieve the effect of instruction
> acceleration.
>

Benchmark results, please.

Also, is this passing the self-tests, including the fuzz tests?

> +/*
> + * void sm4_aesni_avx_expand_key(const u8 *key, u32 *rk_enc,
> + * u32 *rk_dec, const u32 *fk, const u32 *ck);
> + */
> +SYM_FUNC_START(sm4_aesni_avx_expand_key)
> + /* input:
> + * %rdi: 128-bit key
> + * %rsi: rkey_enc
> + * %rdx: rkey_dec
> + * %rcx: fk array
> + * %r8: ck array
> + */
> + FRAME_BEGIN

Key expansion isn't performance-critical. Can the C library version be used, or
does the key need to be expanded in a way specific to this x86 implementation?

> +/*
> + * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
> + * const u8 *src, int nblocks)
> + */
> +SYM_FUNC_START(sm4_aesni_avx_crypt4)
> + /* input:
> + * %rdi: round key array, CTX
> + * %rsi: dst (1..4 blocks)
> + * %rdx: src (1..4 blocks)
> + * %rcx: num blocks (1..4)
> + */
> + FRAME_BEGIN
[...]

> +static void sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
> +{
> + const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
> +
> + if (crypto_simd_usable()) {
> + kernel_fpu_begin();
> + sm4_aesni_avx_crypt4(ctx->rkey_enc, out, in, 1);
> + kernel_fpu_end();
> + } else
> + crypto_sm4_do_crypt(ctx->rkey_enc, out, in);
> +}
> +
> +static void sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
> +{
> + const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
> +
> + if (crypto_simd_usable()) {
> + kernel_fpu_begin();
> + sm4_aesni_avx_crypt4(ctx->rkey_dec, out, in, 1);
> + kernel_fpu_end();
> + } else
> + crypto_sm4_do_crypt(ctx->rkey_dec, out, in);
> +}

Your assembly code appears to handle encrypting up to 4 blocks at a time.
However you have only wired this up to the "cipher" API which does 1 block at a
time. Is this intentional?

What are your performance results with real-world chaining modes like XTS, and
do you plan to implement any of these modes directly?

> +
> +static struct crypto_alg sm4_asm_alg = {
> + .cra_name = "sm4",
> + .cra_driver_name = "sm4-asm",

In arch/x86/crypto/, "-asm" usually means a vanilla x86 assembly implementation
without any AES-NI, SSE, AVX, etc. instructions. Calling this something like
"sm4-aesni-avx" would make more sense. (Or is it actually avx2, not avx?)

> +config CRYPTO_SM4_AESNI_AVX_X86_64
> + tristate "SM4 cipher algorithm (x86_64/AES-NI/AVX)"
> + depends on X86 && 64BIT
> + select CRYPTO_SKCIPHER
> + select CRYPTO_SIMD
> + select CRYPTO_ALGAPI
> + select CRYPTO_LIB_SM4

As-is, neither CRYPTO_SKCIPHER nor CRYPTO_SIMD needs to be selected here.

> + help
> + SM4 cipher algorithms (OSCCA GB/T 32907-2016) (x86_64/AES-NI/AVX).
> +
> + SM4 (GBT.32907-2016) is a cryptographic standard issued by the
> + Organization of State Commercial Administration of China (OSCCA)
> + as an authorized cryptographic algorithms for the use within China.
> +
> + SMS4 was originally created for use in protecting wireless
> + networks, and is mandated in the Chinese National Standard for
> + Wireless LAN WAPI (Wired Authentication and Privacy Infrastructure)
> + (GB.15629.11-2003).
> +
> + The latest SM4 standard (GBT.32907-2016) was proposed by OSCCA and
> + standardized through TC 260 of the Standardization Administration
> + of the People's Republic of China (SAC).
> +
> + The input, output, and key of SMS4 are each 128 bits.
> +
> + See also: <https://eprint.iacr.org/2008/329.pdf>
> +
> + If unsure, say N.

This is the help text for the x86 implementation specifically. Please don't
have boilerplate text about the algorithm here; that already exists for the
generic implementation. The text should explain about the x86 implementation.

- Eric

2021-06-13 10:16:18

by Tianjia Zhang

[permalink] [raw]
Subject: Re: [PATCH 3/3] crypto: x86/sm4 - add AES-NI/AVX/x86_64 assembler implementation

Hi Eric,

On 6/11/21 7:27 AM, Eric Biggers wrote:
> On Thu, Jun 10, 2021 at 09:44:59PM +0800, Tianjia Zhang wrote:
>> This patch adds AES-NI/AVX/x86_64 assembler implementation of SM4
>> block cipher. Through two affine transforms, we can use the AES
>> S-Box to simulate the SM4 S-Box to achieve the effect of instruction
>> acceleration.
>>
>
> Benchmark results, please.
>
> Also, is this passing the self-tests, including the fuzz tests?
>

I will provide this information in the next version.

>> +/*
>> + * void sm4_aesni_avx_expand_key(const u8 *key, u32 *rk_enc,
>> + * u32 *rk_dec, const u32 *fk, const u32 *ck);
>> + */
>> +SYM_FUNC_START(sm4_aesni_avx_expand_key)
>> + /* input:
>> + * %rdi: 128-bit key
>> + * %rsi: rkey_enc
>> + * %rdx: rkey_dec
>> + * %rcx: fk array
>> + * %r8: ck array
>> + */
>> + FRAME_BEGIN
>
> Key expansion isn't performance-critical. Can the C library version be used, or
> does the key need to be expanded in a way specific to this x86 implementation?
>

It can be replaced by a common implementation of C library. For expand
key that are not called frequently, the optimization of a specific
instruction set does not bring much benefit. Of course, it is possible
to delete this implementation.

>> +/*
>> + * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
>> + * const u8 *src, int nblocks)
>> + */
>> +SYM_FUNC_START(sm4_aesni_avx_crypt4)
>> + /* input:
>> + * %rdi: round key array, CTX
>> + * %rsi: dst (1..4 blocks)
>> + * %rdx: src (1..4 blocks)
>> + * %rcx: num blocks (1..4)
>> + */
>> + FRAME_BEGIN
> [...]
>
>> +static void sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>> +{
>> + const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
>> +
>> + if (crypto_simd_usable()) {
>> + kernel_fpu_begin();
>> + sm4_aesni_avx_crypt4(ctx->rkey_enc, out, in, 1);
>> + kernel_fpu_end();
>> + } else
>> + crypto_sm4_do_crypt(ctx->rkey_enc, out, in);
>> +}
>> +
>> +static void sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
>> +{
>> + const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
>> +
>> + if (crypto_simd_usable()) {
>> + kernel_fpu_begin();
>> + sm4_aesni_avx_crypt4(ctx->rkey_dec, out, in, 1);
>> + kernel_fpu_end();
>> + } else
>> + crypto_sm4_do_crypt(ctx->rkey_dec, out, in);
>> +}
>
> Your assembly code appears to handle encrypting up to 4 blocks at a time.
> However you have only wired this up to the "cipher" API which does 1 block at a
> time. Is this intentional?
>
> What are your performance results with real-world chaining modes like XTS, and
> do you plan to implement any of these modes directly?
>

This implementation is intentional. First, a general block encryption is
provided. There is no obvious performance improvement in this
implementation. The key to optimization is to make full use of parallel
four blocks encryption at a time. This is still under development, and I
will continue to implement things like XTS in the future. Optimization
of such specific modes.

>> +
>> +static struct crypto_alg sm4_asm_alg = {
>> + .cra_name = "sm4",
>> + .cra_driver_name = "sm4-asm",
>
> In arch/x86/crypto/, "-asm" usually means a vanilla x86 assembly implementation
> without any AES-NI, SSE, AVX, etc. instructions. Calling this something like
> "sm4-aesni-avx" would make more sense. (Or is it actually avx2, not avx?)
>

will do in next version patch.

>> +config CRYPTO_SM4_AESNI_AVX_X86_64
>> + tristate "SM4 cipher algorithm (x86_64/AES-NI/AVX)"
>> + depends on X86 && 64BIT
>> + select CRYPTO_SKCIPHER
>> + select CRYPTO_SIMD
>> + select CRYPTO_ALGAPI
>> + select CRYPTO_LIB_SM4
>
> As-is, neither CRYPTO_SKCIPHER nor CRYPTO_SIMD needs to be selected here.
>

ditto.

>> + help
>> + SM4 cipher algorithms (OSCCA GB/T 32907-2016) (x86_64/AES-NI/AVX).
>> +
>> + SM4 (GBT.32907-2016) is a cryptographic standard issued by the
>> + Organization of State Commercial Administration of China (OSCCA)
>> + as an authorized cryptographic algorithms for the use within China.
>> +
>> + SMS4 was originally created for use in protecting wireless
>> + networks, and is mandated in the Chinese National Standard for
>> + Wireless LAN WAPI (Wired Authentication and Privacy Infrastructure)
>> + (GB.15629.11-2003).
>> +
>> + The latest SM4 standard (GBT.32907-2016) was proposed by OSCCA and
>> + standardized through TC 260 of the Standardization Administration
>> + of the People's Republic of China (SAC).
>> +
>> + The input, output, and key of SMS4 are each 128 bits.
>> +
>> + See also: <https://eprint.iacr.org/2008/329.pdf>
>> +
>> + If unsure, say N.
>
> This is the help text for the x86 implementation specifically. Please don't
> have boilerplate text about the algorithm here; that already exists for the
> generic implementation. The text should explain about the x86 implementation.
>

ditto.

> - Eric
>

Thanks for your suggestion.

Cheers,
Tianjia