2022-08-26 05:42:13

by Taehee Yoo

Subject: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler implementation of aria cipher

The implementation is based on the 32-bit implementation of ARIA.
The aria-avx processing steps are also similar to those of camellia-avx:
1. Byteslice (16-way)
2. Add round key
3. S-box
4. Diffusion layer

Except for the s-box step, all steps are the same as in the aria-generic
implementation. The s-box step is very similar to the camellia and
sm4 implementations.

ARIA uses four s-boxes, two of which are the same as the AES s-boxes,
so the basic strategy is to reuse AES-NI.

To calculate the first s-box, the code uses aesenclast and then undoes
ShiftRows. Nothing more is needed because the first s-box is identical
to the AES encryption s-box.

To calculate the second s-box (the inverse of S1), it uses aesdeclast
and then undoes ShiftRows. Nothing more is needed because this s-box is
identical to the AES decryption s-box.

To calculate the third and fourth s-boxes, it uses aesenclast, undoes
ShiftRows, and then applies an affine transformation.
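
The affine transformation is applied with the vpshufb-based filter_8bit
macro in this patch. As a rough scalar sketch of what that macro computes
per byte (the helper name below is illustrative only, not kernel code):

#include <stdint.h>

/*
 * An 8-bit affine map y = A*x ^ c over GF(2) splits by nibbles: with
 * lo_t[i] = A*i ^ c and hi_t[i] = A*(i << 4), the result is the XOR of
 * two 16-entry lookups.  vpshufb performs these lookups 16 bytes at a time.
 */
static uint8_t filter_8bit_scalar(uint8_t x, const uint8_t lo_t[16],
                                  const uint8_t hi_t[16])
{
        return lo_t[x & 0x0f] ^ hi_t[x >> 4];
}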

The aria-generic implementation is based on a 32-bit implementation,
not an 8-bit one. The aria-avx diffusion layer follows the aria-generic
implementation because the 8-bit form does not parallelize well, while
the 32-bit form does.
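
As a reference for that mapping, here is a minimal scalar sketch of the
word-level step described by the comments in the aria_diff_m macro below
(illustrative only, not code taken from aria-generic):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned int n)
{
        return (x >> n) | (x << (32 - n));
}

/* Each output byte of the word becomes the XOR of the other three bytes. */
static uint32_t aria_diff_m_word(uint32_t x)
{
        uint32_t t = rotr32(x, 8);      /* T = rotr32(X, 8) */

        x ^= t;                         /* X ^= T */
        return t ^ rotr32(x, 16);       /* X = T ^ rotr32(X, 16) */
}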

Signed-off-by: Taehee Yoo <[email protected]>
---

v2:
- Do not call non-FPU functions (aria_{encrypt|decrypt}()) in the
FPU context.
- Do not hold the FPU context for too long.

arch/x86/crypto/Makefile | 3 +
arch/x86/crypto/aria-aesni-avx-asm_64.S | 648 ++++++++++++++++++++++++
arch/x86/crypto/aria_aesni_avx_glue.c | 165 ++++++
crypto/Kconfig | 21 +
4 files changed, 837 insertions(+)
create mode 100644 arch/x86/crypto/aria-aesni-avx-asm_64.S
create mode 100644 arch/x86/crypto/aria_aesni_avx_glue.c

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 04d07ab744b2..3b1d701a4f6c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -100,6 +100,9 @@ sm4-aesni-avx-x86_64-y := sm4-aesni-avx-asm_64.o sm4_aesni_avx_glue.o
obj-$(CONFIG_CRYPTO_SM4_AESNI_AVX2_X86_64) += sm4-aesni-avx2-x86_64.o
sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o

+obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) += aria-aesni-avx-x86_64.o
+aria-aesni-avx-x86_64-y := aria-aesni-avx-asm_64.o aria_aesni_avx_glue.o
+
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..3d01f5229f72
--- /dev/null
+++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
@@ -0,0 +1,648 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARIA Cipher 16-way parallel algorithm (AVX)
+ *
+ * Copyright (c) 2022 Taehee Yoo <[email protected]>
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(c0, d0, a0, b0, d2, d3); \
+ transpose_4x4(c1, d1, a1, b1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(c2, d2, a2, b2, b0, b1); \
+ transpose_4x4(c3, d3, a3, b3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ rio) \
+ vmovdqu (0 * 16)(rio), x0; \
+ vmovdqu (1 * 16)(rio), x1; \
+ vmovdqu (2 * 16)(rio), x2; \
+ vmovdqu (3 * 16)(rio), x3; \
+ vmovdqu (4 * 16)(rio), x4; \
+ vmovdqu (5 * 16)(rio), x5; \
+ vmovdqu (6 * 16)(rio), x6; \
+ vmovdqu (7 * 16)(rio), x7; \
+ vmovdqu (8 * 16)(rio), y0; \
+ vmovdqu (9 * 16)(rio), y1; \
+ vmovdqu (10 * 16)(rio), y2; \
+ vmovdqu (11 * 16)(rio), y3; \
+ vmovdqu (12 * 16)(rio), y4; \
+ vmovdqu (13 * 16)(rio), y5; \
+ vmovdqu (14 * 16)(rio), y6; \
+ vmovdqu (15 * 16)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab); \
+ vmovdqu y0, 0 * 16(mem_cd); \
+ vmovdqu y1, 1 * 16(mem_cd); \
+ vmovdqu y2, 2 * 16(mem_cd); \
+ vmovdqu y3, 3 * 16(mem_cd); \
+ vmovdqu y4, 4 * 16(mem_cd); \
+ vmovdqu y5, 5 * 16(mem_cd); \
+ vmovdqu y6, 6 * 16(mem_cd); \
+ vmovdqu y7, 7 * 16(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack16(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ debyteslice_16x16b(y0, y4, x0, x4, \
+ y1, y5, x1, x5, \
+ y2, y6, x2, x6, \
+ y3, y7, x3, x7, \
+ (mem_ab), (mem_cd)); \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab); \
+ vmovdqu y0, 8 * 16(mem_ab); \
+ vmovdqu y1, 9 * 16(mem_ab); \
+ vmovdqu y2, 10 * 16(mem_ab); \
+ vmovdqu y3, 11 * 16(mem_ab); \
+ vmovdqu y4, 12 * 16(mem_ab); \
+ vmovdqu y5, 13 * 16(mem_ab); \
+ vmovdqu y6, 14 * 16(mem_ab); \
+ vmovdqu y7, 15 * 16(mem_ab); \
+
+#define aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
+ vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
+ vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
+ vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
+ vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
+ vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
+ vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
+ vmovdqu x7, ((idx + 7) * 16)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
+ vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
+ vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
+ vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
+ vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
+ vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
+ vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
+ vmovdqu ((idx + 7) * 16)(mem_tmp), x7;
+
+#define aria_ark_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, rk, idx, round) \
+ /* AddRoundKey */ \
+ vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
+ vpxor t0, x0, x0; \
+ vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
+ vpxor t0, x1, x1; \
+ vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
+ vpxor t0, x2, x2; \
+ vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
+ vpxor t0, x3, x3; \
+ vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
+ vpxor t0, x4, x4; \
+ vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
+ vpxor t0, x5, x5; \
+ vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
+ vpxor t0, x6, x6; \
+ vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
+ vpxor t0, x7, x7;
+
+#define aria_sbox_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpxor t0, t0, t0; \
+ vaesenclast t0, x0, x0; \
+ vaesenclast t0, x4, x4; \
+ vaesenclast t0, x1, x1; \
+ vaesenclast t0, x5, x5; \
+ vaesdeclast t0, x2, x2; \
+ vaesdeclast t0, x6, x6; \
+ \
+ /* AES inverse shift rows */ \
+ vmovdqa .Linv_shift_row, t0; \
+ vmovdqa .Lshift_row, t1; \
+ vpshufb t0, x0, x0; \
+ vpshufb t0, x4, x4; \
+ vpshufb t0, x1, x1; \
+ vpshufb t0, x5, x5; \
+ vpshufb t0, x3, x3; \
+ vpshufb t0, x7, x7; \
+ vpshufb t1, x2, x2; \
+ vpshufb t1, x6, x6; \
+ \
+ vmovdqa .Linv_lo, t0; \
+ vmovdqa .Linv_hi, t1; \
+ vmovdqa .Ltf_lo_s2, t2; \
+ vmovdqa .Ltf_hi_s2, t3; \
+ vmovdqa .Ltf_lo_x2, t4; \
+ vmovdqa .Ltf_hi_x2, t5; \
+ vbroadcastss .L0f0f0f0f, t6; \
+ \
+ /* extract multiplicative inverse */ \
+ filter_8bit(x1, t0, t1, t6, t7); \
+ /* affine transformation for S2 */ \
+ filter_8bit(x1, t2, t3, t6, t7); \
+ /* extract multiplicative inverse */ \
+ filter_8bit(x5, t0, t1, t6, t7); \
+ /* affine transformation for S2 */ \
+ filter_8bit(x5, t2, t3, t6, t7); \
+ \
+ /* affine transformation for X2 */ \
+ filter_8bit(x3, t4, t5, t6, t7); \
+ vpxor t7, t7, t7; \
+ vaesenclast t7, x3, x3; \
+ /* extract multiplicative inverse */ \
+ filter_8bit(x3, t0, t1, t6, t7); \
+ /* affine transformation for X2 */ \
+ filter_8bit(x7, t4, t5, t6, t7); \
+ vpxor t7, t7, t7; \
+ vaesenclast t7, x7, x7; \
+ /* extract multiplicative inverse */ \
+ filter_8bit(x7, t0, t1, t6, t7);
+
+#define aria_diff_m(x0, x1, x2, x3, \
+ t0, t1, t2, t3) \
+ /* T = rotr32(X, 8); */ \
+ /* X ^= T */ \
+ vpxor x0, x3, t0; \
+ vpxor x1, x0, t1; \
+ vpxor x2, x1, t2; \
+ vpxor x3, x2, t3; \
+ /* X = T ^ rotr(X, 16); */ \
+ vpxor t2, x0, x0; \
+ vpxor x1, t3, t3; \
+ vpxor t0, x2, x2; \
+ vpxor t1, x3, x1; \
+ vmovdqu t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7) \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7; \
+ \
+ /* t2 ^= t3; */ \
+ vpxor y4, y0, y0; \
+ vpxor y5, y1, y1; \
+ vpxor y6, y2, y2; \
+ vpxor y7, y3, y3; \
+ \
+ /* t0 ^= t1; */ \
+ vpxor x4, x0, x0; \
+ vpxor x5, x1, x1; \
+ vpxor x6, x2, x2; \
+ vpxor x7, x3, x3; \
+ \
+ /* t3 ^= t1; */ \
+ vpxor x4, y4, y4; \
+ vpxor x5, y5, y5; \
+ vpxor x6, y6, y6; \
+ vpxor x7, y7, y7; \
+ \
+ /* t2 ^= t0; */ \
+ vpxor x0, y0, y0; \
+ vpxor x1, y1, y1; \
+ vpxor x2, y2, y2; \
+ vpxor x3, y3, y3; \
+ \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7;
+
+#define aria_fe(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+.Lshift_row:
+ .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+ .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* extract multiplicative inverse from subByte(x) */
+.Linv_lo:
+ .byte 0x05, 0x4f, 0x91, 0xdb, 0x2c, 0x66, 0xb8, 0xf2
+ .byte 0x57, 0x1d, 0xc3, 0x89, 0x7e, 0x34, 0xea, 0xa0
+.Linv_hi:
+ .byte 0x00, 0xa4, 0x49, 0xed, 0x92, 0x36, 0xdb, 0x7f
+ .byte 0x25, 0x81, 0x6c, 0xc8, 0xb7, 0x13, 0xfe, 0x5a
+.Ltf_lo_s2:
+ .byte 0xe2, 0x4e, 0x1f, 0xb3, 0x24, 0x88, 0xd9, 0x75
+ .byte 0x61, 0xcd, 0x9c, 0x30, 0xa7, 0x0b, 0x5a, 0xf6
+.Ltf_hi_s2:
+ .byte 0x00, 0x26, 0xa7, 0x81, 0xfb, 0xdd, 0x5c, 0x7a
+ .byte 0x5f, 0x79, 0xf8, 0xde, 0xa4, 0x82, 0x03, 0x25
+.Ltf_lo_x2:
+ .byte 0x2c, 0xf4, 0x14, 0xcc, 0x56, 0x8e, 0x6e, 0xb6
+ .byte 0xed, 0x35, 0xd5, 0x0d, 0x97, 0x4f, 0xaf, 0x77
+.Ltf_hi_x2:
+ .byte 0x00, 0x75, 0x52, 0x27, 0xae, 0xdb, 0xfc, 0x89
+ .byte 0xe8, 0x9d, 0xba, 0xcf, 0x46, 0x33, 0x14, 0x61
+
+/* 4-bit mask */
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+.align 8
+SYM_FUNC_START(aria_aesni_avx_crypt_16way)
+ /* input:
+ * %rdi: rk
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: rounds
+ */
+
+ FRAME_BEGIN
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx);
+
+ movq %rsi, %rax;
+ leaq 8 * 16(%rax), %r8;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r8);
+ aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %rdi, 0);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 1);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %rdi, 2);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 3);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %rdi, 4);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 5);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %rdi, 6);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 7);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %rdi, 8);
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 9);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %rdi, 10);
+ cmp $12, %rcx;
+ jne .Laria192;
+ aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 11, 12);
+ jmp .Laria_end;
+.Laria192:
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 11);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %rdi, 12);
+ cmp $14, %rcx;
+ jne .Laria256;
+ aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 13, 14);
+ jmp .Laria_end;
+.Laria256:
+ aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 13);
+ aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
+ %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rax, %rdi, 14);
+ aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rdi, 15, 16);
+.Laria_end:
+ outunpack16(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %r8);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx_crypt_16way)
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
new file mode 100644
index 000000000000..535b4dac5b1c
--- /dev/null
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Glue Code for the AVX assembler implementation of the ARIA Cipher
+ *
+ * Copyright (c) 2022 Taehee Yoo <[email protected]>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/aria.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+
+asmlinkage void aria_aesni_avx_crypt_16way(const u32 *rk, u8 *dst,
+ const u8 *src, int rounds);
+
+static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct aria_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ kernel_fpu_begin();
+ while (nbytes >= ARIA_AVX_BLOCK_SIZE) {
+ aria_aesni_avx_crypt_16way(rkey, dst, src, ctx->rounds);
+ dst += ARIA_AVX_BLOCK_SIZE;
+ src += ARIA_AVX_BLOCK_SIZE;
+ nbytes -= ARIA_AVX_BLOCK_SIZE;
+ }
+ kernel_fpu_end();
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ aria_encrypt(ctx, dst, src);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ struct aria_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= ARIA_AVX_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_aesni_avx_crypt_16way(rkey, dst, src, ctx->rounds);
+ kernel_fpu_end();
+ dst += ARIA_AVX_BLOCK_SIZE;
+ src += ARIA_AVX_BLOCK_SIZE;
+ nbytes -= ARIA_AVX_BLOCK_SIZE;
+ }
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ aria_decrypt(ctx, dst, src);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int aria_avx_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_encrypt(req, ctx->enc_key[0]);
+}
+
+static int aria_avx_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_decrypt(req, ctx->dec_key[0]);
+}
+
+static int aria_avx_set_key(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return aria_set_key(&tfm->base, key, keylen);
+}
+
+static struct skcipher_alg aria_algs[] = {
+ {
+ .base.cra_name = "__ecb(aria)",
+ .base.cra_driver_name = "__ecb-aria-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = ARIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .setkey = aria_avx_set_key,
+ .encrypt = aria_avx_ecb_encrypt,
+ .decrypt = aria_avx_ecb_decrypt,
+ }
+};
+
+static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
+
+static int __init aria_avx_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(aria_algs,
+ ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+static void __exit aria_avx_exit(void)
+{
+ simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+module_init(aria_avx_init);
+module_exit(aria_avx_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <[email protected]>");
+MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX/AES-NI optimized");
+MODULE_ALIAS_CRYPTO("aria");
+MODULE_ALIAS_CRYPTO("aria-aesni-avx");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index b1ccf873779d..cd63ea83ddd7 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1659,6 +1659,27 @@ config CRYPTO_ARIA
See also:
<https://seed.kisa.or.kr/kisa/algorithm/EgovAriaInfo.do>

+config CRYPTO_ARIA_AESNI_AVX_X86_64
+ tristate "ARIA cipher algorithm (x86_64/AES-NI/AVX)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_ARIA
+ select CRYPTO_SIMD
+ help
+ ARIA cipher algorithm (RFC5794).
+
+ ARIA is a standard encryption algorithm of the Republic of Korea.
+ ARIA supports three key sizes, each with its own number of rounds:
+ 128-bit: 12 rounds.
+ 192-bit: 14 rounds.
+ 256-bit: 16 rounds.
+
+ This module provides the ARIA cipher algorithm that processes
+ sixteen blocks in parallel using the AVX instruction set.
+
+ See also:
+ <https://seed.kisa.or.kr/kisa/algorithm/EgovAriaInfo.do>
+
config CRYPTO_SERPENT
tristate "Serpent cipher algorithm"
select CRYPTO_ALGAPI
--
2.17.1


Subject: RE: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler implementation of aria cipher



> -----Original Message-----
> From: Taehee Yoo <[email protected]>
> Sent: Friday, August 26, 2022 12:32 AM
> Subject: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler
> implementation of aria cipher
>
> v2:
> - Do not call non-FPU functions(aria_{encrypt | decrypt}() in the
> FPU context.
> - Do not acquire FPU context for too long.

...
> +static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
> +{
...
> + while ((nbytes = walk.nbytes) > 0) {
> + const u8 *src = walk.src.virt.addr;
> + u8 *dst = walk.dst.virt.addr;
> +
> + kernel_fpu_begin();
> + while (nbytes >= ARIA_AVX_BLOCK_SIZE) {
> + aria_aesni_avx_crypt_16way(rkey, dst, src, ctx->rounds);
> + dst += ARIA_AVX_BLOCK_SIZE;
> + src += ARIA_AVX_BLOCK_SIZE;
> + nbytes -= ARIA_AVX_BLOCK_SIZE;
> + }
> + kernel_fpu_end();

Per Herbert's reply on the sha512-avx RCU stall issue, another nesting
level might be necessary, limiting the amount of data processed between
each kernel_fpu_begin()/kernel_fpu_end() pair to 4 KiB.

If you modify this driver to use the ECB_WALK_START, ECB_BLOCK, and
ECB_WALK_END macros from ecb_cbc_helpers.h and incorporate that fix,
then your fix would be easy to replicate to the other users (camellia,
cast5, cast6, serpent, and twofish).
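
For illustration, a minimal sketch of one way to do that while keeping the
open-coded walk (FPU_BYTES_LIMIT is a name invented for this sketch, not an
existing kernel symbol; the ECB_* helpers may be the cleaner route):

#define FPU_BYTES_LIMIT 4096U

static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
{
        struct aria_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
        struct skcipher_walk walk;
        unsigned int nbytes;
        int err;

        err = skcipher_walk_virt(&walk, req, false);

        while ((nbytes = walk.nbytes) > 0) {
                const u8 *src = walk.src.virt.addr;
                u8 *dst = walk.dst.virt.addr;

                while (nbytes >= ARIA_AVX_BLOCK_SIZE) {
                        /* Cap each FPU section, rounded down to 16-way blocks. */
                        unsigned int chunk = min(nbytes, FPU_BYTES_LIMIT);

                        chunk -= chunk % ARIA_AVX_BLOCK_SIZE;
                        nbytes -= chunk;

                        kernel_fpu_begin();
                        while (chunk >= ARIA_AVX_BLOCK_SIZE) {
                                aria_aesni_avx_crypt_16way(rkey, dst, src,
                                                           ctx->rounds);
                                dst += ARIA_AVX_BLOCK_SIZE;
                                src += ARIA_AVX_BLOCK_SIZE;
                                chunk -= ARIA_AVX_BLOCK_SIZE;
                        }
                        kernel_fpu_end();
                }
                while (nbytes >= ARIA_BLOCK_SIZE) {
                        aria_encrypt(ctx, dst, src);
                        dst += ARIA_BLOCK_SIZE;
                        src += ARIA_BLOCK_SIZE;
                        nbytes -= ARIA_BLOCK_SIZE;
                }

                err = skcipher_walk_done(&walk, nbytes);
        }

        return err;
}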

2022-08-27 02:57:20

by Eric Biggers

Subject: Re: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler implementation of aria cipher

On Fri, Aug 26, 2022 at 05:31:30AM +0000, Taehee Yoo wrote:
> +static struct skcipher_alg aria_algs[] = {
> + {
> + .base.cra_name = "__ecb(aria)",
> + .base.cra_driver_name = "__ecb-aria-avx",
> + .base.cra_priority = 400,
> + .base.cra_flags = CRYPTO_ALG_INTERNAL,
> + .base.cra_blocksize = ARIA_BLOCK_SIZE,
> + .base.cra_ctxsize = sizeof(struct aria_ctx),
> + .base.cra_module = THIS_MODULE,
> + .min_keysize = ARIA_MIN_KEY_SIZE,
> + .max_keysize = ARIA_MAX_KEY_SIZE,
> + .setkey = aria_avx_set_key,
> + .encrypt = aria_avx_ecb_encrypt,
> + .decrypt = aria_avx_ecb_decrypt,
> + }
> +};

Why do you want ECB mode and nothing else? At
https://lore.kernel.org/r/[email protected]
you claimed that the use case for ARIA support in the kernel is kTLS.

So you are using ECB mode in TLS?

- Eric

2022-08-27 06:26:14

by Taehee Yoo

Subject: Re: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler implementation of aria cipher

Hi Elliott, Robert
Thanks for your review!

On 2022. 8. 27. 12:12 AM, Elliott, Robert (Servers) wrote:
>
>
>> -----Original Message-----
>> From: Taehee Yoo <[email protected]>
>> Sent: Friday, August 26, 2022 12:32 AM
>> Subject: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler
>> implementation of aria cipher
>>
>> v2:
>> - Do not call non-FPU functions(aria_{encrypt | decrypt}() in the
>> FPU context.
>> - Do not acquire FPU context for too long.
>
> ...
>> +static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
>> +{
> ...
>> + while ((nbytes = walk.nbytes) > 0) {
>> + const u8 *src = walk.src.virt.addr;
>> + u8 *dst = walk.dst.virt.addr;
>> +
>> + kernel_fpu_begin();
>> + while (nbytes >= ARIA_AVX_BLOCK_SIZE) {
>> + aria_aesni_avx_crypt_16way(rkey, dst, src, ctx->rounds);
>> + dst += ARIA_AVX_BLOCK_SIZE;
>> + src += ARIA_AVX_BLOCK_SIZE;
>> + nbytes -= ARIA_AVX_BLOCK_SIZE;
>> + }
>> + kernel_fpu_end();
>
> Per Herbert's reply on the sha512-avx RCU stall issue, another nesting
> level might be necessary limiting the amount of data processed between
> each kernel_fpu_begin() to kernel_fpu_end() pair to 4 KiB.
>
> If you modify this driver to use the ECB_WALK_START, ECB_BLOCK, and
> ECB_WALK_END macros from ecb_cbc_helpers.h and incorporate that fix,
> then your fix would be easy to replicate into the other users (camellia,
> cast5, cast6, serpent, and twofish).
>

Now I understand why you suggested using the ECB macros instead of
open-coded loops. I think your idea is nice for many users such as
camellia, cast5, etc.

I will use the ECB macros in the v3 patch.
Then, I will send a patch fixing the RCU stall issue separately instead
of including it here.

Thanks a lot!
Taehee Yoo

2022-08-27 06:36:13

by Eric Biggers

Subject: Re: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler implementation of aria cipher

On Sat, Aug 27, 2022 at 03:30:55PM +0900, Taehee Yoo wrote:
> Hi Eric,
> Thanks for your review!
>
> > On 2022. 8. 27. 11:46 AM, Eric Biggers wrote:
> > On Fri, Aug 26, 2022 at 05:31:30AM +0000, Taehee Yoo wrote:
> >> +static struct skcipher_alg aria_algs[] = {
> >> + {
> >> + .base.cra_name = "__ecb(aria)",
> >> + .base.cra_driver_name = "__ecb-aria-avx",
> >> + .base.cra_priority = 400,
> >> + .base.cra_flags = CRYPTO_ALG_INTERNAL,
> >> + .base.cra_blocksize = ARIA_BLOCK_SIZE,
> >> + .base.cra_ctxsize = sizeof(struct aria_ctx),
> >> + .base.cra_module = THIS_MODULE,
> >> + .min_keysize = ARIA_MIN_KEY_SIZE,
> >> + .max_keysize = ARIA_MAX_KEY_SIZE,
> >> + .setkey = aria_avx_set_key,
> >> + .encrypt = aria_avx_ecb_encrypt,
> >> + .decrypt = aria_avx_ecb_decrypt,
> >> + }
> >> +};
> >
> > Why do you want ECB mode and nothing else? At
> > https://lore.kernel.org/r/[email protected]
> > you claimed that the use case for ARIA support in the kernel is kTLS.
> >
> > So you are using ECB mode in TLS?
> >
>
> aria-ktls only uses GCM mode.
> So, ECB will not be used by ktls.
>
> My plan is to implement the GCM aria-avx eventually.
> ECB implementation will be a basic block of aria-avx.
> I think it can be used by gcm(aria).
> So, I will implement gcm mode of aria with this implementation.
>
> If this plan is not good, please let me know.
> If so, I will change my plan :)

GCM uses CTR mode, not ECB mode.

- Eric

2022-08-27 06:44:01

by Taehee Yoo

Subject: Re: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler implementation of aria cipher

Hi Eric,
Thanks for your review!

On 2022. 8. 27. 11:46 AM, Eric Biggers wrote:
> On Fri, Aug 26, 2022 at 05:31:30AM +0000, Taehee Yoo wrote:
>> +static struct skcipher_alg aria_algs[] = {
>> + {
>> + .base.cra_name = "__ecb(aria)",
>> + .base.cra_driver_name = "__ecb-aria-avx",
>> + .base.cra_priority = 400,
>> + .base.cra_flags = CRYPTO_ALG_INTERNAL,
>> + .base.cra_blocksize = ARIA_BLOCK_SIZE,
>> + .base.cra_ctxsize = sizeof(struct aria_ctx),
>> + .base.cra_module = THIS_MODULE,
>> + .min_keysize = ARIA_MIN_KEY_SIZE,
>> + .max_keysize = ARIA_MAX_KEY_SIZE,
>> + .setkey = aria_avx_set_key,
>> + .encrypt = aria_avx_ecb_encrypt,
>> + .decrypt = aria_avx_ecb_decrypt,
>> + }
>> +};
>
> Why do you want ECB mode and nothing else? At
> https://lore.kernel.org/r/[email protected]
> you claimed that the use case for ARIA support in the kernel is kTLS.
>
> So you are using ECB mode in TLS?
>

aria-ktls only uses GCM mode,
so ECB will not be used by kTLS.

My plan is to implement GCM for aria-avx eventually.
The ECB implementation will be a building block of aria-avx,
and I think it can be reused by gcm(aria).
So, I will implement GCM mode of ARIA on top of this implementation.

If this plan is not good, please let me know.
If so, I will change my plan :)

Thanks a lot!
Taehee Yoo

> - Eric

2022-08-27 06:56:48

by Taehee Yoo

Subject: Re: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler implementation of aria cipher



On 2022. 8. 27. 3:35 PM, Eric Biggers wrote:
> On Sat, Aug 27, 2022 at 03:30:55PM +0900, Taehee Yoo wrote:
>> Hi Eric,
>> Thanks for your review!
>>
> On 2022. 8. 27. 11:46 AM, Eric Biggers wrote:
>>> On Fri, Aug 26, 2022 at 05:31:30AM +0000, Taehee Yoo wrote:
>>>> +static struct skcipher_alg aria_algs[] = {
>>>> + {
>>>> + .base.cra_name = "__ecb(aria)",
>>>> + .base.cra_driver_name = "__ecb-aria-avx",
>>>> + .base.cra_priority = 400,
>>>> + .base.cra_flags = CRYPTO_ALG_INTERNAL,
>>>> + .base.cra_blocksize = ARIA_BLOCK_SIZE,
>>>> + .base.cra_ctxsize = sizeof(struct aria_ctx),
>>>> + .base.cra_module = THIS_MODULE,
>>>> + .min_keysize = ARIA_MIN_KEY_SIZE,
>>>> + .max_keysize = ARIA_MAX_KEY_SIZE,
>>>> + .setkey = aria_avx_set_key,
>>>> + .encrypt = aria_avx_ecb_encrypt,
>>>> + .decrypt = aria_avx_ecb_decrypt,
>>>> + }
>>>> +};
>>>
>>> Why do you want ECB mode and nothing else? At
>>> https://lore.kernel.org/r/[email protected]
>>> you claimed that the use case for ARIA support in the kernel is kTLS.
>>>
>>> So you are using ECB mode in TLS?
>>>
>>
>> aria-ktls only uses GCM mode.
>> So, ECB will not be used by ktls.
>>
>> My plan is to implement the GCM aria-avx eventually.
>> ECB implementation will be a basic block of aria-avx.
>> I think it can be used by gcm(aria).
>> So, I will implement gcm mode of aria with this implementation.
>>
>> If this plan is not good, please let me know.
>> If so, I will change my plan :)
>
> GCM uses CTR mode, not ECB mode.
>

Thanks for that.
I will implement CTR and include it in the v3 patch.

Thanks a lot!
Taehee Yoo

2022-09-01 20:02:52

by Jussi Kivilinna

Subject: Re: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler implementation of aria cipher

Hello,

On 26.8.2022 8.31, Taehee Yoo wrote:
> +#define aria_sbox_8way(x0, x1, x2, x3, \
> + x4, x5, x6, x7, \
> + t0, t1, t2, t3, \
> + t4, t5, t6, t7) \
> + vpxor t0, t0, t0; \
> + vaesenclast t0, x0, x0; \
> + vaesenclast t0, x4, x4; \
> + vaesenclast t0, x1, x1; \
> + vaesenclast t0, x5, x5; \
> + vaesdeclast t0, x2, x2; \
> + vaesdeclast t0, x6, x6; \
> + \
> + /* AES inverse shift rows */ \
> + vmovdqa .Linv_shift_row, t0; \
> + vmovdqa .Lshift_row, t1; \
> + vpshufb t0, x0, x0; \
> + vpshufb t0, x4, x4; \
> + vpshufb t0, x1, x1; \
> + vpshufb t0, x5, x5; \
> + vpshufb t0, x3, x3; \
> + vpshufb t0, x7, x7; \
> + vpshufb t1, x2, x2; \
> + vpshufb t1, x6, x6; \
> + \
> + vmovdqa .Linv_lo, t0; \
> + vmovdqa .Linv_hi, t1; \
> + vmovdqa .Ltf_lo_s2, t2; \
> + vmovdqa .Ltf_hi_s2, t3; \
> + vmovdqa .Ltf_lo_x2, t4; \
> + vmovdqa .Ltf_hi_x2, t5; \
> + vbroadcastss .L0f0f0f0f, t6; \
> + \
> + /* extract multiplicative inverse */ \
> + filter_8bit(x1, t0, t1, t6, t7); \
> + /* affine transformation for S2 */ \
> + filter_8bit(x1, t2, t3, t6, t7); \

Here's room for improvement. These two affine transformations
could be combined into a single filter_8bit...

> + /* extract multiplicative inverse */ \
> + filter_8bit(x5, t0, t1, t6, t7); \
> + /* affine transformation for S2 */ \
> + filter_8bit(x5, t2, t3, t6, t7); \
> + \
> + /* affine transformation for X2 */ \
> + filter_8bit(x3, t4, t5, t6, t7); \
> + vpxor t7, t7, t7; \
> + vaesenclast t7, x3, x3; \
> + /* extract multiplicative inverse */ \
> + filter_8bit(x3, t0, t1, t6, t7); \
> + /* affine transformation for X2 */ \
> + filter_8bit(x7, t4, t5, t6, t7); \
> + vpxor t7, t7, t7; \
> + vaesenclast t7, x7, x7; \
> + /* extract multiplicative inverse */ \
> + filter_8bit(x7, t0, t1, t6, t7);

... and these two filter_8bit calls could be replaced with
one operation if 'vaesenclast' were changed to 'vaesdeclast'.
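
To see why this folding is valid: applying y = M1*x ^ c1 and then
z = M2*y ^ c2 over GF(2) gives z = (M2*M1)*x ^ (M2*c1 ^ c2), i.e. a
single affine map again. Below is a small standalone check of that
identity, using arbitrary example matrices rather than the real
AES/ARIA constants:

#include <stdint.h>
#include <stdio.h>

/* y_i = parity(row_i(M) & x): multiply an 8x8 GF(2) bit matrix by a byte. */
static uint8_t gf2_matvec(const uint8_t m[8], uint8_t x)
{
        uint8_t y = 0;

        for (int i = 0; i < 8; i++)
                y |= (uint8_t)(__builtin_parity(m[i] & x) << i);
        return y;
}

static uint8_t affine(const uint8_t m[8], uint8_t c, uint8_t x)
{
        return gf2_matvec(m, x) ^ c;
}

/* Fold (M2, c2) applied after (M1, c1) into (M2*M1, M2*c1 ^ c2). */
static void affine_compose(const uint8_t m2[8], uint8_t c2,
                           const uint8_t m1[8], uint8_t c1,
                           uint8_t m[8], uint8_t *c)
{
        for (int i = 0; i < 8; i++) {
                m[i] = 0;
                for (int j = 0; j < 8; j++) {
                        uint8_t col = 0;

                        for (int k = 0; k < 8; k++)     /* column j of M1 */
                                col |= (uint8_t)(((m1[k] >> j) & 1) << k);
                        m[i] |= (uint8_t)(__builtin_parity(m2[i] & col) << j);
                }
        }
        *c = gf2_matvec(m2, c1) ^ c2;
}

int main(void)
{
        /* arbitrary example matrices/constants, not the real AES/ARIA ones */
        const uint8_t m1[8] = { 0xf1, 0xe3, 0xc7, 0x8f, 0x1f, 0x3e, 0x7c, 0xf8 };
        const uint8_t m2[8] = { 0x05, 0x09, 0xf9, 0x25, 0xf4, 0x01, 0xb5, 0x8f };
        uint8_t m[8], c;

        affine_compose(m2, 0x05, m1, 0x63, m, &c);
        for (int x = 0; x < 256; x++)
                if (affine(m2, 0x05, affine(m1, 0x63, (uint8_t)x)) !=
                    affine(m, c, (uint8_t)x))
                        return puts("mismatch"), 1;
        return puts("one combined affine == two chained affines"), 0;
}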

With these optimizations, 'aria_sbox_8way' would look like this:

/////////////////////////////////////////////////////////
#define aria_sbox_8way(x0, x1, x2, x3, \
x4, x5, x6, x7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
vpxor t7, t7, t7; \
vmovdqa .Linv_shift_row, t0; \
vmovdqa .Lshift_row, t1; \
vpbroadcastd .L0f0f0f0f, t6; \
vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
\
vaesenclast t7, x0, x0; \
vaesenclast t7, x4, x4; \
vaesenclast t7, x1, x1; \
vaesenclast t7, x5, x5; \
vaesdeclast t7, x2, x2; \
vaesdeclast t7, x6, x6; \
\
/* AES inverse shift rows */ \
vpshufb t0, x0, x0; \
vpshufb t0, x4, x4; \
vpshufb t0, x1, x1; \
vpshufb t0, x5, x5; \
vpshufb t1, x3, x3; \
vpshufb t1, x7, x7; \
vpshufb t1, x2, x2; \
vpshufb t1, x6, x6; \
\
/* affine transformation for S2 */ \
filter_8bit(x1, t2, t3, t6, t0); \
/* affine transformation for S2 */ \
filter_8bit(x5, t2, t3, t6, t0); \
\
/* affine transformation for X2 */ \
filter_8bit(x3, t4, t5, t6, t0); \
/* affine transformation for X2 */ \
filter_8bit(x7, t4, t5, t6, t0); \
vaesdeclast t7, x3, x3; \
vaesdeclast t7, x7, x7;

/* AES inverse affine and S2 combined:
* 1 1 0 0 0 0 0 1 x0 0
* 0 1 0 0 1 0 0 0 x1 0
* 1 1 0 0 1 1 1 1 x2 0
* 0 1 1 0 1 0 0 1 x3 1
* 0 1 0 0 1 1 0 0 * x4 + 0
* 0 1 0 1 1 0 0 0 x5 0
* 0 0 0 0 0 1 0 1 x6 0
* 1 1 1 0 0 1 1 1 x7 1
*/
.Ltf_lo__inv_aff__and__s2:
.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
* 1 0 1 1 0 0 0 1 x0 0
* 0 1 1 1 1 0 1 1 x1 0
* 0 0 0 1 1 0 1 0 x2 1
* 0 1 0 0 0 1 0 0 x3 0
* 0 0 1 1 1 0 1 1 * x4 + 0
* 0 1 0 0 1 0 0 0 x5 0
* 1 1 0 1 0 0 1 1 x6 0
* 0 1 0 0 1 0 1 0 x7 0
*/
.Ltf_lo__x2__and__fwd_aff:
.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
.octa 0x3F893781E95FE1576CDA64D2BA0CB204
/////////////////////////////////////////////////////////

I quickly tested the above in userspace against aria-generic
and your original aria-avx implementation, and the output matches
both references. In a quick-and-dirty benchmark, function
execution time was ~30% faster on AMD Zen3 and ~20% faster
on Intel Tiger Lake.

-Jussi

2022-09-02 08:34:54

by Taehee Yoo

Subject: Re: [PATCH v2 2/3] crypto: aria-avx: add AES-NI/AVX/x86_64 assembler implementation of aria cipher

Hi Jussi,
Thank you so much for this work!

On 9/2/22 04:51, Jussi Kivilinna wrote:
> Hello,
>
> On 26.8.2022 8.31, Taehee Yoo wrote:
>> +#define aria_sbox_8way(x0, x1, x2, x3, \
>> + x4, x5, x6, x7, \
>> + t0, t1, t2, t3, \
>> + t4, t5, t6, t7) \
>> + vpxor t0, t0, t0; \
>> + vaesenclast t0, x0, x0; \
>> + vaesenclast t0, x4, x4; \
>> + vaesenclast t0, x1, x1; \
>> + vaesenclast t0, x5, x5; \
>> + vaesdeclast t0, x2, x2; \
>> + vaesdeclast t0, x6, x6; \
>> + \
>> + /* AES inverse shift rows */ \
>> + vmovdqa .Linv_shift_row, t0; \
>> + vmovdqa .Lshift_row, t1; \
>> + vpshufb t0, x0, x0; \
>> + vpshufb t0, x4, x4; \
>> + vpshufb t0, x1, x1; \
>> + vpshufb t0, x5, x5; \
>> + vpshufb t0, x3, x3; \
>> + vpshufb t0, x7, x7; \
>> + vpshufb t1, x2, x2; \
>> + vpshufb t1, x6, x6; \
>> + \
>> + vmovdqa .Linv_lo, t0; \
>> + vmovdqa .Linv_hi, t1; \
>> + vmovdqa .Ltf_lo_s2, t2; \
>> + vmovdqa .Ltf_hi_s2, t3; \
>> + vmovdqa .Ltf_lo_x2, t4; \
>> + vmovdqa .Ltf_hi_x2, t5; \
>> + vbroadcastss .L0f0f0f0f, t6; \
>> + \
>> + /* extract multiplicative inverse */ \
>> + filter_8bit(x1, t0, t1, t6, t7); \
>> + /* affine transformation for S2 */ \
>> + filter_8bit(x1, t2, t3, t6, t7); \
>
> Here's room for improvement. These two affine transformations
> could be combined into single filter_8bit...
>
>> + /* extract multiplicative inverse */ \
>> + filter_8bit(x5, t0, t1, t6, t7); \
>> + /* affine transformation for S2 */ \
>> + filter_8bit(x5, t2, t3, t6, t7); \
>> + \
>> + /* affine transformation for X2 */ \
>> + filter_8bit(x3, t4, t5, t6, t7); \
>> + vpxor t7, t7, t7; \
>> + vaesenclast t7, x3, x3; \
>> + /* extract multiplicative inverse */ \
>> + filter_8bit(x3, t0, t1, t6, t7); \
>> + /* affine transformation for X2 */ \
>> + filter_8bit(x7, t4, t5, t6, t7); \
>> + vpxor t7, t7, t7; \
>> + vaesenclast t7, x7, x7; \
>> + /* extract multiplicative inverse */ \
>> + filter_8bit(x7, t0, t1, t6, t7);
>
> ... as well as these two filter_8bit could be replaced with
> one operation if 'vaesenclast' would be changed to 'vaesdeclast'.
>
> With these optimizations, 'aria_sbox_8way' would look like this:
>
> /////////////////////////////////////////////////////////
> #define aria_sbox_8way(x0, x1, x2, x3, \
> x4, x5, x6, x7, \
> t0, t1, t2, t3, \
> t4, t5, t6, t7) \
> vpxor t7, t7, t7; \
> vmovdqa .Linv_shift_row, t0; \
> vmovdqa .Lshift_row, t1; \
> vpbroadcastd .L0f0f0f0f, t6; \
> vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
> vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
> vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
> vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
> \
> vaesenclast t7, x0, x0; \
> vaesenclast t7, x4, x4; \
> vaesenclast t7, x1, x1; \
> vaesenclast t7, x5, x5; \
> vaesdeclast t7, x2, x2; \
> vaesdeclast t7, x6, x6; \
> \
> /* AES inverse shift rows */ \
> vpshufb t0, x0, x0; \
> vpshufb t0, x4, x4; \
> vpshufb t0, x1, x1; \
> vpshufb t0, x5, x5; \
> vpshufb t1, x3, x3; \
> vpshufb t1, x7, x7; \
> vpshufb t1, x2, x2; \
> vpshufb t1, x6, x6; \
> \
> /* affine transformation for S2 */ \
> filter_8bit(x1, t2, t3, t6, t0); \
> /* affine transformation for S2 */ \
> filter_8bit(x5, t2, t3, t6, t0); \
> \
> /* affine transformation for X2 */ \
> filter_8bit(x3, t4, t5, t6, t0); \
> /* affine transformation for X2 */ \
> filter_8bit(x7, t4, t5, t6, t0); \
> vaesdeclast t7, x3, x3; \
> vaesdeclast t7, x7, x7;
>
> /* AES inverse affine and S2 combined:
> * 1 1 0 0 0 0 0 1 x0 0
> * 0 1 0 0 1 0 0 0 x1 0
> * 1 1 0 0 1 1 1 1 x2 0
> * 0 1 1 0 1 0 0 1 x3 1
> * 0 1 0 0 1 1 0 0 * x4 + 0
> * 0 1 0 1 1 0 0 0 x5 0
> * 0 0 0 0 0 1 0 1 x6 0
> * 1 1 1 0 0 1 1 1 x7 1
> */
> .Ltf_lo__inv_aff__and__s2:
> .octa 0x92172DA81A9FA520B2370D883ABF8500
> .Ltf_hi__inv_aff__and__s2:
> .octa 0x2B15FFC1AF917B45E6D8320C625CB688
>
> /* X2 and AES forward affine combined:
> * 1 0 1 1 0 0 0 1 x0 0
> * 0 1 1 1 1 0 1 1 x1 0
> * 0 0 0 1 1 0 1 0 x2 1
> * 0 1 0 0 0 1 0 0 x3 0
> * 0 0 1 1 1 0 1 1 * x4 + 0
> * 0 1 0 0 1 0 0 0 x5 0
> * 1 1 0 1 0 0 1 1 x6 0
> * 0 1 0 0 1 0 1 0 x7 0
> */
> .Ltf_lo__x2__and__fwd_aff:
> .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
> .Ltf_hi__x2__and__fwd_aff:
> .octa 0x3F893781E95FE1576CDA64D2BA0CB204
> /////////////////////////////////////////////////////////
>
> I tested above quickly in userspace against aria-generic
> and your original aria-avx implementation and output matches
> to these references. In quick and dirty benchmark, function
> execution time was ~30% faster on AMD Zen3 and ~20% faster
> on Intel tiger-lake.

I tested your implementation.
It works very well and, as you mentioned, it improves performance significantly!
Before:
128bit 4096bytes: 14758 cycles
After:
128bit 4096bytes: 11972 cycles

I will apply your implementation in the v3 patch!
Thank you so much!

Taehee Yoo