2018-11-21 03:01:00

by Martin Willi

Subject: [PATCH 0/3] crypto: x86/chacha20 - AVX-512VL block functions

In the quest for pushing the limits of chacha20 encryption for both IPsec
and WireGuard, this small series adds AVX-512VL block functions. The VL
variant works on 256-bit ymm registers, but unlike AVX2 it can benefit
from the new AVX-512 instructions.
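
The main win is the native 32-bit vector rotate: with AVX2, every ChaCha
rotation costs a shift/shift/or sequence (or a byte shuffle for the
byte-aligned amounts), whereas AVX-512VL's vprold does it in a single
instruction, also on ymm registers. A rough intrinsics sketch of the
difference (illustrative only; the patches use hand-written assembly):

  #include <immintrin.h>

  /* AVX2: no vector rotate, emulate rotl32 by 16 with two shifts and an OR
   * (the in-kernel AVX2 code uses vpshufb for the byte-aligned cases). */
  static inline __m256i rotl16_avx2(__m256i v)
  {
          return _mm256_or_si256(_mm256_slli_epi32(v, 16),
                                 _mm256_srli_epi32(v, 16));
  }

  /* AVX-512VL: vprold rotates the 32-bit lanes directly, one instruction
   * (compile with -mavx512vl). */
  static inline __m256i rotl16_avx512vl(__m256i v)
  {
          return _mm256_rol_epi32(v, 16);
  }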

Compared to the AVX2 version, these block functions bring an overall
speed improvement across encryption lengths of ~20%. Below are the tcrypt
results for additional block sizes in kOps/s, for the current AVX2
code path, the new AVX-512VL code path, and the comparison to Zinc in
AVX2 and AVX-512VL. All numbers are from a Xeon Platinum 8168 (2.7GHz).

These numbers result in a very nice chart, available at:
https://download.strongswan.org/misc/chacha-avx-512vl.svg
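
As a rough sense of scale: at 1496-byte requests the in-kernel AVX-512VL
path does 1963 kOps/s, i.e. about 1496 * 1963000 ~= 2.9 GB/s of keystream
XOR, versus roughly 2.2 GB/s for the AVX2 path at 1477 kOps/s.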

len    avx2   512vl   zinc-avx2   zinc-512vl
8 5719 5672 5468 5612
16 5675 5627 5355 5621
24 5687 5601 5322 5633
32 5667 5622 5244 5564
40 5603 5582 5337 5578
48 5638 5539 5400 5556
56 5624 5566 5375 5482
64 5590 5573 5352 5531
72 4841 5467 3365 3457
80 5316 5761 3310 3381
88 4798 5470 3239 3343
96 5324 5723 3197 3281
104 4819 5460 3155 3232
112 5266 5749 3020 3195
120 4776 5391 2959 3145
128 5291 5723 3398 3489
136 4122 4837 3321 3423
144 4507 5057 3247 3389
152 4139 4815 3233 3329
160 4482 5043 3159 3256
168 4142 4766 3131 3224
176 4506 5028 3073 3162
184 4119 4772 3010 3109
192 4499 5016 3402 3502
200 4127 4766 3329 3448
208 4452 5012 3276 3371
216 4128 4744 3243 3334
224 4484 5008 3203 3298
232 4103 4772 3141 3237
240 4458 4963 3115 3217
248 4121 4751 3085 3177
256 4461 4987 3364 4046
264 3406 4282 3270 4006
272 3408 4287 3207 3961
280 3371 4271 3203 3825
288 3625 4301 3129 3751
296 3402 4283 3093 3688
304 3401 4247 3062 3637
312 3382 4282 2995 3614
320 3611 4279 3305 4070
328 3386 4260 3276 3968
336 3369 4288 3171 3929
344 3389 4289 3134 3847
352 3609 4266 3127 3720
360 3355 4252 3076 3692
368 3387 4264 3048 3650
376 3387 4238 2967 3553
384 3568 4265 3277 4035
392 3369 4262 3299 3973
400 3362 4235 3239 3899
408 3352 4269 3196 3843
416 3585 4243 3127 3736
424 3364 4216 3092 3672
432 3341 4246 3067 3628
440 3353 4235 3018 3593
448 3538 4245 3327 4035
456 3322 4244 3275 3900
464 3340 4237 3212 3880
472 3330 4242 3054 3802
480 3530 4234 3078 3707
488 3337 4228 3094 3664
496 3330 4223 3015 3591
504 3317 4214 3002 3517
512 3531 4197 3339 4016
520 2511 3101 2030 2682
528 2627 3087 2027 2641
536 2508 3102 2001 2601
544 2638 3090 1964 2564
552 2494 3077 1962 2516
560 2625 3064 1941 2515
568 2500 3086 1922 2493
576 2611 3074 2050 2689
584 2482 3062 2041 2680
592 2595 3074 2026 2644
600 2470 3060 1985 2595
608 2581 3039 1961 2555
616 2478 3062 1956 2521
624 2587 3066 1930 2493
632 2457 3053 1923 2486
640 2581 3050 2059 2712
648 2296 2839 2024 2655
656 2389 2845 2019 2642
664 2292 2842 2002 2610
672 2404 2838 1959 2537
680 2273 2827 1956 2527
688 2389 2840 1938 2510
696 2280 2837 1911 2463
704 2370 2819 2055 2702
712 2277 2834 2029 2663
720 2369 2829 2020 2625
728 2255 2820 2001 2600
736 2373 2819 1958 2543
744 2269 2827 1956 2524
752 2364 2817 1937 2492
760 2270 2805 1909 2483
768 2378 2820 2050 2696
776 2053 2700 2002 2643
784 2066 2693 1922 2640
792 2065 2703 1928 2602
800 2138 2706 1962 2535
808 2065 2679 1938 2528
816 2063 2699 1929 2500
824 2053 2676 1915 2468
832 2149 2692 2036 2693
840 2055 2689 2024 2659
848 2049 2689 2006 2610
856 2057 2702 1979 2585
864 2144 2703 1960 2547
872 2047 2685 1945 2501
880 2055 2683 1902 2497
888 2060 2689 1897 2478
896 2139 2693 2023 2663
904 2049 2686 1970 2644
912 2055 2688 1925 2621
920 2047 2685 1911 2572
928 2114 2695 1907 2545
936 2055 2681 1927 2492
944 2055 2693 1930 2478
952 2042 2688 1909 2471
960 2136 2682 2014 2672
968 2054 2687 1999 2626
976 2040 2682 1982 2598
984 2055 2687 1943 2569
992 2138 2694 1884 2522
1000 2036 2681 1929 2506
1008 2052 2676 1926 2475
1016 2050 2686 1889 2430
1024 2125 2670 2039 2656
1032 1717 2175 1470 1995
1040 1768 2186 1456 1983
1048 1704 2185 1451 1950
1056 1770 2176 1410 1927
1064 1710 2178 1418 1918
1072 1753 2168 1394 1892
1080 1696 2170 1400 1892
1088 1761 2174 1472 2014
1096 1681 2158 1464 1968
1104 1746 2172 1457 1978
1112 1689 2167 1445 1955
1120 1738 2160 1431 1919
1128 1689 2155 1428 1915
1136 1747 2169 1415 1899
1144 1678 2161 1403 1881
1152 1749 2159 1474 2007
1160 1601 2050 1470 1991
1168 1648 2057 1461 1969
1176 1605 2043 1439 1948
1184 1654 2057 1428 1926
1192 1595 2051 1427 1899
1200 1647 2036 1419 1902
1208 1598 2048 1402 1888
1216 1643 2053 1471 1991
1224 1595 2043 1469 1987
1232 1649 2048 1456 1971
1240 1599 2040 1436 1939
1248 1644 2042 1433 1918
1256 1602 2045 1424 1900
1264 1648 2048 1413 1878
1272 1591 2034 1401 1878
1280 1649 2044 1475 2002
1288 1493 1984 1461 1972
1296 1484 1971 1438 1962
1304 1490 1985 1443 1947
1312 1535 1987 1425 1913
1320 1481 1965 1410 1901
1328 1493 1984 1407 1900
1336 1493 1979 1396 1882
1344 1526 1980 1465 1988
1352 1492 1970 1463 1983
1360 1487 1974 1452 1966
1368 1481 1977 1439 1937
1376 1535 1970 1428 1915
1384 1489 1973 1417 1905
1392 1483 1974 1415 1881
1400 1485 1963 1403 1882
1408 1523 1976 1466 1988
1416 1477 1969 1459 1964
1424 1487 1975 1455 1966
1432 1488 1972 1438 1941
1440 1518 1958 1432 1908
1448 1484 1972 1421 1905
1456 1485 1973 1398 1888
1464 1476 1962 1399 1870
1472 1530 1975 1471 1998
1480 1478 1967 1452 1979
1488 1478 1963 1453 1947
1496 1477 1963 1438 1930


Martin Willi (3):
crypto: x86/chacha20 - Add an 8-block AVX-512VL variant
crypto: x86/chacha20 - Add a 2-block AVX-512VL variant
crypto: x86/chacha20 - Add a 4-block AVX-512VL variant

arch/x86/crypto/Makefile | 5 +
arch/x86/crypto/chacha20-avx512vl-x86_64.S | 839 +++++++++++++++++++++
arch/x86/crypto/chacha20_glue.c | 40 +
3 files changed, 884 insertions(+)
create mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S

--
2.17.1


2018-11-21 03:01:01

by Martin Willi

Subject: [PATCH 3/3] crypto: x86/chacha20 - Add a 4-block AVX-512VL variant

This version uses the same principle as the AVX2 version: it schedules the
operations for two block pairs in parallel. It benefits from the AVX-512VL
rotate instructions and the more efficient partial block handling using
"vmovdqu8", resulting in a ~20% speedup of the raw block function.

Signed-off-by: Martin Willi <[email protected]>
---
arch/x86/crypto/chacha20-avx512vl-x86_64.S | 272 +++++++++++++++++++++
arch/x86/crypto/chacha20_glue.c | 7 +
2 files changed, 279 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
index 261097578715..55d34de29e3e 100644
--- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
@@ -12,6 +12,11 @@
CTR2BL: .octa 0x00000000000000000000000000000000
.octa 0x00000000000000000000000000000001

+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL: .octa 0x00000003000000020000000100000000
@@ -185,6 +190,273 @@ ENTRY(chacha20_2block_xor_avx512vl)

ENDPROC(chacha20_2block_xor_avx512vl)

+ENTRY(chacha20_4block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+
+ # This function encrypts four ChaCha20 blocks by loading the state
+ # matrix four times across eight AVX registers. It performs matrix
+ # operations on four words in two matrices in parallel, sequentially
+ # to the operations on the four words of the other two matrices. As the
+ # required word shuffling has a rather high latency, we can do the
+ # arithmetic on two matrix-pairs without much slowdown.
+
+ vzeroupper
+
+ # x0..3[0-4] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+ mov $10,%rax
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ dec %rax
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rcx
+ jl .Lxorpart4
+ vpxord 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rcx
+ jl .Lxorpart4
+ vpxord 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rcx
+ jl .Lxorpart4
+ vpxord 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rcx
+ jl .Lxorpart4
+ vpxord 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rcx
+ jl .Lxorpart4
+ vpxord 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rcx
+ jl .Lxorpart4
+ vpxord 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rcx
+ jl .Lxorpart4
+ vpxord 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rcx
+ jl .Lxorpart4
+ vpxord 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rcx
+ jl .Lxorpart4
+ vpxord 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rcx
+ jl .Lxorpart4
+ vpxord 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rcx
+ jl .Lxorpart4
+ vpxord 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rcx
+ jl .Lxorpart4
+ vpxord 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rcx
+ jl .Lxorpart4
+ vpxord 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rcx
+ jl .Lxorpart4
+ vpxord 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rcx
+ jl .Lxorpart4
+ vpxord 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rcx
+ jl .Lxorpart4
+ vpxord 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm10,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone4
+
+ENDPROC(chacha20_4block_xor_avx512vl)
+
ENTRY(chacha20_8block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 8 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index d6a95a6a324e..773d075a1483 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -34,6 +34,8 @@ static bool chacha20_use_avx2;
#ifdef CONFIG_AS_AVX512
asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
+asmlinkage void chacha20_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len);
asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
static bool chacha20_use_avx512vl;
@@ -64,6 +66,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
state[12] += chacha20_advance(bytes, 8);
return;
}
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha20_4block_xor_avx512vl(state, dst, src, bytes);
+ state[12] += chacha20_advance(bytes, 4);
+ return;
+ }
if (bytes) {
chacha20_2block_xor_avx512vl(state, dst, src, bytes);
state[12] += chacha20_advance(bytes, 2);
--
2.17.1

2018-11-21 03:01:00

by Martin Willi

Subject: [PATCH 2/3] crypto: x86/chacha20 - Add a 2-block AVX-512VL variant

This version uses the same principle as the AVX2 version. It benefits
from the AVX-512VL rotate instructions and the more efficient partial
block handling using "vmovdqu8", resulting in a ~20% speedup.

Unlike the AVX2 version, it is faster than the single-block SSSE3 variant
even when processing just a single block, hence we use this function for
(partial) single-block lengths as well.
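
Putting the glue changes of this series together, the AVX-512VL branch of
chacha20_dosimd() ends up dispatching roughly as follows (reassembled from
the hunks in these patches):

  if (chacha20_use_avx512vl) {
          while (bytes >= CHACHA_BLOCK_SIZE * 8) {
                  chacha20_8block_xor_avx512vl(state, dst, src, bytes);
                  bytes -= CHACHA_BLOCK_SIZE * 8;
                  src += CHACHA_BLOCK_SIZE * 8;
                  dst += CHACHA_BLOCK_SIZE * 8;
                  state[12] += 8;
          }
          if (bytes > CHACHA_BLOCK_SIZE * 4) {    /* 5..8 blocks left */
                  chacha20_8block_xor_avx512vl(state, dst, src, bytes);
                  state[12] += chacha20_advance(bytes, 8);
                  return;
          }
          if (bytes > CHACHA_BLOCK_SIZE * 2) {    /* 3..4 blocks left */
                  chacha20_4block_xor_avx512vl(state, dst, src, bytes);
                  state[12] += chacha20_advance(bytes, 4);
                  return;
          }
          if (bytes) {                            /* 1..2 blocks, incl. partial */
                  chacha20_2block_xor_avx512vl(state, dst, src, bytes);
                  state[12] += chacha20_advance(bytes, 2);
                  return;
          }
  }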

Signed-off-by: Martin Willi <[email protected]>
---
arch/x86/crypto/chacha20-avx512vl-x86_64.S | 171 +++++++++++++++++++++
arch/x86/crypto/chacha20_glue.c | 7 +
2 files changed, 178 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
index e1877afcaa73..261097578715 100644
--- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
@@ -7,6 +7,11 @@

#include <linux/linkage.h>

+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL: .octa 0x00000003000000020000000100000000
@@ -14,6 +19,172 @@ CTR8BL: .octa 0x00000003000000020000000100000000

.text

+ENTRY(chacha20_2block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+
+ # This function encrypts two ChaCha20 blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+ # x0..3[0-2] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+ mov $10,%rax
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ dec %rax
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rcx
+ jl .Lxorpart2
+ vpxord 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rcx
+ jl .Lxorpart2
+ vpxord 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rcx
+ jl .Lxorpart2
+ vpxord 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rcx
+ jl .Lxorpart2
+ vpxord 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rcx
+ jl .Lxorpart2
+ vpxord 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rcx
+ jl .Lxorpart2
+ vpxord 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rcx
+ jl .Lxorpart2
+ vpxord 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rcx
+ jl .Lxorpart2
+ vpxord 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ ret
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm7,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone2
+
+ENDPROC(chacha20_2block_xor_avx512vl)
+
ENTRY(chacha20_8block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 8 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 6a67e70bc82a..d6a95a6a324e 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -32,6 +32,8 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
static bool chacha20_use_avx2;
#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len);
asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
static bool chacha20_use_avx512vl;
@@ -62,6 +64,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
state[12] += chacha20_advance(bytes, 8);
return;
}
+ if (bytes) {
+ chacha20_2block_xor_avx512vl(state, dst, src, bytes);
+ state[12] += chacha20_advance(bytes, 2);
+ return;
+ }
}
#endif
if (chacha20_use_avx2) {
--
2.17.1

2018-11-29 21:18:16

by Herbert Xu

Subject: Re: [PATCH 0/3] crypto: x86/chacha20 - AVX-512VL block functions

On Tue, Nov 20, 2018 at 05:30:47PM +0100, Martin Willi wrote:
> In the quest for pushing the limits of chacha20 encryption for both IPsec
> and Wireguard, this small series adds AVX-512VL block functions. The VL
> variant works on 256-bit ymm registers, but compared to AVX2 can benefit
> from the new instructions.
>
> Compared to the AVX2 version, these block functions bring an overall
> speed improvement across encryption lengths of ~20%. Below the tcrypt
> results for additional block sizes in kOps/s, for the current AVX2
> code path, the new AVX-512VL code path and the comparison to Zinc in
> AVX2 and AVX-512VL. All numbers from a Xeon Platinum 8168 (2.7GHz).
>
> These numbers result in a very nice chart, available at:
> https://download.strongswan.org/misc/chacha-avx-512vl.svg
>
> [benchmark table snipped]
>
>
> Martin Willi (3):
> crypto: x86/chacha20 - Add an 8-block AVX-512VL variant
> crypto: x86/chacha20 - Add a 2-block AVX-512VL variant
> crypto: x86/chacha20 - Add a 4-block AVX-512VL variant
>
> arch/x86/crypto/Makefile | 5 +
> arch/x86/crypto/chacha20-avx512vl-x86_64.S | 839 +++++++++++++++++++++
> arch/x86/crypto/chacha20_glue.c | 40 +
> 3 files changed, 884 insertions(+)
> create mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S

All applied. Thanks.
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2018-11-21 03:01:01

by Martin Willi

Subject: [PATCH 1/3] crypto: x86/chacha20 - Add an 8-block AVX-512VL variant

This variant is similar to the AVX2 version, but benefits from the AVX-512
rotate instructions and the additional registers, so it can operate without
any data on the stack. It uses ymm registers only to avoid the massive core
throttling on Skylake-X platforms. Nonetheless, it brings a ~30% speed
improvement compared to the AVX2 variant for random encryption lengths.

The AVX2 version uses "rep movsb" for partial block XORing via the stack.
With AVX-512, the new "vmovdqu8" can do this much more efficiently. The
associated "kmov" instructions to work with dynamic masks is not part of
the AVX-512VL instruction set, hence we depend on AVX-512BW as well. Given
that the major AVX-512VL architectures provide AVX-512BW and this extension
does not affect core clocking, this seems to be no problem at least for
now.
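
What the vmovdqu8/kmov pair buys us can be sketched with intrinsics: build
a byte-granular mask covering only the trailing bytes, do a zero-masked
load, XOR the keystream in, and do a masked store. This is illustrative
only (the patch does it in assembly via shld/kmovq on %k1); the
byte-granular mask operations are the reason for the AVX-512BW dependency:

  #include <immintrin.h>
  #include <stdint.h>
  #include <stddef.h>

  /* XOR the final (len % 32) keystream bytes held in a ymm register into
   * dst without touching memory past the buffer, mirroring .Lxorpart8. */
  static void xor_tail(uint8_t *dst, const uint8_t *src, size_t tail,
                       __m256i keystream)
  {
          /* tail is 1..31; set one mask bit per covered byte lane,
           * this is what the shld/sub sequence leaves in %k1 */
          __mmask32 k = (1u << tail) - 1;

          __m256i in  = _mm256_maskz_loadu_epi8(k, src);  /* masked load  */
          __m256i out = _mm256_xor_si256(in, keystream);
          _mm256_mask_storeu_epi8(dst, k, out);           /* masked store */
  }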

Signed-off-by: Martin Willi <[email protected]>
---
arch/x86/crypto/Makefile | 5 +
arch/x86/crypto/chacha20-avx512vl-x86_64.S | 396 +++++++++++++++++++++
arch/x86/crypto/chacha20_glue.c | 26 ++
3 files changed, 427 insertions(+)
create mode 100644 arch/x86/crypto/chacha20-avx512vl-x86_64.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a4b0007a54e1..ce4e43642984 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
+avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)

@@ -103,6 +104,10 @@ ifeq ($(avx2_supported),yes)
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
endif

+ifeq ($(avx512_supported),yes)
+ chacha20-x86_64-y += chacha20-avx512vl-x86_64.o
+endif
+
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/chacha20-avx512vl-x86_64.S b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
new file mode 100644
index 000000000000..e1877afcaa73
--- /dev/null
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
@@ -0,0 +1,396 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha20_8block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+
+ # This function encrypts eight consecutive ChaCha20 blocks by loading
+ # the state matrix in AVX registers eight times. Compared to AVX2, this
+ # mostly benefits from the new rotate instructions in VL and the
+ # additional registers.
+
+ vzeroupper
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+
+ # x12 += counter values 0-7
+ vpaddd CTR8BL(%rip),%ymm12,%ymm12
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+ mov $10,%eax
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+
+ dec %eax
+ jnz .Ldoubleround8
+
+ # x0..15[0-7] += s[0..15]
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ # interleave 32-bit words in state n, n+1
+ vpunpckldq %ymm1,%ymm0,%ymm16
+ vpunpckhdq %ymm1,%ymm0,%ymm17
+ vpunpckldq %ymm3,%ymm2,%ymm18
+ vpunpckhdq %ymm3,%ymm2,%ymm19
+ vpunpckldq %ymm5,%ymm4,%ymm20
+ vpunpckhdq %ymm5,%ymm4,%ymm21
+ vpunpckldq %ymm7,%ymm6,%ymm22
+ vpunpckhdq %ymm7,%ymm6,%ymm23
+ vpunpckldq %ymm9,%ymm8,%ymm24
+ vpunpckhdq %ymm9,%ymm8,%ymm25
+ vpunpckldq %ymm11,%ymm10,%ymm26
+ vpunpckhdq %ymm11,%ymm10,%ymm27
+ vpunpckldq %ymm13,%ymm12,%ymm28
+ vpunpckhdq %ymm13,%ymm12,%ymm29
+ vpunpckldq %ymm15,%ymm14,%ymm30
+ vpunpckhdq %ymm15,%ymm14,%ymm31
+
+ # interleave 64-bit words in state n, n+2
+ vpunpcklqdq %ymm18,%ymm16,%ymm0
+ vpunpcklqdq %ymm19,%ymm17,%ymm1
+ vpunpckhqdq %ymm18,%ymm16,%ymm2
+ vpunpckhqdq %ymm19,%ymm17,%ymm3
+ vpunpcklqdq %ymm22,%ymm20,%ymm4
+ vpunpcklqdq %ymm23,%ymm21,%ymm5
+ vpunpckhqdq %ymm22,%ymm20,%ymm6
+ vpunpckhqdq %ymm23,%ymm21,%ymm7
+ vpunpcklqdq %ymm26,%ymm24,%ymm8
+ vpunpcklqdq %ymm27,%ymm25,%ymm9
+ vpunpckhqdq %ymm26,%ymm24,%ymm10
+ vpunpckhqdq %ymm27,%ymm25,%ymm11
+ vpunpcklqdq %ymm30,%ymm28,%ymm12
+ vpunpcklqdq %ymm31,%ymm29,%ymm13
+ vpunpckhqdq %ymm30,%ymm28,%ymm14
+ vpunpckhqdq %ymm31,%ymm29,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa64 %ymm0,%ymm16
+ vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
+ cmp $0x0020,%rcx
+ jl .Lxorpart8
+ vpxord 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0000(%rsi)
+ vmovdqa64 %ymm16,%ymm0
+ vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rcx
+ jl .Lxorpart8
+ vpxord 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
+ cmp $0x0060,%rcx
+ jl .Lxorpart8
+ vpxord 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rcx
+ jl .Lxorpart8
+ vpxord 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
+ cmp $0x00e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rcx
+ jl .Lxorpart8
+ vpxord 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa64 %ymm4,%ymm0
+ cmp $0x0120,%rcx
+ jl .Lxorpart8
+ vpxord 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0100(%rsi)
+
+ vmovdqa64 %ymm12,%ymm0
+ cmp $0x0140,%rcx
+ jl .Lxorpart8
+ vpxord 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0120(%rsi)
+
+ vmovdqa64 %ymm6,%ymm0
+ cmp $0x0160,%rcx
+ jl .Lxorpart8
+ vpxord 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0140(%rsi)
+
+ vmovdqa64 %ymm14,%ymm0
+ cmp $0x0180,%rcx
+ jl .Lxorpart8
+ vpxord 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0160(%rsi)
+
+ vmovdqa64 %ymm5,%ymm0
+ cmp $0x01a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0180(%rsi)
+
+ vmovdqa64 %ymm13,%ymm0
+ cmp $0x01c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01a0(%rsi)
+
+ vmovdqa64 %ymm7,%ymm0
+ cmp $0x01e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01c0(%rsi)
+
+ vmovdqa64 %ymm15,%ymm0
+ cmp $0x0200,%rcx
+ jl .Lxorpart8
+ vpxord 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ ret
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0x1f,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0x1f,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
+ vpxord %ymm0,%ymm1,%ymm1
+ vmovdqu8 %ymm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone8
+
+ENDPROC(chacha20_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 1e9e66509226..6a67e70bc82a 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -31,6 +31,11 @@ asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
static bool chacha20_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len);
+static bool chacha20_use_avx512vl;
+#endif
#endif

static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
@@ -43,6 +48,22 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
#ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+ if (chacha20_use_avx512vl) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha20_8block_xor_avx512vl(state, dst, src, bytes);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha20_8block_xor_avx512vl(state, dst, src, bytes);
+ state[12] += chacha20_advance(bytes, 8);
+ return;
+ }
+ }
+#endif
if (chacha20_use_avx2) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha20_8block_xor_avx2(state, dst, src, bytes);
@@ -149,6 +170,11 @@ static int __init chacha20_simd_mod_init(void)
chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+ chacha20_use_avx512vl = chacha20_use_avx2 &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
#endif
return crypto_register_skcipher(&alg);
}
--
2.17.1