2018-11-11 19:24:49

by Martin Willi

Subject: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

This patchset improves performance of the ChaCha20 SIMD implementations
for x86_64. For some specific encryption lengths, performance is more
than doubled. Two mechanisms are used to achieve this:

* Instead of calculating the minimal number of required blocks for a
given encryption length, functions producing more blocks are used
more aggressively. Calculating a 4-block function can be faster than
calculating a 2-block and a 1-block function, even if only three
blocks are actually required.

* In addition to the 8-block AVX2 function, a 4-block and a 2-block
function are introduced.

Patches 1-3 add support for partial lengths to the existing 1-, 4- and
8-block functions. Patch 4 makes use of that by engaging the next higher
level block functions more aggressively. Patches 5 and 6 add the new AVX2
functions for 2 and 4 blocks. Patches are based on cryptodev and would
need adjustments to apply on top of the Adiantum patchset.

Note that the more aggressive use of larger block functions calculates
blocks that may get discarded. This may have a negative impact on energy
usage or the processor's thermal budget. However, with the new block
functions we can avoid this over-calculation for many lengths, so the
performance win can be considered more important.
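
As a rough illustration of that trade-off, the following stand-alone C
sketch (not kernel code; the tail dispatch thresholds mirror the AVX2
glue code after patches 4-6) shows which block function a given length
would pick, and how many keystream blocks get computed versus consumed:

  #include <stdio.h>

  #define BLK 64  /* CHACHA20_BLOCK_SIZE */

  /* mirrors chacha20_advance() from patch 4: blocks actually consumed */
  static unsigned int advance(unsigned int len, unsigned int maxblocks)
  {
          if (len > maxblocks * BLK)
                  len = maxblocks * BLK;
          return (len + BLK - 1) / BLK;
  }

  /* AVX2 tail dispatch as of patches 4-6: step up to the next larger
   * block function once more than half of its capacity is needed */
  static unsigned int pick(unsigned int bytes)
  {
          if (bytes > 4 * BLK)
                  return 8;
          if (bytes > 2 * BLK)
                  return 4;
          if (bytes > BLK)
                  return 2;
          return 1;
  }

  int main(void)
  {
          static const unsigned int lens[] = { 72, 136, 200, 260, 500 };
          unsigned int i, w;

          for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
                  w = pick(lens[i]);
                  printf("len %3u: %u-block fn, %u consumed, %u computed\n",
                         lens[i], w, advance(lens[i], w), w);
          }
          return 0;
  }

For a 136-byte request, for example, the 4-block function computes four
keystream blocks but only three are consumed; the numbers below show
that this is still well worth it.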

Below are performance numbers measured with tcrypt using additional
encryption lengths; numbers are in kOps/s, on my i7-5557U. "old" is the
existing implementation, "new" is the implementation with this
patchset. For comparison, the numbers for zinc in v6 are included:

len old new zinc
8 5908 5818 5818
16 5917 5828 5726
24 5916 5869 5757
32 5920 5789 5813
40 5868 5799 5710
48 5877 5761 5761
56 5869 5797 5742
64 5897 5862 5685
72 3381 4979 3520
80 3364 5541 3475
88 3350 4977 3424
96 3342 5530 3371
104 3328 4923 3313
112 3317 5528 3207
120 3313 4970 3150
128 3492 5535 3568
136 2487 4570 3690
144 2481 5047 3599
152 2473 4565 3566
160 2459 5022 3515
168 2461 4550 3437
176 2454 5020 3325
184 2449 4535 3279
192 2538 5011 3762
200 1962 4537 3702
208 1962 4971 3622
216 1954 4487 3518
224 1949 4936 3445
232 1948 4497 3422
240 1941 4947 3317
248 1940 4481 3279
256 3798 4964 3723
264 2638 3577 3639
272 2637 3567 3597
280 2628 3563 3565
288 2630 3795 3484
296 2621 3580 3422
304 2612 3569 3352
312 2602 3599 3308
320 2694 3821 3694
328 2060 3538 3681
336 2054 3565 3599
344 2054 3553 3523
352 2049 3809 3419
360 2045 3575 3403
368 2035 3560 3334
376 2036 3555 3257
384 2092 3785 3715
392 1691 3505 3612
400 1684 3527 3553
408 1686 3527 3496
416 1684 3804 3430
424 1681 3555 3402
432 1675 3559 3311
440 1672 3558 3275
448 1710 3780 3689
456 1431 3541 3618
464 1428 3538 3576
472 1430 3527 3509
480 1426 3788 3405
488 1423 3502 3397
496 1423 3519 3298
504 1418 3519 3277
512 3694 3736 3735
520 2601 2571 2209
528 2601 2677 2148
536 2587 2534 2164
544 2578 2659 2138
552 2570 2552 2126
560 2566 2661 2035
568 2567 2542 2041
576 2639 2674 2199
584 2031 2531 2183
592 2027 2660 2145
600 2016 2513 2155
608 2009 2638 2133
616 2006 2522 2115
624 2000 2649 2064
632 1996 2518 2045
640 2053 2651 2188
648 1666 2402 2182
656 1663 2517 2158
664 1659 2397 2147
672 1657 2510 2139
680 1656 2394 2114
688 1653 2497 2077
696 1646 2393 2043
704 1678 2510 2208
712 1414 2391 2189
720 1412 2506 2169
728 1411 2384 2145
736 1408 2494 2142
744 1408 2379 2081
752 1405 2485 2064
760 1403 2376 2043
768 2189 2498 2211
776 1756 2137 2192
784 1746 2145 2146
792 1744 2141 2141
800 1743 2222 2094
808 1742 2140 2100
816 1735 2134 2061
824 1731 2135 2045
832 1778 2222 2223
840 1480 2132 2184
848 1480 2134 2173
856 1476 2124 2145
864 1474 2210 2126
872 1472 2127 2105
880 1463 2123 2056
888 1468 2123 2043
896 1494 2208 2219
904 1278 2120 2192
912 1277 2121 2170
920 1273 2118 2149
928 1272 2207 2125
936 1267 2125 2098
944 1265 2127 2060
952 1267 2126 2049
960 1289 2213 2204
968 1125 2123 2187
976 1122 2127 2166
984 1120 2123 2136
992 1118 2207 2119
1000 1118 2120 2101
1008 1117 2122 2042
1016 1115 2121 2048
1024 2174 2191 2195
1032 1748 1724 1565
1040 1745 1782 1544
1048 1736 1737 1554
1056 1738 1802 1541
1064 1735 1728 1523
1072 1730 1780 1507
1080 1729 1724 1497
1088 1757 1783 1592
1096 1475 1723 1575
1104 1474 1778 1563
1112 1472 1708 1544
1120 1468 1774 1521
1128 1466 1718 1521
1136 1462 1780 1501
1144 1460 1719 1491
1152 1481 1782 1575
1160 1271 1647 1558
1168 1271 1706 1554
1176 1268 1645 1545
1184 1265 1711 1538
1192 1265 1648 1530
1200 1264 1705 1493
1208 1262 1647 1498
1216 1277 1695 1581
1224 1120 1642 1563
1232 1115 1702 1549
1240 1121 1646 1538
1248 1119 1703 1527
1256 1115 1640 1520
1264 1114 1693 1505
1272 1112 1642 1492
1280 1552 1699 1574
1288 1314 1525 1573
1296 1315 1522 1551
1304 1312 1521 1548
1312 1311 1564 1535
1320 1309 1518 1524
1328 1302 1527 1508
1336 1303 1521 1500
1344 1333 1561 1579
1352 1157 1524 1573
1360 1152 1520 1546
1368 1154 1522 1545
1376 1153 1562 1536
1384 1151 1525 1526
1392 1149 1523 1504
1400 1148 1517 1480
1408 1167 1561 1589
1416 1030 1516 1558
1424 1028 1516 1546
1432 1027 1522 1537
1440 1027 1564 1523
1448 1026 1507 1512
1456 1025 1515 1491
1464 1023 1522 1481
1472 1037 1559 1577
1480 927 1518 1559
1488 926 1514 1548
1496 926 1513 1534


Martin Willi (6):
crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3
variant
crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3
variant
crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant
crypto: x86/chacha20 - Use larger block functions more aggressively
crypto: x86/chacha20 - Add a 2-block AVX2 variant
crypto: x86/chacha20 - Add a 4-block AVX2 variant

arch/x86/crypto/chacha20-avx2-x86_64.S | 696 ++++++++++++++++++++++--
arch/x86/crypto/chacha20-ssse3-x86_64.S | 237 ++++++--
arch/x86/crypto/chacha20_glue.c | 72 ++-
3 files changed, 868 insertions(+), 137 deletions(-)

--
2.17.1


2018-11-11 19:24:48

by Martin Willi

Subject: [PATCH 2/6] crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant

Add a length argument to the quad block function for SSSE3, so the
block function may XOR only a partial length of four blocks.

As we already have the stack set up, the partial XORing does not need
to set it up again. This gives a slightly different function trailer,
so we keep it separate from the 1-block function.
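
In C terms, the gating the new trailer performs looks roughly like the
sketch below (ks[] standing in for the 256 keystream bytes the four
blocks produce; the real code keeps them in XMM registers and on the
aligned stack area instead):

  /* XOR up to four blocks: full 16-byte chunks go straight to the
   * output; 1..15 trailing bytes drop to the partial path, which
   * bounces them through the already set up stack area. */
  static void xor_up_to_4_blocks(unsigned char *dst, const unsigned char *src,
                                 const unsigned char ks[256],
                                 unsigned long len)
  {
          unsigned long off = 0;
          unsigned int i;

          while (off + 16 <= len && off < 256) {
                  for (i = 0; i < 16; i++)        /* movdqu/pxor/movdqu */
                          dst[off + i] = src[off + i] ^ ks[off + i];
                  off += 16;
          }
          for (; off < len && off < 256; off++)   /* .Lxorpart4 */
                  dst[off] = src[off] ^ ks[off];
  }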

Signed-off-by: Martin Willi <[email protected]>
---
arch/x86/crypto/chacha20-ssse3-x86_64.S | 163 ++++++++++++++++++------
arch/x86/crypto/chacha20_glue.c | 5 +-
2 files changed, 128 insertions(+), 40 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 98d130b5e4ab..d8ac75bb448f 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -191,8 +191,9 @@ ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
# %rdi: Input state matrix, s
- # %rsi: 4 data blocks output, o
- # %rdx: 4 data blocks input, i
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes

# This function encrypts four consecutive ChaCha20 blocks by loading the
# the state matrix in SSE registers four times. As we need some scratch
@@ -207,6 +208,7 @@ ENTRY(chacha20_4block_xor_ssse3)
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
+ mov %rcx,%rax

# x0..15[0-3] = s0..3[0..3]
movq 0x00(%rdi),%xmm1
@@ -617,58 +619,143 @@ ENTRY(chacha20_4block_xor_ssse3)

# xor with corresponding input, write to output
movdqa 0x00(%rsp),%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart4
movdqu 0x00(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x00(%rsi)
- movdqa 0x10(%rsp),%xmm0
- movdqu 0x80(%rdx),%xmm1
+
+ movdqu %xmm4,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm0
- movdqu %xmm0,0x80(%rsi)
+ movdqu %xmm0,0x10(%rsi)
+
+ movdqu %xmm8,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ movdqu 0x20(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x20(%rsi)
+
+ movdqu %xmm12,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ movdqu 0x30(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
movdqa 0x20(%rsp),%xmm0
+ cmp $0x50,%rax
+ jl .Lxorpart4
movdqu 0x40(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x40(%rsi)
+
+ movdqu %xmm6,%xmm0
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ movdqu 0x50(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x50(%rsi)
+
+ movdqu %xmm10,%xmm0
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ movdqu 0x60(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x60(%rsi)
+
+ movdqu %xmm14,%xmm0
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ movdqu 0x70(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x70(%rsi)
+
+ movdqa 0x10(%rsp),%xmm0
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ movdqu 0x80(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x80(%rsi)
+
+ movdqu %xmm5,%xmm0
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ movdqu 0x90(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x90(%rsi)
+
+ movdqu %xmm9,%xmm0
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ movdqu 0xa0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xa0(%rsi)
+
+ movdqu %xmm13,%xmm0
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ movdqu 0xb0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xb0(%rsi)
+
movdqa 0x30(%rsp),%xmm0
+ cmp $0xd0,%rax
+ jl .Lxorpart4
movdqu 0xc0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xc0(%rsi)
- movdqu 0x10(%rdx),%xmm1
- pxor %xmm1,%xmm4
- movdqu %xmm4,0x10(%rsi)
- movdqu 0x90(%rdx),%xmm1
- pxor %xmm1,%xmm5
- movdqu %xmm5,0x90(%rsi)
- movdqu 0x50(%rdx),%xmm1
- pxor %xmm1,%xmm6
- movdqu %xmm6,0x50(%rsi)
+
+ movdqu %xmm7,%xmm0
+ cmp $0xe0,%rax
+ jl .Lxorpart4
movdqu 0xd0(%rdx),%xmm1
- pxor %xmm1,%xmm7
- movdqu %xmm7,0xd0(%rsi)
- movdqu 0x20(%rdx),%xmm1
- pxor %xmm1,%xmm8
- movdqu %xmm8,0x20(%rsi)
- movdqu 0xa0(%rdx),%xmm1
- pxor %xmm1,%xmm9
- movdqu %xmm9,0xa0(%rsi)
- movdqu 0x60(%rdx),%xmm1
- pxor %xmm1,%xmm10
- movdqu %xmm10,0x60(%rsi)
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xd0(%rsi)
+
+ movdqu %xmm11,%xmm0
+ cmp $0xf0,%rax
+ jl .Lxorpart4
movdqu 0xe0(%rdx),%xmm1
- pxor %xmm1,%xmm11
- movdqu %xmm11,0xe0(%rsi)
- movdqu 0x30(%rdx),%xmm1
- pxor %xmm1,%xmm12
- movdqu %xmm12,0x30(%rsi)
- movdqu 0xb0(%rdx),%xmm1
- pxor %xmm1,%xmm13
- movdqu %xmm13,0xb0(%rsi)
- movdqu 0x70(%rdx),%xmm1
- pxor %xmm1,%xmm14
- movdqu %xmm14,0x70(%rsi)
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xe0(%rsi)
+
+ movdqu %xmm15,%xmm0
+ cmp $0x100,%rax
+ jl .Lxorpart4
movdqu 0xf0(%rdx),%xmm1
- pxor %xmm1,%xmm15
- movdqu %xmm15,0xf0(%rsi)
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xf0(%rsi)

+.Ldone4:
lea -8(%r10),%rsp
ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone4
+
ENDPROC(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index cc4571736ce8..8f1ef1a9ce5c 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -21,7 +21,8 @@

asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len);
#ifdef CONFIG_AS_AVX2
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
static bool chacha20_use_avx2;
@@ -42,7 +43,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
}
#endif
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
- chacha20_4block_xor_ssse3(state, dst, src);
+ chacha20_4block_xor_ssse3(state, dst, src, bytes);
bytes -= CHACHA20_BLOCK_SIZE * 4;
src += CHACHA20_BLOCK_SIZE * 4;
dst += CHACHA20_BLOCK_SIZE * 4;
--
2.17.1

2018-11-16 12:58:43

by Jason A. Donenfeld

Subject: Re: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

Hi Martin,

This is nice work, and given that it's quite clean -- and that it's
usually hard to screw up chacha in subtle ways when test vectors pass
(unlike, say, poly1305 or curve25519) -- I'd be inclined to roll with
your implementation if it can eventually become competitive with Andy
Polyakov's, which I'm currently working on for Zinc (which no longer
has pre-generated code, addressing the biggest hurdle; v9 will be sent
shortly). Specifically, I'm not quite sure the improvements here tip
the balance on all avx2 microarchitectures, and most importantly,
there are still no AVX-512 paths, which means it's considerably slower
on all newer generation Intel chips. Andy's has the AVX-512VL
implementation for Skylake (using ymm, so as not to hit throttling)
and AVX-512F for Cannon Lake and beyond (using zmm). I've attached
some measurements below showing how stark the difference is.

The takeaway is that while Andy's implementation is still ahead in
terms of performance today, I'd certainly encourage your efforts to
gain parity with it, and I'd be happy to have that when the performance
and fuzzing time is right for it. So please do keep chipping away at
it; I think it's a potentially useful effort.

Regards,
Jason

size old zinc
---- ---- ----
0 64 54
16 386 372
32 388 396
48 388 420
64 366 350
80 708 666
96 708 692
112 706 736
128 692 648
144 1036 682
160 1036 708
176 1036 730
192 1016 658
208 1360 684
224 1362 708
240 1360 732
256 644 500
272 990 526
288 988 556
304 988 576
320 972 500
336 1314 532
352 1316 558
368 1318 578
384 1308 506
400 1644 532
416 1644 556
432 1644 594
448 1624 508
464 1970 534
480 1970 556
496 1968 582
512 660 624
528 1016 682
544 1016 702
560 1018 728
576 998 654
592 1344 680
608 1344 708
624 1344 730
640 1326 654
656 1670 686
672 1670 708
688 1670 732
704 1652 658
720 1998 682
736 1998 710
752 1996 734
768 1256 662
784 1606 688
800 1606 714
816 1606 736
832 1584 660
848 1948 688
864 1950 714
880 1948 736
896 1912 688
912 2258 718
928 2258 744
944 2256 768
960 2238 692
976 2584 718
992 2584 744
1008 2584 770



On Thu, Nov 15, 2018 at 6:21 PM Herbert Xu <[email protected]> wrote:
>
> On Sun, Nov 11, 2018 at 10:36:24AM +0100, Martin Willi wrote:
> > This patchset improves performance of the ChaCha20 SIMD implementations
> > for x86_64. For some specific encryption lengths, performance is more
> > than doubled. Two mechanisms are used to achieve this:
> >
> > * Instead of calculating the minimal number of required blocks for a
> > given encryption length, functions producing more blocks are used
> > more aggressively. Calculating a 4-block function can be faster than
> > calculating a 2-block and a 1-block function, even if only three
> > blocks are actually required.
> >
> > * In addition to the 8-block AVX2 function, a 4-block and a 2-block
> > function are introduced.
> >
> > Patches 1-3 add support for partial lengths to the existing 1-, 4- and
> > 8-block functions. Patch 4 makes use of that by engaging the next higher
> > level block functions more aggressively. Patch 5 and 6 add the new AVX2
> > functions for 2 and 4 blocks. Patches are based on cryptodev and would
> > need adjustments to apply on top of the Adiantum patchset.
> >
> > Note that the more aggressive use of larger block functions calculate
> > blocks that may get discarded. This may have a negative impact on energy
> > usage or the processors thermal budget. However, with the new block
> > functions we can avoid this over-calculation for many lengths, so the
> > performance win can be considered more important.
> >
> > Below are performance numbers measured with tcrypt using additional
> > encryption lengths; numbers in kOps/s, on my i7-5557U. old is the
> > existing, new the implementation with this patchset. As comparison
> > the numbers for zinc in v6:
> >
> > len old new zinc
> > 8 5908 5818 5818
> > 16 5917 5828 5726
> > 24 5916 5869 5757
> > 32 5920 5789 5813
> > 40 5868 5799 5710
> > 48 5877 5761 5761
> > 56 5869 5797 5742
> > 64 5897 5862 5685
> > 72 3381 4979 3520
> > 80 3364 5541 3475
> > 88 3350 4977 3424
> > 96 3342 5530 3371
> > 104 3328 4923 3313
> > 112 3317 5528 3207
> > 120 3313 4970 3150
> > 128 3492 5535 3568
> > 136 2487 4570 3690
> > 144 2481 5047 3599
> > 152 2473 4565 3566
> > 160 2459 5022 3515
> > 168 2461 4550 3437
> > 176 2454 5020 3325
> > 184 2449 4535 3279
> > 192 2538 5011 3762
> > 200 1962 4537 3702
> > 208 1962 4971 3622
> > 216 1954 4487 3518
> > 224 1949 4936 3445
> > 232 1948 4497 3422
> > 240 1941 4947 3317
> > 248 1940 4481 3279
> > 256 3798 4964 3723
> > 264 2638 3577 3639
> > 272 2637 3567 3597
> > 280 2628 3563 3565
> > 288 2630 3795 3484
> > 296 2621 3580 3422
> > 304 2612 3569 3352
> > 312 2602 3599 3308
> > 320 2694 3821 3694
> > 328 2060 3538 3681
> > 336 2054 3565 3599
> > 344 2054 3553 3523
> > 352 2049 3809 3419
> > 360 2045 3575 3403
> > 368 2035 3560 3334
> > 376 2036 3555 3257
> > 384 2092 3785 3715
> > 392 1691 3505 3612
> > 400 1684 3527 3553
> > 408 1686 3527 3496
> > 416 1684 3804 3430
> > 424 1681 3555 3402
> > 432 1675 3559 3311
> > 440 1672 3558 3275
> > 448 1710 3780 3689
> > 456 1431 3541 3618
> > 464 1428 3538 3576
> > 472 1430 3527 3509
> > 480 1426 3788 3405
> > 488 1423 3502 3397
> > 496 1423 3519 3298
> > 504 1418 3519 3277
> > 512 3694 3736 3735
> > 520 2601 2571 2209
> > 528 2601 2677 2148
> > 536 2587 2534 2164
> > 544 2578 2659 2138
> > 552 2570 2552 2126
> > 560 2566 2661 2035
> > 568 2567 2542 2041
> > 576 2639 2674 2199
> > 584 2031 2531 2183
> > 592 2027 2660 2145
> > 600 2016 2513 2155
> > 608 2009 2638 2133
> > 616 2006 2522 2115
> > 624 2000 2649 2064
> > 632 1996 2518 2045
> > 640 2053 2651 2188
> > 648 1666 2402 2182
> > 656 1663 2517 2158
> > 664 1659 2397 2147
> > 672 1657 2510 2139
> > 680 1656 2394 2114
> > 688 1653 2497 2077
> > 696 1646 2393 2043
> > 704 1678 2510 2208
> > 712 1414 2391 2189
> > 720 1412 2506 2169
> > 728 1411 2384 2145
> > 736 1408 2494 2142
> > 744 1408 2379 2081
> > 752 1405 2485 2064
> > 760 1403 2376 2043
> > 768 2189 2498 2211
> > 776 1756 2137 2192
> > 784 1746 2145 2146
> > 792 1744 2141 2141
> > 800 1743 2222 2094
> > 808 1742 2140 2100
> > 816 1735 2134 2061
> > 824 1731 2135 2045
> > 832 1778 2222 2223
> > 840 1480 2132 2184
> > 848 1480 2134 2173
> > 856 1476 2124 2145
> > 864 1474 2210 2126
> > 872 1472 2127 2105
> > 880 1463 2123 2056
> > 888 1468 2123 2043
> > 896 1494 2208 2219
> > 904 1278 2120 2192
> > 912 1277 2121 2170
> > 920 1273 2118 2149
> > 928 1272 2207 2125
> > 936 1267 2125 2098
> > 944 1265 2127 2060
> > 952 1267 2126 2049
> > 960 1289 2213 2204
> > 968 1125 2123 2187
> > 976 1122 2127 2166
> > 984 1120 2123 2136
> > 992 1118 2207 2119
> > 1000 1118 2120 2101
> > 1008 1117 2122 2042
> > 1016 1115 2121 2048
> > 1024 2174 2191 2195
> > 1032 1748 1724 1565
> > 1040 1745 1782 1544
> > 1048 1736 1737 1554
> > 1056 1738 1802 1541
> > 1064 1735 1728 1523
> > 1072 1730 1780 1507
> > 1080 1729 1724 1497
> > 1088 1757 1783 1592
> > 1096 1475 1723 1575
> > 1104 1474 1778 1563
> > 1112 1472 1708 1544
> > 1120 1468 1774 1521
> > 1128 1466 1718 1521
> > 1136 1462 1780 1501
> > 1144 1460 1719 1491
> > 1152 1481 1782 1575
> > 1160 1271 1647 1558
> > 1168 1271 1706 1554
> > 1176 1268 1645 1545
> > 1184 1265 1711 1538
> > 1192 1265 1648 1530
> > 1200 1264 1705 1493
> > 1208 1262 1647 1498
> > 1216 1277 1695 1581
> > 1224 1120 1642 1563
> > 1232 1115 1702 1549
> > 1240 1121 1646 1538
> > 1248 1119 1703 1527
> > 1256 1115 1640 1520
> > 1264 1114 1693 1505
> > 1272 1112 1642 1492
> > 1280 1552 1699 1574
> > 1288 1314 1525 1573
> > 1296 1315 1522 1551
> > 1304 1312 1521 1548
> > 1312 1311 1564 1535
> > 1320 1309 1518 1524
> > 1328 1302 1527 1508
> > 1336 1303 1521 1500
> > 1344 1333 1561 1579
> > 1352 1157 1524 1573
> > 1360 1152 1520 1546
> > 1368 1154 1522 1545
> > 1376 1153 1562 1536
> > 1384 1151 1525 1526
> > 1392 1149 1523 1504
> > 1400 1148 1517 1480
> > 1408 1167 1561 1589
> > 1416 1030 1516 1558
> > 1424 1028 1516 1546
> > 1432 1027 1522 1537
> > 1440 1027 1564 1523
> > 1448 1026 1507 1512
> > 1456 1025 1515 1491
> > 1464 1023 1522 1481
> > 1472 1037 1559 1577
> > 1480 927 1518 1559
> > 1488 926 1514 1548
> > 1496 926 1513 1534
>
> Nice work Martin!
>
> In light of this, and the grumblings over the wholesale replacement
> of the existing chacha20 implementations, could we add the zinc
> interface in a piecemeal fashion?
>
> That is, instead of adding a bunch of new implementations through
> zinc and then converting the crypto users over, we instead convert
> the existing implementations over to zinc in-place. This would
> also resolve the complaints over not being able to choose different
> implementations through the crypto API and leaving that choice
> solely up to zinc.
>
> After that is done, wireguard could then proceed in parallel with
> replacing any implementations should the need arise.
>
> Cheers,
> --
> Email: Herbert Xu <[email protected]>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2018-11-11 19:24:52

by Martin Willi

Subject: [PATCH 6/6] crypto: x86/chacha20 - Add a 4-block AVX2 variant

This variant builds upon the idea of the 2-block AVX2 variant that
shuffles words after each round. The shuffling has a rather high latency,
so the arithmetic units are not optimally used.

Given that we have plenty of registers in AVX, this version parallelizes
the 2-block variant to do four blocks. While the first two blocks are
shuffling, the CPU can do the XORing on the second two blocks and
vice-versa, which makes this version much faster than the SSSE3 variant
for four blocks. The latter is now mostly for systems that do not have
AVX2, but there it is the work-horse, so we keep it in place.

The partial XORing function trailer is very similar to the AVX2 2-block
variant. While it could be shared, that code segment is rather short;
profiling is also easier with the trailer integrated, so we keep it per
function.
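
Pictured with AVX2 intrinsics, one add/xor/rotate step of the double
round then looks roughly like the sketch below (illustrative only; the
real code works on raw ymm registers and covers all four word groups):

  #include <immintrin.h>

  /* One quarter-round step applied to two independent 2-block state
   * pairs. The two chains share no registers, so the out-of-order core
   * can run pair 1's addition and XOR while pair 0 still waits for its
   * byte shuffle, hiding the shuffle latency. */
  static void qr_add_xor_rot16_x2(__m256i *a0, __m256i b0, __m256i *d0,
                                  __m256i *a1, __m256i b1, __m256i *d1,
                                  __m256i rot16)
  {
          /* pair 0: x0 += x1; x3 = rotl32(x3 ^ x0, 16) */
          *a0 = _mm256_add_epi32(*a0, b0);
          *d0 = _mm256_xor_si256(*d0, *a0);
          *d0 = _mm256_shuffle_epi8(*d0, rot16);  /* vpshufb rotate */

          /* pair 1: the same step on the second two blocks */
          *a1 = _mm256_add_epi32(*a1, b1);
          *d1 = _mm256_xor_si256(*d1, *a1);
          *d1 = _mm256_shuffle_epi8(*d1, rot16);
  }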

Signed-off-by: Martin Willi <[email protected]>
---
arch/x86/crypto/chacha20-avx2-x86_64.S | 310 +++++++++++++++++++++++++
arch/x86/crypto/chacha20_glue.c | 7 +
2 files changed, 317 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
index 8247076b0ba7..b6ab082be657 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -31,6 +31,11 @@ CTRINC: .octa 0x00000003000000020000000100000000
CTR2BL: .octa 0x00000000000000000000000000000000
.octa 0x00000000000000000000000000000001

+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
.text

ENTRY(chacha20_2block_xor_avx2)
@@ -225,6 +230,311 @@ ENTRY(chacha20_2block_xor_avx2)

ENDPROC(chacha20_2block_xor_avx2)

+ENTRY(chacha20_4block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+
+ # This function encrypts four ChaCha20 block by loading the state
+ # matrix four times across eight AVX registers. It performs matrix
+ # operations on four words in two matrices in parallel, sequentially
+ # to the operations on the four words of the other two matrices. The
+ # required word shuffling has a rather high latency, we can do the
+ # arithmetic on two matrix-pairs without much slowdown.
+
+ vzeroupper
+
+ # x0..3[0-4] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+ vmovdqa ROT8(%rip),%ymm8
+ vmovdqa ROT16(%rip),%ymm9
+
+ mov %rcx,%rax
+ mov $10,%ecx
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ dec %ecx
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rax
+ jl .Lxorpart4
+ vpxor 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ vpxor 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ vpxor 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ vpxor 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rax
+ jl .Lxorpart4
+ vpxor 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ vpxor 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ vpxor 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ vpxor 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ vpxor 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ vpxor 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ vpxor 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ vpxor 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rax
+ jl .Lxorpart4
+ vpxor 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ vpxor 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ vpxor 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ vpxor 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm10,%xmm10
+ vmovdqa %xmm10,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone4
+
+ENDPROC(chacha20_4block_xor_avx2)
+
ENTRY(chacha20_8block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: up to 8 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 82e46589a189..9fd84fe6ec09 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -26,6 +26,8 @@ asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
#ifdef CONFIG_AS_AVX2
asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
+asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len);
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
static bool chacha20_use_avx2;
@@ -54,6 +56,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
state[12] += chacha20_advance(bytes, 8);
return;
}
+ if (bytes > CHACHA20_BLOCK_SIZE * 2) {
+ chacha20_4block_xor_avx2(state, dst, src, bytes);
+ state[12] += chacha20_advance(bytes, 4);
+ return;
+ }
if (bytes > CHACHA20_BLOCK_SIZE) {
chacha20_2block_xor_avx2(state, dst, src, bytes);
state[12] += chacha20_advance(bytes, 2);
--
2.17.1

2018-11-11 19:24:48

by Martin Willi

Subject: [PATCH 3/6] crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant

Add a length argument to the eight block function for AVX2, so the
block function may XOR only a partial length of eight blocks.

To avoid unnecessary operations, we integrate XORing of the first four
blocks in the final lane interleaving; this also avoids some work in
the partial lengths path.
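
Expressed with AVX2 intrinsics, the fused form now used per 32-byte
chunk looks roughly like the sketch below (illustrative only; the real
code additionally checks the remaining length before every chunk and
falls through to the partial path):

  #include <immintrin.h>

  /* Combine the low 128-bit lanes of state n and n+4 (vperm2i128
   * $0x20), XOR against the input and write out immediately, instead
   * of parking the interleaved value and XORing it in a separate,
   * later pass. */
  static void interleave_xor_write(__m256i n, __m256i n4,
                                   const unsigned char *src,
                                   unsigned char *dst)
  {
          __m256i t;

          t = _mm256_permute2x128_si256(n, n4, 0x20);
          t = _mm256_xor_si256(t, _mm256_loadu_si256((const __m256i *)src));
          _mm256_storeu_si256((__m256i *)dst, t);
  }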

Signed-off-by: Martin Willi <[email protected]>
---
arch/x86/crypto/chacha20-avx2-x86_64.S | 189 +++++++++++++++++--------
arch/x86/crypto/chacha20_glue.c | 5 +-
2 files changed, 133 insertions(+), 61 deletions(-)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
index f3cd26f48332..7b62d55bee3d 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -30,8 +30,9 @@ CTRINC: .octa 0x00000003000000020000000100000000

ENTRY(chacha20_8block_xor_avx2)
# %rdi: Input state matrix, s
- # %rsi: 8 data blocks output, o
- # %rdx: 8 data blocks input, i
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes

# This function encrypts eight consecutive ChaCha20 blocks by loading
# the state matrix in AVX registers eight times. As we need some
@@ -48,6 +49,7 @@ ENTRY(chacha20_8block_xor_avx2)
lea 8(%rsp),%r10
and $~31, %rsp
sub $0x80, %rsp
+ mov %rcx,%rax

# x0..15[0-7] = s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
@@ -375,74 +377,143 @@ ENTRY(chacha20_8block_xor_avx2)
vpunpckhqdq %ymm15,%ymm0,%ymm15

# interleave 128-bit words in state n, n+4
- vmovdqa 0x00(%rsp),%ymm0
- vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
- vmovdqa %ymm1,0x00(%rsp)
- vmovdqa 0x20(%rsp),%ymm0
- vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm0
- vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
- vmovdqa %ymm1,0x40(%rsp)
- vmovdqa 0x60(%rsp),%ymm0
- vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
- vmovdqa %ymm1,0x60(%rsp)
+ # xor/write first four blocks
+ vmovdqa 0x00(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
+ cmp $0x0020,%rax
+ jl .Lxorpart8
+ vpxor 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0000(%rsi)
+ vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
+
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rax
+ jl .Lxorpart8
+ vpxor 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0020(%rsi)
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
- vmovdqa %ymm0,%ymm8
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
- vmovdqa %ymm0,%ymm9
+
+ vmovdqa 0x40(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
+ cmp $0x0060,%rax
+ jl .Lxorpart8
+ vpxor 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
+
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rax
+ jl .Lxorpart8
+ vpxor 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0060(%rsi)
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
- vmovdqa %ymm0,%ymm10
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
- vmovdqa %ymm0,%ymm11

- # xor with corresponding input, write to output
- vmovdqa 0x00(%rsp),%ymm0
- vpxor 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0000(%rsi)
- vmovdqa 0x20(%rsp),%ymm0
+ vmovdqa 0x20(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rax
+ jl .Lxorpart8
vpxor 0x0080(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0080(%rsi)
- vmovdqa 0x40(%rsp),%ymm0
- vpxor 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0040(%rsi)
- vmovdqa 0x60(%rsp),%ymm0
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rax
+ jl .Lxorpart8
+ vpxor 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vmovdqa 0x60(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
+ cmp $0x00e0,%rax
+ jl .Lxorpart8
vpxor 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x00c0(%rsi)
- vpxor 0x0100(%rdx),%ymm4,%ymm4
- vmovdqu %ymm4,0x0100(%rsi)
- vpxor 0x0180(%rdx),%ymm5,%ymm5
- vmovdqu %ymm5,0x00180(%rsi)
- vpxor 0x0140(%rdx),%ymm6,%ymm6
- vmovdqu %ymm6,0x0140(%rsi)
- vpxor 0x01c0(%rdx),%ymm7,%ymm7
- vmovdqu %ymm7,0x01c0(%rsi)
- vpxor 0x0020(%rdx),%ymm8,%ymm8
- vmovdqu %ymm8,0x0020(%rsi)
- vpxor 0x00a0(%rdx),%ymm9,%ymm9
- vmovdqu %ymm9,0x00a0(%rsi)
- vpxor 0x0060(%rdx),%ymm10,%ymm10
- vmovdqu %ymm10,0x0060(%rsi)
- vpxor 0x00e0(%rdx),%ymm11,%ymm11
- vmovdqu %ymm11,0x00e0(%rsi)
- vpxor 0x0120(%rdx),%ymm12,%ymm12
- vmovdqu %ymm12,0x0120(%rsi)
- vpxor 0x01a0(%rdx),%ymm13,%ymm13
- vmovdqu %ymm13,0x01a0(%rsi)
- vpxor 0x0160(%rdx),%ymm14,%ymm14
- vmovdqu %ymm14,0x0160(%rsi)
- vpxor 0x01e0(%rdx),%ymm15,%ymm15
- vmovdqu %ymm15,0x01e0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rax
+ jl .Lxorpart8
+ vpxor 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa %ymm4,%ymm0
+ cmp $0x0120,%rax
+ jl .Lxorpart8
+ vpxor 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0100(%rsi)

+ vmovdqa %ymm12,%ymm0
+ cmp $0x0140,%rax
+ jl .Lxorpart8
+ vpxor 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0120(%rsi)
+
+ vmovdqa %ymm6,%ymm0
+ cmp $0x0160,%rax
+ jl .Lxorpart8
+ vpxor 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0140(%rsi)
+
+ vmovdqa %ymm14,%ymm0
+ cmp $0x0180,%rax
+ jl .Lxorpart8
+ vpxor 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0160(%rsi)
+
+ vmovdqa %ymm5,%ymm0
+ cmp $0x01a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0180(%rsi)
+
+ vmovdqa %ymm13,%ymm0
+ cmp $0x01c0,%rax
+ jl .Lxorpart8
+ vpxor 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01a0(%rsi)
+
+ vmovdqa %ymm7,%ymm0
+ cmp $0x01e0,%rax
+ jl .Lxorpart8
+ vpxor 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01c0(%rsi)
+
+ vmovdqa %ymm15,%ymm0
+ cmp $0x0200,%rax
+ jl .Lxorpart8
+ vpxor 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01e0(%rsi)
+
+.Ldone8:
vzeroupper
lea -8(%r10),%rsp
ret
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x1f,%r9
+ jz .Ldone8
+ and $~0x1f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone8
+
ENDPROC(chacha20_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 8f1ef1a9ce5c..882e8bf5965a 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -24,7 +24,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
#ifdef CONFIG_AS_AVX2
-asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len);
static bool chacha20_use_avx2;
#endif

@@ -34,7 +35,7 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
#ifdef CONFIG_AS_AVX2
if (chacha20_use_avx2) {
while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
- chacha20_8block_xor_avx2(state, dst, src);
+ chacha20_8block_xor_avx2(state, dst, src, bytes);
bytes -= CHACHA20_BLOCK_SIZE * 8;
src += CHACHA20_BLOCK_SIZE * 8;
dst += CHACHA20_BLOCK_SIZE * 8;
--
2.17.1

2018-11-20 03:10:09

by Jason A. Donenfeld

Subject: Re: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

Hi Martin,

On Mon, Nov 19, 2018 at 8:52 AM Martin Willi <[email protected]> wrote:
>
> Adding AVX-512VL support is relatively simple. I have a patchset mostly
> ready that is more than competitive with the code from Zinc. I'll clean
> that up and do more testing before posting it later this week.

Terrific. Depending on how it turns out, it'll be nice to try
integrating this into Zinc. I have a massive Xeon Gold 5120 machine
that I can give you access to if you'd like to do some testing and
benching. Poke me on IRC -- I'm zx2c4.

> I don't think that having AVX-512F is that important until it is really
> usable on CPUs in the market.

Actually, similarly here, a 10nm Cannon Lake machine should be
arriving at my house this week, which should make for some interesting
testing ground for non-throttled zmm, if you'd like to play with it.

Jason

2018-11-11 19:24:48

by Martin Willi

Subject: [PATCH 1/6] crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant

Add a length argument to the single block function for SSSE3, so the
block function may XOR only a partial length of the full block. Given
that the setup code is rather cheap, the function does not process more
than one block; this allows us to keep the block function selection in
the C glue code.

The required branching does not negatively affect performance for full
block sizes. The partial XORing uses simple "rep movsb" to copy the
data before and after doing XOR in SSE. This is rather efficient on
modern processors; movsw can be slightly faster, but the additional
complexity is probably not worth it.
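
In C terms, the new .Lxorpart path does roughly what the glue code's
temporary-buffer fallback did so far (and which this patch removes
further down), just inside the assembly and with "rep movsb" for the
copies:

  #include <string.h>

  /* Handle the 1..15 trailing bytes: copy them into a scratch buffer,
   * XOR the keystream held in the partially consumed register against
   * it, and copy the result back out. */
  static void xor_partial_block(unsigned char *dst, const unsigned char *src,
                                const unsigned char keystream[16],
                                unsigned int rem)
  {
          unsigned char buf[16];
          unsigned int i;

          memcpy(buf, src, rem);          /* rep movsb, input -> stack  */
          for (i = 0; i < rem; i++)       /* the pxor step              */
                  buf[i] ^= keystream[i];
          memcpy(dst, buf, rem);          /* rep movsb, stack -> output */
  }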

Signed-off-by: Martin Willi <[email protected]>
---
arch/x86/crypto/chacha20-ssse3-x86_64.S | 74 ++++++++++++++++++++-----
arch/x86/crypto/chacha20_glue.c | 11 ++--
2 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
index 512a2b500fd1..98d130b5e4ab 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -25,12 +25,13 @@ CTRINC: .octa 0x00000003000000020000000100000000

ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s
- # %rsi: 1 data block output, o
- # %rdx: 1 data block input, i
+ # %rsi: up to 1 data block output, o
+ # %rdx: up to 1 data block input, i
+ # %rcx: input/output length in bytes

# This function encrypts one ChaCha20 block by loading the state matrix
# in four SSE registers. It performs matrix operation on four words in
- # parallel, but requireds shuffling to rearrange the words after each
+ # parallel, but requires shuffling to rearrange the words after each
# round. 8/16-bit word rotation is done with the slightly better
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
# traditional shift+OR.
@@ -48,7 +49,8 @@ ENTRY(chacha20_block_xor_ssse3)
movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5

- mov $10,%ecx
+ mov %rcx,%rax
+ mov $10,%ecx

.Ldoubleround:

@@ -122,27 +124,69 @@ ENTRY(chacha20_block_xor_ssse3)
jnz .Ldoubleround

# o0 = i0 ^ (x0 + s0)
- movdqu 0x00(%rdx),%xmm4
paddd %xmm8,%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart
+ movdqu 0x00(%rdx),%xmm4
pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1)
- movdqu 0x10(%rdx),%xmm5
paddd %xmm9,%xmm1
- pxor %xmm5,%xmm1
- movdqu %xmm1,0x10(%rsi)
+ movdqa %xmm1,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart
+ movdqu 0x10(%rdx),%xmm0
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2)
- movdqu 0x20(%rdx),%xmm6
paddd %xmm10,%xmm2
- pxor %xmm6,%xmm2
- movdqu %xmm2,0x20(%rsi)
+ movdqa %xmm2,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart
+ movdqu 0x20(%rdx),%xmm0
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3)
- movdqu 0x30(%rdx),%xmm7
paddd %xmm11,%xmm3
- pxor %xmm7,%xmm3
- movdqu %xmm3,0x30(%rsi)
-
+ movdqa %xmm3,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart
+ movdqu 0x30(%rdx),%xmm0
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
+.Ldone:
ret
+
+.Lxorpart:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone
+
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index dce7c5d39c2f..cc4571736ce8 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -19,7 +19,8 @@

#define CHACHA20_STATE_ALIGN 16

-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len);
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
#ifdef CONFIG_AS_AVX2
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
@@ -29,8 +30,6 @@ static bool chacha20_use_avx2;
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
- u8 buf[CHACHA20_BLOCK_SIZE];
-
#ifdef CONFIG_AS_AVX2
if (chacha20_use_avx2) {
while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
@@ -50,16 +49,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
state[12] += 4;
}
while (bytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_block_xor_ssse3(state, dst, src);
+ chacha20_block_xor_ssse3(state, dst, src, bytes);
bytes -= CHACHA20_BLOCK_SIZE;
src += CHACHA20_BLOCK_SIZE;
dst += CHACHA20_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
- memcpy(buf, src, bytes);
- chacha20_block_xor_ssse3(state, buf, buf);
- memcpy(dst, buf, bytes);
+ chacha20_block_xor_ssse3(state, dst, src, bytes);
}
}

--
2.17.1

2018-11-16 16:30:47

by Herbert Xu

Subject: Re: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

On Sun, Nov 11, 2018 at 10:36:24AM +0100, Martin Willi wrote:
> This patchset improves performance of the ChaCha20 SIMD implementations
> for x86_64. For some specific encryption lengths, performance is more
> than doubled. Two mechanisms are used to achieve this:
>
> * Instead of calculating the minimal number of required blocks for a
> given encryption length, functions producing more blocks are used
> more aggressively. Calculating a 4-block function can be faster than
> calculating a 2-block and a 1-block function, even if only three
> blocks are actually required.
>
> * In addition to the 8-block AVX2 function, a 4-block and a 2-block
> function are introduced.
>
> Patches 1-3 add support for partial lengths to the existing 1-, 4- and
> 8-block functions. Patch 4 makes use of that by engaging the next higher
> level block functions more aggressively. Patch 5 and 6 add the new AVX2
> functions for 2 and 4 blocks. Patches are based on cryptodev and would
> need adjustments to apply on top of the Adiantum patchset.
>
> Note that the more aggressive use of larger block functions calculate
> blocks that may get discarded. This may have a negative impact on energy
> usage or the processors thermal budget. However, with the new block
> functions we can avoid this over-calculation for many lengths, so the
> performance win can be considered more important.
>
> Below are performance numbers measured with tcrypt using additional
> encryption lengths; numbers in kOps/s, on my i7-5557U. old is the
> existing, new the implementation with this patchset. As comparison
> the numbers for zinc in v6:
>
> len old new zinc
> 8 5908 5818 5818
> 16 5917 5828 5726
> 24 5916 5869 5757
> 32 5920 5789 5813
> 40 5868 5799 5710
> 48 5877 5761 5761
> 56 5869 5797 5742
> 64 5897 5862 5685
> 72 3381 4979 3520
> 80 3364 5541 3475
> 88 3350 4977 3424
> 96 3342 5530 3371
> 104 3328 4923 3313
> 112 3317 5528 3207
> 120 3313 4970 3150
> 128 3492 5535 3568
> 136 2487 4570 3690
> 144 2481 5047 3599
> 152 2473 4565 3566
> 160 2459 5022 3515
> 168 2461 4550 3437
> 176 2454 5020 3325
> 184 2449 4535 3279
> 192 2538 5011 3762
> 200 1962 4537 3702
> 208 1962 4971 3622
> 216 1954 4487 3518
> 224 1949 4936 3445
> 232 1948 4497 3422
> 240 1941 4947 3317
> 248 1940 4481 3279
> 256 3798 4964 3723
> 264 2638 3577 3639
> 272 2637 3567 3597
> 280 2628 3563 3565
> 288 2630 3795 3484
> 296 2621 3580 3422
> 304 2612 3569 3352
> 312 2602 3599 3308
> 320 2694 3821 3694
> 328 2060 3538 3681
> 336 2054 3565 3599
> 344 2054 3553 3523
> 352 2049 3809 3419
> 360 2045 3575 3403
> 368 2035 3560 3334
> 376 2036 3555 3257
> 384 2092 3785 3715
> 392 1691 3505 3612
> 400 1684 3527 3553
> 408 1686 3527 3496
> 416 1684 3804 3430
> 424 1681 3555 3402
> 432 1675 3559 3311
> 440 1672 3558 3275
> 448 1710 3780 3689
> 456 1431 3541 3618
> 464 1428 3538 3576
> 472 1430 3527 3509
> 480 1426 3788 3405
> 488 1423 3502 3397
> 496 1423 3519 3298
> 504 1418 3519 3277
> 512 3694 3736 3735
> 520 2601 2571 2209
> 528 2601 2677 2148
> 536 2587 2534 2164
> 544 2578 2659 2138
> 552 2570 2552 2126
> 560 2566 2661 2035
> 568 2567 2542 2041
> 576 2639 2674 2199
> 584 2031 2531 2183
> 592 2027 2660 2145
> 600 2016 2513 2155
> 608 2009 2638 2133
> 616 2006 2522 2115
> 624 2000 2649 2064
> 632 1996 2518 2045
> 640 2053 2651 2188
> 648 1666 2402 2182
> 656 1663 2517 2158
> 664 1659 2397 2147
> 672 1657 2510 2139
> 680 1656 2394 2114
> 688 1653 2497 2077
> 696 1646 2393 2043
> 704 1678 2510 2208
> 712 1414 2391 2189
> 720 1412 2506 2169
> 728 1411 2384 2145
> 736 1408 2494 2142
> 744 1408 2379 2081
> 752 1405 2485 2064
> 760 1403 2376 2043
> 768 2189 2498 2211
> 776 1756 2137 2192
> 784 1746 2145 2146
> 792 1744 2141 2141
> 800 1743 2222 2094
> 808 1742 2140 2100
> 816 1735 2134 2061
> 824 1731 2135 2045
> 832 1778 2222 2223
> 840 1480 2132 2184
> 848 1480 2134 2173
> 856 1476 2124 2145
> 864 1474 2210 2126
> 872 1472 2127 2105
> 880 1463 2123 2056
> 888 1468 2123 2043
> 896 1494 2208 2219
> 904 1278 2120 2192
> 912 1277 2121 2170
> 920 1273 2118 2149
> 928 1272 2207 2125
> 936 1267 2125 2098
> 944 1265 2127 2060
> 952 1267 2126 2049
> 960 1289 2213 2204
> 968 1125 2123 2187
> 976 1122 2127 2166
> 984 1120 2123 2136
> 992 1118 2207 2119
> 1000 1118 2120 2101
> 1008 1117 2122 2042
> 1016 1115 2121 2048
> 1024 2174 2191 2195
> 1032 1748 1724 1565
> 1040 1745 1782 1544
> 1048 1736 1737 1554
> 1056 1738 1802 1541
> 1064 1735 1728 1523
> 1072 1730 1780 1507
> 1080 1729 1724 1497
> 1088 1757 1783 1592
> 1096 1475 1723 1575
> 1104 1474 1778 1563
> 1112 1472 1708 1544
> 1120 1468 1774 1521
> 1128 1466 1718 1521
> 1136 1462 1780 1501
> 1144 1460 1719 1491
> 1152 1481 1782 1575
> 1160 1271 1647 1558
> 1168 1271 1706 1554
> 1176 1268 1645 1545
> 1184 1265 1711 1538
> 1192 1265 1648 1530
> 1200 1264 1705 1493
> 1208 1262 1647 1498
> 1216 1277 1695 1581
> 1224 1120 1642 1563
> 1232 1115 1702 1549
> 1240 1121 1646 1538
> 1248 1119 1703 1527
> 1256 1115 1640 1520
> 1264 1114 1693 1505
> 1272 1112 1642 1492
> 1280 1552 1699 1574
> 1288 1314 1525 1573
> 1296 1315 1522 1551
> 1304 1312 1521 1548
> 1312 1311 1564 1535
> 1320 1309 1518 1524
> 1328 1302 1527 1508
> 1336 1303 1521 1500
> 1344 1333 1561 1579
> 1352 1157 1524 1573
> 1360 1152 1520 1546
> 1368 1154 1522 1545
> 1376 1153 1562 1536
> 1384 1151 1525 1526
> 1392 1149 1523 1504
> 1400 1148 1517 1480
> 1408 1167 1561 1589
> 1416 1030 1516 1558
> 1424 1028 1516 1546
> 1432 1027 1522 1537
> 1440 1027 1564 1523
> 1448 1026 1507 1512
> 1456 1025 1515 1491
> 1464 1023 1522 1481
> 1472 1037 1559 1577
> 1480 927 1518 1559
> 1488 926 1514 1548
> 1496 926 1513 1534
>
>
> Martin Willi (6):
> crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3
> variant
> crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3
> variant
> crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant
> crypto: x86/chacha20 - Use larger block functions more aggressively
> crypto: x86/chacha20 - Add a 2-block AVX2 variant
> crypto: x86/chacha20 - Add a 4-block AVX2 variant
>
> arch/x86/crypto/chacha20-avx2-x86_64.S | 696 ++++++++++++++++++++++--
> arch/x86/crypto/chacha20-ssse3-x86_64.S | 237 ++++++--
> arch/x86/crypto/chacha20_glue.c | 72 ++-
> 3 files changed, 868 insertions(+), 137 deletions(-)

All applied. Thanks.
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2018-11-16 12:31:37

by Herbert Xu

Subject: Re: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

On Sun, Nov 11, 2018 at 10:36:24AM +0100, Martin Willi wrote:
> This patchset improves performance of the ChaCha20 SIMD implementations
> for x86_64. For some specific encryption lengths, performance is more
> than doubled. Two mechanisms are used to achieve this:
>
> * Instead of calculating the minimal number of required blocks for a
> given encryption length, functions producing more blocks are used
> more aggressively. Calculating a 4-block function can be faster than
> calculating a 2-block and a 1-block function, even if only three
> blocks are actually required.
>
> * In addition to the 8-block AVX2 function, a 4-block and a 2-block
> function are introduced.
>
> Patches 1-3 add support for partial lengths to the existing 1-, 4- and
> 8-block functions. Patch 4 makes use of that by engaging the next higher
> level block functions more aggressively. Patch 5 and 6 add the new AVX2
> functions for 2 and 4 blocks. Patches are based on cryptodev and would
> need adjustments to apply on top of the Adiantum patchset.
>
> Note that the more aggressive use of larger block functions calculate
> blocks that may get discarded. This may have a negative impact on energy
> usage or the processors thermal budget. However, with the new block
> functions we can avoid this over-calculation for many lengths, so the
> performance win can be considered more important.
>
> Below are performance numbers measured with tcrypt using additional
> encryption lengths; numbers in kOps/s, on my i7-5557U. old is the
> existing, new the implementation with this patchset. As comparison
> the numbers for zinc in v6:
>
> len old new zinc
> 8 5908 5818 5818
> 16 5917 5828 5726
> 24 5916 5869 5757
> 32 5920 5789 5813
> 40 5868 5799 5710
> 48 5877 5761 5761
> 56 5869 5797 5742
> 64 5897 5862 5685
> 72 3381 4979 3520
> 80 3364 5541 3475
> 88 3350 4977 3424
> 96 3342 5530 3371
> 104 3328 4923 3313
> 112 3317 5528 3207
> 120 3313 4970 3150
> 128 3492 5535 3568
> 136 2487 4570 3690
> 144 2481 5047 3599
> 152 2473 4565 3566
> 160 2459 5022 3515
> 168 2461 4550 3437
> 176 2454 5020 3325
> 184 2449 4535 3279
> 192 2538 5011 3762
> 200 1962 4537 3702
> 208 1962 4971 3622
> 216 1954 4487 3518
> 224 1949 4936 3445
> 232 1948 4497 3422
> 240 1941 4947 3317
> 248 1940 4481 3279
> 256 3798 4964 3723
> 264 2638 3577 3639
> 272 2637 3567 3597
> 280 2628 3563 3565
> 288 2630 3795 3484
> 296 2621 3580 3422
> 304 2612 3569 3352
> 312 2602 3599 3308
> 320 2694 3821 3694
> 328 2060 3538 3681
> 336 2054 3565 3599
> 344 2054 3553 3523
> 352 2049 3809 3419
> 360 2045 3575 3403
> 368 2035 3560 3334
> 376 2036 3555 3257
> 384 2092 3785 3715
> 392 1691 3505 3612
> 400 1684 3527 3553
> 408 1686 3527 3496
> 416 1684 3804 3430
> 424 1681 3555 3402
> 432 1675 3559 3311
> 440 1672 3558 3275
> 448 1710 3780 3689
> 456 1431 3541 3618
> 464 1428 3538 3576
> 472 1430 3527 3509
> 480 1426 3788 3405
> 488 1423 3502 3397
> 496 1423 3519 3298
> 504 1418 3519 3277
> 512 3694 3736 3735
> 520 2601 2571 2209
> 528 2601 2677 2148
> 536 2587 2534 2164
> 544 2578 2659 2138
> 552 2570 2552 2126
> 560 2566 2661 2035
> 568 2567 2542 2041
> 576 2639 2674 2199
> 584 2031 2531 2183
> 592 2027 2660 2145
> 600 2016 2513 2155
> 608 2009 2638 2133
> 616 2006 2522 2115
> 624 2000 2649 2064
> 632 1996 2518 2045
> 640 2053 2651 2188
> 648 1666 2402 2182
> 656 1663 2517 2158
> 664 1659 2397 2147
> 672 1657 2510 2139
> 680 1656 2394 2114
> 688 1653 2497 2077
> 696 1646 2393 2043
> 704 1678 2510 2208
> 712 1414 2391 2189
> 720 1412 2506 2169
> 728 1411 2384 2145
> 736 1408 2494 2142
> 744 1408 2379 2081
> 752 1405 2485 2064
> 760 1403 2376 2043
> 768 2189 2498 2211
> 776 1756 2137 2192
> 784 1746 2145 2146
> 792 1744 2141 2141
> 800 1743 2222 2094
> 808 1742 2140 2100
> 816 1735 2134 2061
> 824 1731 2135 2045
> 832 1778 2222 2223
> 840 1480 2132 2184
> 848 1480 2134 2173
> 856 1476 2124 2145
> 864 1474 2210 2126
> 872 1472 2127 2105
> 880 1463 2123 2056
> 888 1468 2123 2043
> 896 1494 2208 2219
> 904 1278 2120 2192
> 912 1277 2121 2170
> 920 1273 2118 2149
> 928 1272 2207 2125
> 936 1267 2125 2098
> 944 1265 2127 2060
> 952 1267 2126 2049
> 960 1289 2213 2204
> 968 1125 2123 2187
> 976 1122 2127 2166
> 984 1120 2123 2136
> 992 1118 2207 2119
> 1000 1118 2120 2101
> 1008 1117 2122 2042
> 1016 1115 2121 2048
> 1024 2174 2191 2195
> 1032 1748 1724 1565
> 1040 1745 1782 1544
> 1048 1736 1737 1554
> 1056 1738 1802 1541
> 1064 1735 1728 1523
> 1072 1730 1780 1507
> 1080 1729 1724 1497
> 1088 1757 1783 1592
> 1096 1475 1723 1575
> 1104 1474 1778 1563
> 1112 1472 1708 1544
> 1120 1468 1774 1521
> 1128 1466 1718 1521
> 1136 1462 1780 1501
> 1144 1460 1719 1491
> 1152 1481 1782 1575
> 1160 1271 1647 1558
> 1168 1271 1706 1554
> 1176 1268 1645 1545
> 1184 1265 1711 1538
> 1192 1265 1648 1530
> 1200 1264 1705 1493
> 1208 1262 1647 1498
> 1216 1277 1695 1581
> 1224 1120 1642 1563
> 1232 1115 1702 1549
> 1240 1121 1646 1538
> 1248 1119 1703 1527
> 1256 1115 1640 1520
> 1264 1114 1693 1505
> 1272 1112 1642 1492
> 1280 1552 1699 1574
> 1288 1314 1525 1573
> 1296 1315 1522 1551
> 1304 1312 1521 1548
> 1312 1311 1564 1535
> 1320 1309 1518 1524
> 1328 1302 1527 1508
> 1336 1303 1521 1500
> 1344 1333 1561 1579
> 1352 1157 1524 1573
> 1360 1152 1520 1546
> 1368 1154 1522 1545
> 1376 1153 1562 1536
> 1384 1151 1525 1526
> 1392 1149 1523 1504
> 1400 1148 1517 1480
> 1408 1167 1561 1589
> 1416 1030 1516 1558
> 1424 1028 1516 1546
> 1432 1027 1522 1537
> 1440 1027 1564 1523
> 1448 1026 1507 1512
> 1456 1025 1515 1491
> 1464 1023 1522 1481
> 1472 1037 1559 1577
> 1480 927 1518 1559
> 1488 926 1514 1548
> 1496 926 1513 1534

Nice work Martin!

In light of this, and the grumblings over the wholesale replacement
of the existing chacha20 implementations, could we add the zinc
interface in a piecemeal fashion?

That is, instead of adding a bunch of new implementations through
zinc and then converting the crypto users over, we would convert
the existing implementations to zinc in place. This would also
resolve the complaints about not being able to choose different
implementations through the crypto API, with that choice left
solely up to zinc.

After that is done, wireguard could then proceed in parallel with
replacing any implementations should the need arise.

Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2018-11-19 18:15:20

by Martin Willi

[permalink] [raw]
Subject: Re: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

Hi Jason,

> I'd be inclined to roll with your implementation if it can eventually
> become competitive with Andy Polyakov's, [...]

I think for the SSSE3/AVX2 code paths it is competitive; especially
for small sizes it is faster, which matters when implementing
layer 3 VPNs.

> there are still no AVX-512 paths, which means it's considerably
> slower on all newer generation Intel chips. Andy's has the AVX-512VL
> implementation for Skylake (using ymm, so as not to hit throttling)
> and AVX-512F for Cannon Lake and beyond (using zmm).

I don't think having AVX-512F is that important until it is really
usable on CPUs on the market.

Adding AVX-512VL support is relatively simple. I have a patchset mostly
ready that is more than competitive with the code from Zinc. I'll clean
that up and do more testing before posting it later this week.

Best regards
Martin

2018-11-21 02:59:42

by Martin Willi

[permalink] [raw]
Subject: Re: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

Hi Jason,

> [...] I have a massive Xeon Gold 5120 machine that I can give you
> access to if you'd like to do some testing and benching.

Thanks for the offer, but there is no need at this time. I would
certainly welcome it if you could do some (Wireguard) benchmarking
with that code to see if it works for you.

> Actually, similarly here, a 10nm Cannon Lake machine should be
> arriving at my house this week, which should make for some
> interesting testing ground for non-throttled zmm, if you'd like to
> play with it.

Maybe in a future iteration, thanks. In fact, it would be interesting
to know whether Cannon Lake handles that throttling better.

Regards
Martin

2018-11-11 19:24:48

by Martin Willi

[permalink] [raw]
Subject: [PATCH 4/6] crypto: x86/chacha20 - Use larger block functions more aggressively

Now that all block functions support partial lengths, engage the wider
block sizes more aggressively. This avoids invoking smaller block
functions multiple times where a single call to the next larger block
function would have been faster.

Signed-off-by: Martin Willi <[email protected]>
---
arch/x86/crypto/chacha20_glue.c | 39 ++++++++++++++++++++-------------
1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 882e8bf5965a..b541da71f11e 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -29,6 +29,12 @@ asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
static bool chacha20_use_avx2;
#endif

+static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
+{
+ len = min(len, maxblocks * CHACHA20_BLOCK_SIZE);
+ return round_up(len, CHACHA20_BLOCK_SIZE) / CHACHA20_BLOCK_SIZE;
+}
+
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
@@ -41,6 +47,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
dst += CHACHA20_BLOCK_SIZE * 8;
state[12] += 8;
}
+ if (bytes > CHACHA20_BLOCK_SIZE * 4) {
+ chacha20_8block_xor_avx2(state, dst, src, bytes);
+ state[12] += chacha20_advance(bytes, 8);
+ return;
+ }
}
#endif
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
@@ -50,15 +61,14 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
dst += CHACHA20_BLOCK_SIZE * 4;
state[12] += 4;
}
- while (bytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_block_xor_ssse3(state, dst, src, bytes);
- bytes -= CHACHA20_BLOCK_SIZE;
- src += CHACHA20_BLOCK_SIZE;
- dst += CHACHA20_BLOCK_SIZE;
- state[12]++;
+ if (bytes > CHACHA20_BLOCK_SIZE) {
+ chacha20_4block_xor_ssse3(state, dst, src, bytes);
+ state[12] += chacha20_advance(bytes, 4);
+ return;
}
if (bytes) {
chacha20_block_xor_ssse3(state, dst, src, bytes);
+ state[12]++;
}
}

@@ -82,17 +92,16 @@ static int chacha20_simd(struct skcipher_request *req)

kernel_fpu_begin();

- while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
- rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
- err = skcipher_walk_done(&walk,
- walk.nbytes % CHACHA20_BLOCK_SIZE);
- }
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);

- if (walk.nbytes) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes);
- err = skcipher_walk_done(&walk, 0);
+ nbytes);
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}

kernel_fpu_end();
--
2.17.1
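
As a minimal standalone sketch (not part of the patch; the kernel's
min()/round_up() helpers are replaced with plain C), the counter
arithmetic performed by chacha20_advance() looks like this:

#include <stdio.h>

#define CHACHA20_BLOCK_SIZE 64

/* Plain-C equivalent of the chacha20_advance() helper above: how many
 * 64-byte blocks a block function of width 'maxblocks' consumes for
 * 'len' bytes. A partial trailing block still advances the counter by
 * one, and the result never exceeds 'maxblocks'. */
static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
{
	unsigned int max = maxblocks * CHACHA20_BLOCK_SIZE;

	if (len > max)
		len = max;
	return (len + CHACHA20_BLOCK_SIZE - 1) / CHACHA20_BLOCK_SIZE;
}

int main(void)
{
	/* 200 bytes via the 4-block SSSE3 function: three full blocks
	 * plus an 8-byte tail, so the counter advances by 4. */
	printf("%u\n", chacha20_advance(200, 4));	/* prints 4 */

	/* 300 bytes via the 8-block AVX2 function: all 8 blocks are
	 * computed, but only 5 touch data, so the counter advances
	 * by 5 and the surplus keystream is discarded. */
	printf("%u\n", chacha20_advance(300, 8));	/* prints 5 */

	return 0;
}

In the second case three of the eight generated blocks are never used;
since the counter only advances by the blocks actually consumed, the
next chunk of the walk continues at the right position, and the cost is
only the extra computation.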

2018-11-21 03:03:10

by Jason A. Donenfeld

[permalink] [raw]
Subject: Re: [PATCH 0/6] crypto: x86/chacha20 - SIMD performance improvements

Hi Martin,

On Tue, Nov 20, 2018 at 5:29 PM Martin Willi <[email protected]> wrote:
> Thanks for the offer, no need at this time. But I certainly would
> welcome if you could do some (Wireguard) benching with that code to see
> if it works for you.

I certainly will test it in a few different network circumstances,
especially since real testing like this is sometimes more telling than
busy-loop benchmarks.

> > Actually, similarly here, a 10nm Cannon Lake machine should be
> > arriving at my house this week, which should make for some
> > interesting testing ground for non-throttled zmm, if you'd like to
> > play with it.
>
> Maybe in a future iteration, thanks. In fact would it be interesting to
> know if Cannon Lake can handle that throttling better.

Everything I've read on the Internet seems to indicate that's the
case, so one of the first things I'll be doing is seeing if that's
true. There are also the AVX512 IFMA instructions to play with!

Jason

2018-11-11 19:24:51

by Martin Willi

[permalink] [raw]
Subject: [PATCH 5/6] crypto: x86/chacha20 - Add a 2-block AVX2 variant

This variant uses the same principle as the single-block SSSE3 variant,
shuffling the state matrix after each round. With the wider AVX
registers, however, we can process two blocks in parallel.

This function can increase performance and efficiency significantly for
lengths that would otherwise require a 4-block function.

Signed-off-by: Martin Willi <[email protected]>
---
arch/x86/crypto/chacha20-avx2-x86_64.S | 197 +++++++++++++++++++++++++
arch/x86/crypto/chacha20_glue.c | 7 +
2 files changed, 204 insertions(+)

diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
index 7b62d55bee3d..8247076b0ba7 100644
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -26,8 +26,205 @@ ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC: .octa 0x00000003000000020000000100000000
.octa 0x00000007000000060000000500000004

+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
.text

+ENTRY(chacha20_2block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+
+ # This function encrypts two ChaCha20 blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+ # x0..3[0-2] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+ vmovdqa ROT8(%rip),%ymm4
+ vmovdqa ROT16(%rip),%ymm5
+
+ mov %rcx,%rax
+ mov $10,%ecx
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ dec %ecx
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rax
+ jl .Lxorpart2
+ vpxor 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rax
+ jl .Lxorpart2
+ vpxor 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rax
+ jl .Lxorpart2
+ vpxor 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rax
+ jl .Lxorpart2
+ vpxor 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rax
+ jl .Lxorpart2
+ vpxor 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rax
+ jl .Lxorpart2
+ vpxor 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rax
+ jl .Lxorpart2
+ vpxor 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rax
+ jl .Lxorpart2
+ vpxor 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ ret
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone2
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm7,%xmm7
+ vmovdqa %xmm7,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone2
+
+ENDPROC(chacha20_2block_xor_avx2)
+
ENTRY(chacha20_8block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: up to 8 data blocks output, o
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index b541da71f11e..82e46589a189 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -24,6 +24,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len);
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len);
static bool chacha20_use_avx2;
@@ -52,6 +54,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
state[12] += chacha20_advance(bytes, 8);
return;
}
+ if (bytes > CHACHA20_BLOCK_SIZE) {
+ chacha20_2block_xor_avx2(state, dst, src, bytes);
+ state[12] += chacha20_advance(bytes, 2);
+ return;
+ }
}
#endif
while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
--
2.17.1
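
As a plain-C reference sketch (not part of the patch, and only meant to
make the vpshufd shuffles above readable), the double round implemented
by the .Ldoubleround loop can be written as follows. The 2-block variant
runs the same sequence on two blocks held in the low and high 128-bit
lanes of each ymm register:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter-round; the SIMD code does four of these in
 * parallel, one per 32-bit lane of a row register. */
static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
	*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}

/* Rotate row 'row' of the 4x4 state left by 'n' words. In the assembly
 * this is a single vpshufd on the register holding that row: $0x39
 * rotates by one word, $0x4e by two, $0x93 by three. */
static void rotate_row(uint32_t x[16], int row, int n)
{
	uint32_t tmp[4];
	int i;

	for (i = 0; i < 4; i++)
		tmp[i] = x[row * 4 + (i + n) % 4];
	for (i = 0; i < 4; i++)
		x[row * 4 + i] = tmp[i];
}

/* One double round: a column round, a shuffle that moves the diagonals
 * into columns, a second round, and the inverse shuffle. ChaCha20
 * repeats this ten times (cf. mov $10,%ecx and the .Ldoubleround loop). */
static void doubleround(uint32_t x[16])
{
	int i;

	for (i = 0; i < 4; i++)			/* column round */
		quarterround(&x[i], &x[4 + i], &x[8 + i], &x[12 + i]);

	rotate_row(x, 1, 1);			/* vpshufd $0x39 on x1 */
	rotate_row(x, 2, 2);			/* vpshufd $0x4e on x2 */
	rotate_row(x, 3, 3);			/* vpshufd $0x93 on x3 */

	for (i = 0; i < 4; i++)			/* diagonal round */
		quarterround(&x[i], &x[4 + i], &x[8 + i], &x[12 + i]);

	rotate_row(x, 1, 3);			/* undo the shuffles */
	rotate_row(x, 2, 2);
	rotate_row(x, 3, 1);
}

int main(void)
{
	/* Toy state: ChaCha constant words, key/nonce/counter left zero,
	 * just to exercise the round function. */
	uint32_t x[16] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
	int i;

	for (i = 0; i < 10; i++)
		doubleround(x);
	return (int)(x[0] & 0xff);
}

After the ten double rounds the function adds the saved initial state
back in and xors the result with the input, which is what the o0..o3
section at the end of chacha20_2block_xor_avx2 does.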