2020-09-14 15:13:58

by Oliver Swede

Subject: [PATCH v5 08/14] arm64: Import latest optimization of memcpy

From: Sam Tebbs <[email protected]>

Import the latest memcpy implementation into memcpy and
copy_{from,to,in}_user.
The implementation of the user routines is split into two forms,
one for when UAO is enabled and one for when it is disabled, with
a runtime patch selecting between the two.
This avoids executing the many NOPs emitted when UAO is disabled.

The project containing optimized implementations for various library
functions has now been renamed from 'cortex-strings' to
'optimized-routines', and the new upstream source is
string/aarch64/memcpy.S as of commit 4c175c8be12 in
https://github.com/ARM-software/optimized-routines.

Signed-off-by: Sam Tebbs <[email protected]>
[ rm: add UAO fixups, streamline copy_exit paths, expand commit message ]
Signed-off-by: Robin Murphy <[email protected]>
[ os: import newer memcpy algorithm, update commit message ]
Signed-off-by: Oliver Swede <[email protected]>
---
arch/arm64/include/asm/alternative.h | 36 ---
arch/arm64/lib/copy_from_user.S | 113 ++++++--
arch/arm64/lib/copy_in_user.S | 129 +++++++--
arch/arm64/lib/copy_template.S | 375 +++++++++++++++------------
arch/arm64/lib/copy_template_user.S | 24 ++
arch/arm64/lib/copy_to_user.S | 112 ++++++--
arch/arm64/lib/copy_user_fixup.S | 14 +
arch/arm64/lib/memcpy.S | 47 ++--
8 files changed, 557 insertions(+), 293 deletions(-)
create mode 100644 arch/arm64/lib/copy_template_user.S
create mode 100644 arch/arm64/lib/copy_user_fixup.S

diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 619db9b4c9d5..581bacacc1bc 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -230,36 +230,6 @@ alternative_endif
* unprivileged instructions, and USER() only works for single instructions.
*/
#ifdef CONFIG_ARM64_UAO
- .macro uao_ldp l, reg1, reg2, addr, post_inc
- alternative_if_not ARM64_HAS_UAO
-8888: ldp \reg1, \reg2, [\addr], \post_inc;
-8889: nop;
- nop;
- alternative_else
- ldtr \reg1, [\addr];
- ldtr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
- alternative_endif
-
- _asm_extable 8888b,\l;
- _asm_extable 8889b,\l;
- .endm
-
- .macro uao_stp l, reg1, reg2, addr, post_inc
- alternative_if_not ARM64_HAS_UAO
-8888: stp \reg1, \reg2, [\addr], \post_inc;
-8889: nop;
- nop;
- alternative_else
- sttr \reg1, [\addr];
- sttr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
- alternative_endif
-
- _asm_extable 8888b,\l;
- _asm_extable 8889b,\l;
- .endm
-
.macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
alternative_if_not ARM64_HAS_UAO
8888: \inst \reg, [\addr], \post_inc;
@@ -272,12 +242,6 @@ alternative_endif
_asm_extable 8888b,\l;
.endm
#else
- .macro uao_ldp l, reg1, reg2, addr, post_inc
- USER(\l, ldp \reg1, \reg2, [\addr], \post_inc)
- .endm
- .macro uao_stp l, reg1, reg2, addr, post_inc
- USER(\l, stp \reg1, \reg2, [\addr], \post_inc)
- .endm
.macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc
USER(\l, \inst \reg, [\addr], \post_inc)
.endm
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 0f8a3a9e3795..86945e84c009 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -19,50 +19,111 @@
* Returns:
* x0 - bytes not copied
*/
+ .macro ldrb1 reg, ptr, offset=0
+ 8888: ldtrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1 reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldrb1_reg reg, ptr, offset
+ add \ptr, \ptr, \offset
+ 8888: ldtrb \reg, [\ptr]
+ sub \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1_reg reg, ptr, offset
+ strb \reg, [\ptr, \offset]
+ .endm

- .macro ldrb1 reg, ptr, val
- uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val
+ .macro ldr1 reg, ptr, offset=0
+ 8888: ldtr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro strb1 reg, ptr, val
- strb \reg, [\ptr], \val
+ .macro str1 reg, ptr, offset=0
+ str \reg, [\ptr, \offset]
.endm

- .macro ldrh1 reg, ptr, val
- uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val
+ .macro ldp1 regA, regB, ptr, offset=0
+ 8888: ldtr \regA, [\ptr, \offset]
+ 8889: ldtr \regB, [\ptr, \offset + 8]
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
.endm

- .macro strh1 reg, ptr, val
- strh \reg, [\ptr], \val
+ .macro stp1 regA, regB, ptr, offset=0
+ stp \regA, \regB, [\ptr, \offset]
.endm

- .macro ldr1 reg, ptr, val
- uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val
+ .macro ldp1_pre regA, regB, ptr, offset
+ 8888: ldtr \regA, [\ptr, \offset]
+ 8889: ldtr \regB, [\ptr, \offset + 8]
+ add \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
.endm

- .macro str1 reg, ptr, val
- str \reg, [\ptr], \val
+ .macro stp1_pre regA, regB, ptr, offset
+ stp \regA, \regB, [\ptr, \offset]!
.endm

- .macro ldp1 reg1, reg2, ptr, val
- uao_ldp 9998f, \reg1, \reg2, \ptr, \val
+ .macro ldrb1_nuao reg, ptr, offset=0
+ 8888: ldrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
+ .macro strb1_nuao reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldrb1_nuao_reg reg, ptr, offset=0
+ 8888: ldrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1_nuao_reg reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldr1_nuao reg, ptr, offset=0
+ 8888: ldr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro str1_nuao reg, ptr, offset=0
+ str \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldp1_nuao regA, regB, ptr, offset=0
+ 8888: ldp \regA, \regB, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_nuao regA, regB, ptr, offset=0
+ stp \regA, \regB, [\ptr, \offset]
+ .endm
+
+ .macro ldp1_pre_nuao regA, regB, ptr, offset
+ 8888: ldp \regA, \regB, [\ptr, \offset]!
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_pre_nuao regA, regB, ptr, offset
+ stp \regA, \regB, [\ptr, \offset]!
+ .endm
+
+ .macro copy_exit
+ b .Luaccess_finish
.endm

-end .req x5
SYM_FUNC_START(__arch_copy_from_user)
- add end, x0, x2
-#include "copy_template.S"
- mov x0, #0 // Nothing to copy
+#include "copy_template_user.S"
+.Luaccess_finish:
+ mov x0, #0
ret
SYM_FUNC_END(__arch_copy_from_user)
EXPORT_SYMBOL(__arch_copy_from_user)
-
- .section .fixup,"ax"
- .align 2
-9998: sub x0, end, dst // bytes not copied
- ret
- .previous
+#include "copy_user_fixup.S"
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 80e37ada0ee1..77dfccc618b6 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -21,50 +21,129 @@
* Returns:
* x0 - bytes not copied
*/
- .macro ldrb1 reg, ptr, val
- uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val
+ .macro ldrb1 reg, ptr, offset=0
+ 8888: ldtrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro strb1 reg, ptr, val
- uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val
+ .macro strb1 reg, ptr, offset=0
+ 8888: sttrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro ldrh1 reg, ptr, val
- uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val
+ .macro ldrb1_reg reg, ptr, offset
+ add \ptr, \ptr, \offset
+ 8888: ldtrb \reg, [\ptr]
+ sub \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro strh1 reg, ptr, val
- uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val
+ .macro strb1_reg reg, ptr, offset
+ add \ptr, \ptr, \offset
+ 8888: sttrb \reg, [\ptr]
+ sub \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro ldr1 reg, ptr, val
- uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val
+ .macro ldr1 reg, ptr, offset=0
+ 8888: ldtr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro str1 reg, ptr, val
- uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val
+ .macro str1 reg, ptr, offset=0
+ 8888: sttr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro ldp1 reg1, reg2, ptr, val
- uao_ldp 9998f, \reg1, \reg2, \ptr, \val
+ .macro ldp1 regA, regB, ptr, offset=0
+ 8888: ldtr \regA, [\ptr, \offset]
+ 8889: ldtr \regB, [\ptr, \offset + 8]
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
.endm

- .macro stp1 reg1, reg2, ptr, val
- uao_stp 9998f, \reg1, \reg2, \ptr, \val
+ .macro stp1 regA, regB, ptr, offset=0
+ 8888: sttr \regA, [\ptr, \offset]
+ 8889: sttr \regB, [\ptr, \offset + 8]
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
.endm

-end .req x5
+ .macro ldp1_pre regA, regB, ptr, offset
+ 8888: ldtr \regA, [\ptr, \offset]
+ 8889: ldtr \regB, [\ptr, \offset + 8]
+ add \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
+ .endm
+
+ .macro stp1_pre regA, regB, ptr, offset
+ 8888: sttr \regA, [\ptr, \offset]
+ 8889: sttr \regB, [\ptr, \offset + 8]
+ add \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
+ .endm
+
+ .macro ldrb1_nuao reg, ptr, offset=0
+ 8888: ldrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1_nuao reg, ptr, offset=0
+ 8888: strb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldrb1_nuao_reg reg, ptr, offset=0
+ 8888: ldrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro strb1_nuao_reg reg, ptr, offset=0
+ 8888: strb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldr1_nuao reg, ptr, offset=0
+ 8888: ldr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro str1_nuao reg, ptr, offset=0
+ 8888: str \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldp1_nuao regA, regB, ptr, offset=0
+ 8888: ldp \regA, \regB, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_nuao regA, regB, ptr, offset=0
+ 8888: stp \regA, \regB, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldp1_pre_nuao regA, regB, ptr, offset
+ 8888: ldp \regA, \regB, [\ptr, \offset]!
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_pre_nuao regA, regB, ptr, offset
+ 8888: stp \regA, \regB, [\ptr, \offset]!
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro copy_exit
+ b .Luaccess_finish
+ .endm

SYM_FUNC_START(__arch_copy_in_user)
- add end, x0, x2
-#include "copy_template.S"
+#include "copy_template_user.S"
+.Luaccess_finish:
mov x0, #0
ret
SYM_FUNC_END(__arch_copy_in_user)
EXPORT_SYMBOL(__arch_copy_in_user)
-
- .section .fixup,"ax"
- .align 2
-9998: sub x0, end, dst // bytes not copied
- ret
- .previous
+#include "copy_user_fixup.S"
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..90b5f63ff227 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -1,13 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012 Linaro Limited. All rights reserved.
+ * Copyright (c) 2015 ARM Ltd. All rights reserved.
*
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
+ * This code is based on work originally authored by Linaro,
+ * found at:
*
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * https://github.com/ARM-software/optimized-routines
*/


@@ -21,161 +20,209 @@
* Returns:
* x0 - dest
*/
-dstin .req x0
-src .req x1
-count .req x2
-tmp1 .req x3
-tmp1w .req w3
-tmp2 .req x4
-tmp2w .req w4
-dst .req x6
-
-A_l .req x7
-A_h .req x8
-B_l .req x9
-B_h .req x10
-C_l .req x11
-C_h .req x12
-D_l .req x13
-D_h .req x14
-
- mov dst, dstin
- cmp count, #16
- /*When memory length is less than 16, the accessed are not aligned.*/
- b.lo .Ltiny15
-
- neg tmp2, src
- ands tmp2, tmp2, #15/* Bytes to reach alignment. */
- b.eq .LSrcAligned
- sub count, count, tmp2
- /*
- * Copy the leading memory data from src to dst in an increasing
- * address order.By this way,the risk of overwriting the source
- * memory data is eliminated when the distance between src and
- * dst is less than 16. The memory accesses here are alignment.
- */
- tbz tmp2, #0, 1f
- ldrb1 tmp1w, src, #1
- strb1 tmp1w, dst, #1
-1:
- tbz tmp2, #1, 2f
- ldrh1 tmp1w, src, #2
- strh1 tmp1w, dst, #2
-2:
- tbz tmp2, #2, 3f
- ldr1 tmp1w, src, #4
- str1 tmp1w, dst, #4
-3:
- tbz tmp2, #3, .LSrcAligned
- ldr1 tmp1, src, #8
- str1 tmp1, dst, #8
-
-.LSrcAligned:
- cmp count, #64
- b.ge .Lcpy_over64
- /*
- * Deal with small copies quickly by dropping straight into the
- * exit block.
- */
-.Ltail63:
- /*
- * Copy up to 48 bytes of data. At this point we only need the
- * bottom 6 bits of count to be accurate.
- */
- ands tmp1, count, #0x30
- b.eq .Ltiny15
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
-1:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
-2:
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
-.Ltiny15:
- /*
- * Prefer to break one ldp/stp into several load/store to access
- * memory in an increasing address order,rather than to load/store 16
- * bytes from (src-16) to (dst-16) and to backward the src to aligned
- * address,which way is used in original cortex memcpy. If keeping
- * the original memcpy process here, memmove need to satisfy the
- * precondition that src address is at least 16 bytes bigger than dst
- * address,otherwise some source data will be overwritten when memove
- * call memcpy directly. To make memmove simpler and decouple the
- * memcpy's dependency on memmove, withdrew the original process.
- */
- tbz count, #3, 1f
- ldr1 tmp1, src, #8
- str1 tmp1, dst, #8
-1:
- tbz count, #2, 2f
- ldr1 tmp1w, src, #4
- str1 tmp1w, dst, #4
-2:
- tbz count, #1, 3f
- ldrh1 tmp1w, src, #2
- strh1 tmp1w, dst, #2
-3:
- tbz count, #0, .Lexitfunc
- ldrb1 tmp1w, src, #1
- strb1 tmp1w, dst, #1
-
- b .Lexitfunc
-
-.Lcpy_over64:
- subs count, count, #128
- b.ge .Lcpy_body_large
- /*
- * Less than 128 bytes to copy, so handle 64 here and then jump
- * to the tail.
- */
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- ldp1 D_l, D_h, src, #16
- stp1 D_l, D_h, dst, #16
-
- tst count, #0x3f
- b.ne .Ltail63
- b .Lexitfunc
-
- /*
- * Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line this ensures the entire loop is in one line.
- */
- .p2align L1_CACHE_SHIFT
-.Lcpy_body_large:
- /* pre-get 64 bytes data. */
- ldp1 A_l, A_h, src, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- ldp1 D_l, D_h, src, #16
-1:
- /*
- * interlace the load of next 64 bytes data block with store of the last
- * loaded 64 bytes data.
- */
- stp1 A_l, A_h, dst, #16
- ldp1 A_l, A_h, src, #16
- stp1 B_l, B_h, dst, #16
- ldp1 B_l, B_h, src, #16
- stp1 C_l, C_h, dst, #16
- ldp1 C_l, C_h, src, #16
- stp1 D_l, D_h, dst, #16
- ldp1 D_l, D_h, src, #16
- subs count, count, #64
- b.ge 1b
- stp1 A_l, A_h, dst, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- stp1 D_l, D_h, dst, #16
-
- tst count, #0x3f
- b.ne .Ltail63
-.Lexitfunc:
+ #define dstin x0
+ #define src x1
+ #define count x2
+ #define dst x3
+ #define srcend x4
+ #define dstend x5
+ #define A_l x6
+ #define A_lw w6
+ #define A_h x7
+ #define B_l x8
+ #define B_lw w8
+ #define B_h x9
+ #define C_l x10
+ #define C_lw w10
+ #define C_h x11
+ #define D_l x12
+ #define D_h x13
+ #define E_l x14
+ #define E_h x15
+ #define F_l x16
+ #define F_h x17
+ #define G_l count
+ #define G_h dst
+ #define H_l src
+ #define H_h srcend
+ #define tmp1 x14
+
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp1 A_l, A_h, src
+ ldp1 D_l, D_h, srcend, -16
+ stp1 A_l, A_h, dstin
+ stp1 D_l, D_h, dstend, -16
+ copy_exit
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr1 A_l, src
+ ldr1 A_h, srcend, -8
+ str1 A_l, dstin
+ str1 A_h, dstend, -8
+ copy_exit
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr1 A_lw, src
+ ldr1 B_lw, srcend, -4
+ str1 A_lw, dstin
+ str1 B_lw, dstend, -4
+ copy_exit
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb1 A_lw, src
+ ldrb1 C_lw, srcend, -1
+ ldrb1_reg B_lw, src, tmp1
+ strb1 A_lw, dstin
+ strb1_reg B_lw, dstin, tmp1
+ strb1 C_lw, dstend, -1
+L(copy0):
+ copy_exit
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp1 A_l, A_h, src
+ ldp1 B_l, B_h, src, 16
+ ldp1 C_l, C_h, srcend, -32
+ ldp1 D_l, D_h, srcend, -16
+ cmp count, 64
+ b.hi L(copy128)
+ stp1 A_l, A_h, dstin
+ stp1 B_l, B_h, dstin, 16
+ stp1 C_l, C_h, dstend, -32
+ stp1 D_l, D_h, dstend, -16
+ copy_exit
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp1 E_l, E_h, src, 32
+ ldp1 F_l, F_h, src, 48
+ cmp count, 96
+ b.ls L(copy96)
+ ldp1 G_l, G_h, srcend, -64
+ ldp1 H_l, H_h, srcend, -48
+ stp1 G_l, G_h, dstend, -64
+ stp1 H_l, H_h, dstend, -48
+L(copy96):
+ stp1 A_l, A_h, dstin
+ stp1 B_l, B_h, dstin, 16
+ stp1 E_l, E_h, dstin, 32
+ stp1 F_l, F_h, dstin, 48
+ stp1 C_l, C_h, dstend, -32
+ stp1 D_l, D_h, dstend, -16
+ copy_exit
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp1 D_l, D_h, src
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp1 A_l, A_h, src, 16
+ stp1 D_l, D_h, dstin
+ ldp1 B_l, B_h, src, 32
+ ldp1 C_l, C_h, src, 48
+ ldp1_pre D_l, D_h, src, 64
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+
+L(loop64):
+ stp1 A_l, A_h, dst, 16
+ ldp1 A_l, A_h, src, 16
+ stp1 B_l, B_h, dst, 32
+ ldp1 B_l, B_h, src, 32
+ stp1 C_l, C_h, dst, 48
+ ldp1 C_l, C_h, src, 48
+ stp1_pre D_l, D_h, dst, 64
+ ldp1_pre D_l, D_h, src, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp1 E_l, E_h, srcend, -64
+ stp1 A_l, A_h, dst, 16
+ ldp1 A_l, A_h, srcend, -48
+ stp1 B_l, B_h, dst, 32
+ ldp1 B_l, B_h, srcend, -32
+ stp1 C_l, C_h, dst, 48
+ ldp1 C_l, C_h, srcend, -16
+ stp1 D_l, D_h, dst, 64
+ stp1 E_l, E_h, dstend, -64
+ stp1 A_l, A_h, dstend, -48
+ stp1 B_l, B_h, dstend, -32
+ stp1 C_l, C_h, dstend, -16
+ copy_exit
+
+ .p2align 4
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp1 D_l, D_h, srcend, -16
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp1 A_l, A_h, srcend, -16
+ stp1 D_l, D_h, dstend, -16
+ ldp1 B_l, B_h, srcend, -32
+ ldp1 C_l, C_h, srcend, -48
+ ldp1_pre D_l, D_h, srcend, -64
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp1 A_l, A_h, dstend, -16
+ ldp1 A_l, A_h, srcend, -16
+ stp1 B_l, B_h, dstend, -32
+ ldp1 B_l, B_h, srcend, -32
+ stp1 C_l, C_h, dstend, -48
+ ldp1 C_l, C_h, srcend, -48
+ stp1_pre D_l, D_h, dstend, -64
+ ldp1_pre D_l, D_h, srcend, -64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp1 G_l, G_h, src, 48
+ stp1 A_l, A_h, dstend, -16
+ ldp1 A_l, A_h, src, 32
+ stp1 B_l, B_h, dstend, -32
+ ldp1 B_l, B_h, src, 16
+ stp1 C_l, C_h, dstend, -48
+ ldp1 C_l, C_h, src
+ stp1 D_l, D_h, dstend, -64
+ stp1 G_l, G_h, dstin, 48
+ stp1 A_l, A_h, dstin, 32
+ stp1 B_l, B_h, dstin, 16
+ stp1 C_l, C_h, dstin
+ copy_exit
diff --git a/arch/arm64/lib/copy_template_user.S b/arch/arm64/lib/copy_template_user.S
new file mode 100644
index 000000000000..3db24dcdab05
--- /dev/null
+++ b/arch/arm64/lib/copy_template_user.S
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#define L(l) .L ## l
+
+ alternative_if_not ARM64_HAS_UAO
+ b L(copy_non_uao)
+ alternative_else_nop_endif
+#include "copy_template.S"
+
+#define ldp1 ldp1_nuao
+#define ldp1_pre ldp1_pre_nuao
+#define stp1 stp1_nuao
+#define stp1_pre stp1_pre_nuao
+#define ldr1 ldr1_nuao
+#define str1 str1_nuao
+#define ldrb1 ldrb1_nuao
+#define strb1 strb1_nuao
+#define ldrb1_reg ldrb1_nuao_reg
+#define strb1_reg strb1_nuao_reg
+
+L(copy_non_uao):
+#undef L
+#define L(l) .Lnuao ## l
+#include "copy_template.S"
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 4ec59704b8f2..6b4742cac083 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -19,49 +19,111 @@
* Returns:
* x0 - bytes not copied
*/
- .macro ldrb1 reg, ptr, val
- ldrb \reg, [\ptr], \val
+ .macro ldrb1 reg, ptr, offset=0
+ ldrb \reg, [\ptr, \offset]
.endm

- .macro strb1 reg, ptr, val
- uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val
+ .macro strb1 reg, ptr, offset=0
+ 8888: sttrb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro ldrh1 reg, ptr, val
- ldrh \reg, [\ptr], \val
+ .macro ldrb1_reg reg, ptr, offset
+ ldrb \reg, [\ptr, \offset]
.endm

- .macro strh1 reg, ptr, val
- uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val
+ .macro strb1_reg reg, ptr, offset
+ add \ptr, \ptr, \offset
+ 8888: sttrb \reg, [\ptr]
+ sub \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro ldr1 reg, ptr, val
- ldr \reg, [\ptr], \val
+ .macro ldr1 reg, ptr, offset=0
+ ldr \reg, [\ptr, \offset]
.endm

- .macro str1 reg, ptr, val
- uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val
+ .macro str1 reg, ptr, offset=0
+ 8888: sttr \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
.endm

- .macro ldp1 reg1, reg2, ptr, val
- ldp \reg1, \reg2, [\ptr], \val
+ .macro ldp1 regA, regB, ptr, offset=0
+ ldp \regA, \regB, [\ptr, \offset]
.endm

- .macro stp1 reg1, reg2, ptr, val
- uao_stp 9998f, \reg1, \reg2, \ptr, \val
+ .macro stp1 regA, regB, ptr, offset=0
+ 8888: sttr \regA, [\ptr, \offset]
+ 8889: sttr \regB, [\ptr, \offset + 8]
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
+ .endm
+
+ .macro ldp1_pre regA, regB, ptr, offset
+ ldp \regA, \regB, [\ptr, \offset]!
+ .endm
+
+ .macro stp1_pre regA, regB, ptr, offset
+ 8888: sttr \regA, [\ptr, \offset]
+ 8889: sttr \regB, [\ptr, \offset + 8]
+ add \ptr, \ptr, \offset
+ _asm_extable_faultaddr 8888b,9998f;
+ _asm_extable_faultaddr 8889b,9998f;
+ .endm
+
+ .macro ldrb1_nuao reg, ptr, offset=0
+ ldrb \reg, [\ptr, \offset]
+ .endm
+
+ .macro strb1_nuao reg, ptr, offset=0
+ 8888: strb \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldrb1_nuao_reg reg, ptr, offset=0
+ ldrb \reg, [\ptr, \offset]
+ .endm
+
+ .macro strb1_nuao_reg reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldr1_nuao reg, ptr, offset=0
+ ldr \reg, [\ptr, \offset]
+ .endm
+
+ .macro str1_nuao reg, ptr, offset=0
+ 8888: str \reg, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro ldp1_nuao regA, regB, ptr, offset=0
+ ldp \regA, \regB, [\ptr, \offset]
+ .endm
+
+ .macro ldp1_pre_nuao regA, regB, ptr, offset
+ ldp \regA, \regB, [\ptr, \offset]!
+ .endm
+
+ .macro stp1_nuao regA, regB, ptr, offset=0
+ 8888: stp \regA, \regB, [\ptr, \offset]
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro stp1_pre_nuao regA, regB, ptr, offset
+ 8888: stp \regA, \regB, [\ptr, \offset]!
+ _asm_extable_faultaddr 8888b,9998f;
+ .endm
+
+ .macro copy_exit
+ b .Luaccess_finish
.endm

-end .req x5
SYM_FUNC_START(__arch_copy_to_user)
- add end, x0, x2
-#include "copy_template.S"
+#include "copy_template_user.S"
+.Luaccess_finish:
mov x0, #0
ret
SYM_FUNC_END(__arch_copy_to_user)
EXPORT_SYMBOL(__arch_copy_to_user)
-
- .section .fixup,"ax"
- .align 2
-9998: sub x0, end, dst // bytes not copied
- ret
- .previous
+#include "copy_user_fixup.S"
diff --git a/arch/arm64/lib/copy_user_fixup.S b/arch/arm64/lib/copy_user_fixup.S
new file mode 100644
index 000000000000..32fae9e2e799
--- /dev/null
+++ b/arch/arm64/lib/copy_user_fixup.S
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+addr .req x15
+.section .fixup,"ax"
+.align 2
+9998:
+ // If it falls in the src range then it was a load that failed,
+ // otherwise it was a store
+ cmp addr, src
+ ccmp addr, srcend, #0x0, ge
+ csel x0, srcend, dstend, lt
+ sub x0, x0, addr
+ ret
+
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index e0bf83d556f2..c24925aef236 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -24,43 +24,56 @@
* Returns:
* x0 - dest
*/
- .macro ldrb1 reg, ptr, val
- ldrb \reg, [\ptr], \val
+ #define L(l) .L ## l
+
+ .macro ldrb1 reg, ptr, offset=0
+ ldrb \reg, [\ptr, \offset]
+ .endm
+
+ .macro strb1 reg, ptr, offset=0
+ strb \reg, [\ptr, \offset]
+ .endm
+
+ .macro ldr1 reg, ptr, offset=0
+ ldr \reg, [\ptr, \offset]
.endm

- .macro strb1 reg, ptr, val
- strb \reg, [\ptr], \val
+ .macro str1 reg, ptr, offset=0
+ str \reg, [\ptr, \offset]
.endm

- .macro ldrh1 reg, ptr, val
- ldrh \reg, [\ptr], \val
+ .macro ldp1 regA, regB, ptr, offset=0
+ ldp \regA, \regB, [\ptr, \offset]
.endm

- .macro strh1 reg, ptr, val
- strh \reg, [\ptr], \val
+ .macro stp1 regA, regB, ptr, offset=0
+ stp \regA, \regB, [\ptr, \offset]
.endm

- .macro ldr1 reg, ptr, val
- ldr \reg, [\ptr], \val
+ .macro ldrb1_reg reg, ptr, offset
+ ldrb1 \reg, \ptr, \offset
.endm

- .macro str1 reg, ptr, val
- str \reg, [\ptr], \val
+ .macro strb1_reg reg, ptr, offset
+ strb1 \reg, \ptr, \offset
.endm

- .macro ldp1 reg1, reg2, ptr, val
- ldp \reg1, \reg2, [\ptr], \val
+ .macro ldp1_pre regA, regB, ptr, offset
+ ldp \regA, \regB, [\ptr, \offset]!
.endm

- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
+ .macro stp1_pre regA, regB, ptr, offset
+ stp \regA, \regB, [\ptr, \offset]!
+ .endm
+
+ .macro copy_exit
+ ret
.endm

.weak memcpy
SYM_FUNC_START_ALIAS(__memcpy)
SYM_FUNC_START_PI(memcpy)
#include "copy_template.S"
- ret
SYM_FUNC_END_PI(memcpy)
EXPORT_SYMBOL(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
--
2.17.1


2021-06-01 10:05:35

by Sunil Kovvuri

Subject: Re: [PATCH v5 08/14] arm64: Import latest optimization of memcpy

On Mon, Sep 14, 2020 at 8:44 PM Oliver Swede <[email protected]> wrote:
>
> From: Sam Tebbs <[email protected]>
>
> Import the latest memcpy implementation into memcpy,
> copy_{from, to and in}_user.
> The implementation of the user routines is separated into two forms:
> one for when UAO is enabled and one for when UAO is disabled, with
> the two being chosen between with a runtime patch.
> This avoids executing the many NOPs emitted when UAO is disabled.
>
> The project containing optimized implementations for various library
> functions has now been renamed from 'cortex-strings' to
> 'optimized-routines', and the new upstream source is
> string/aarch64/memcpy.S as of commit 4c175c8be12 in
> https://github.com/ARM-software/optimized-routines.
>
> Signed-off-by: Sam Tebbs <[email protected]>
> [ rm: add UAO fixups, streamline copy_exit paths, expand commit message ]
> Signed-off-by: Robin Murphy <[email protected]>
> [ os: import newer memcpy algorithm, update commit message ]
> Signed-off-by: Oliver Swede <[email protected]>
> ---
> arch/arm64/include/asm/alternative.h | 36 ---
> arch/arm64/lib/copy_from_user.S | 113 ++++++--
> arch/arm64/lib/copy_in_user.S | 129 +++++++--
> arch/arm64/lib/copy_template.S | 375 +++++++++++++++------------
> arch/arm64/lib/copy_template_user.S | 24 ++
> arch/arm64/lib/copy_to_user.S | 112 ++++++--
> arch/arm64/lib/copy_user_fixup.S | 14 +
> arch/arm64/lib/memcpy.S | 47 ++--
> 8 files changed, 557 insertions(+), 293 deletions(-)
> create mode 100644 arch/arm64/lib/copy_template_user.S
> create mode 100644 arch/arm64/lib/copy_user_fixup.S

Do you have any performance data for this patch?
I see these patches still haven't been pushed to mainline; is there any reason?

Also curious to know why 128-bit registers were not considered, similar to
https://android.googlesource.com/platform/bionic.git/+/a71b4c3f144a516826e8ac5b262099b920c49ce0/libc/arch-arm64/generic-neon/bionic/memcpy.S

Thanks,
Sunil.

2021-06-01 12:08:10

by Robin Murphy

Subject: Re: [PATCH v5 08/14] arm64: Import latest optimization of memcpy

On 2021-06-01 11:03, Sunil Kovvuri wrote:
> On Mon, Sep 14, 2020 at 8:44 PM Oliver Swede <[email protected]> wrote:
>>
>> From: Sam Tebbs <[email protected]>
>>
>> Import the latest memcpy implementation into memcpy,
>> copy_{from, to and in}_user.
>> The implementation of the user routines is separated into two forms:
>> one for when UAO is enabled and one for when UAO is disabled, with
>> the two being chosen between with a runtime patch.
>> This avoids executing the many NOPs emitted when UAO is disabled.
>>
>> The project containing optimized implementations for various library
>> functions has now been renamed from 'cortex-strings' to
>> 'optimized-routines', and the new upstream source is
>> string/aarch64/memcpy.S as of commit 4c175c8be12 in
>> https://github.com/ARM-software/optimized-routines.
>>
>> Signed-off-by: Sam Tebbs <[email protected]>
>> [ rm: add UAO fixups, streamline copy_exit paths, expand commit message ]
>> Signed-off-by: Robin Murphy <[email protected]>
>> [ os: import newer memcpy algorithm, update commit message ]
>> Signed-off-by: Oliver Swede <[email protected]>
>> ---
>> arch/arm64/include/asm/alternative.h | 36 ---
>> arch/arm64/lib/copy_from_user.S | 113 ++++++--
>> arch/arm64/lib/copy_in_user.S | 129 +++++++--
>> arch/arm64/lib/copy_template.S | 375 +++++++++++++++------------
>> arch/arm64/lib/copy_template_user.S | 24 ++
>> arch/arm64/lib/copy_to_user.S | 112 ++++++--
>> arch/arm64/lib/copy_user_fixup.S | 14 +
>> arch/arm64/lib/memcpy.S | 47 ++--
>> 8 files changed, 557 insertions(+), 293 deletions(-)
>> create mode 100644 arch/arm64/lib/copy_template_user.S
>> create mode 100644 arch/arm64/lib/copy_user_fixup.S
>
> Do you have any performance data with this patch ?
> I see these patches are still not pushed to mainline, any reasons ?

Funny you should pick up on the 6-month-old thread days after I've been
posting new versions of the relevant parts[1] :)

I think this series mostly stalled on the complexity of the usercopy
parts, which then turned into even more of a moving target anyway; hence
the decision to split it up.

> Also curious to know why 128bit registers are not considered, similar to
> https://android.googlesource.com/platform/bionic.git/+/a71b4c3f144a516826e8ac5b262099b920c49ce0/libc/arch-arm64/generic-neon/bionic/memcpy.S

The overhead of kernel_neon_begin() etc. is significant, and usually
only worth it in places like the crypto routines where there's enough
benefit from actual ASIMD computation to outweigh the save/restore cost.
On smaller cores where the L1 interface is only 128 bits wide anyway
there is no possible gain in memcpy() throughput to ever offset that
cost, and even for wider microarchitectures it's only likely to start
breaking even at relatively large copy sizes. Plus we can't necessarily
assume the ASIMD registers are even present (apparently the lack of a
soft-float ABI hasn't stopped people from wanting to run Linux on such
systems...)
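
To make that cost concrete, here is a rough sketch (nothing from this
series, and the helper name is invented) of what any kernel-mode ASIMD
user has to do before it can touch a single q register:

#include <linux/string.h>
#include <linux/types.h>
#include <asm/neon.h>
#include <asm/simd.h>

/* Hypothetical ASIMD-backed copy, only to show where the overhead sits. */
static void asimd_copy_sketch(void *dst, const void *src, size_t n)
{
	if (!may_use_simd()) {		/* e.g. hardirq context */
		memcpy(dst, src, n);	/* plain integer fallback */
		return;
	}

	kernel_neon_begin();	/* save the task's FP/SIMD state, disable preemption */
	/* ... q-register load/store loop would go here ... */
	kernel_neon_end();	/* restore state, re-enable preemption */
}

All of that is a fixed per-call cost before the first byte moves.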

Robin.

[1]
https://lore.kernel.org/linux-arm-kernel/[email protected]/

2021-06-01 12:32:38

by Sunil Kovvuri

Subject: Re: [PATCH v5 08/14] arm64: Import latest optimization of memcpy

On Tue, Jun 1, 2021 at 5:36 PM Robin Murphy <[email protected]> wrote:
>
> On 2021-06-01 11:03, Sunil Kovvuri wrote:
> > On Mon, Sep 14, 2020 at 8:44 PM Oliver Swede <[email protected]> wrote:
> >>
> >> From: Sam Tebbs <[email protected]>
> >>
> >> Import the latest memcpy implementation into memcpy,
> >> copy_{from, to and in}_user.
> >> The implementation of the user routines is separated into two forms:
> >> one for when UAO is enabled and one for when UAO is disabled, with
> >> the two being chosen between with a runtime patch.
> >> This avoids executing the many NOPs emitted when UAO is disabled.
> >>
> >> The project containing optimized implementations for various library
> >> functions has now been renamed from 'cortex-strings' to
> >> 'optimized-routines', and the new upstream source is
> >> string/aarch64/memcpy.S as of commit 4c175c8be12 in
> >> https://github.com/ARM-software/optimized-routines.
> >>
> >> Signed-off-by: Sam Tebbs <[email protected]>
> >> [ rm: add UAO fixups, streamline copy_exit paths, expand commit message ]
> >> Signed-off-by: Robin Murphy <[email protected]>
> >> [ os: import newer memcpy algorithm, update commit message ]
> >> Signed-off-by: Oliver Swede <[email protected]>
> >> ---
> >> arch/arm64/include/asm/alternative.h | 36 ---
> >> arch/arm64/lib/copy_from_user.S | 113 ++++++--
> >> arch/arm64/lib/copy_in_user.S | 129 +++++++--
> >> arch/arm64/lib/copy_template.S | 375 +++++++++++++++------------
> >> arch/arm64/lib/copy_template_user.S | 24 ++
> >> arch/arm64/lib/copy_to_user.S | 112 ++++++--
> >> arch/arm64/lib/copy_user_fixup.S | 14 +
> >> arch/arm64/lib/memcpy.S | 47 ++--
> >> 8 files changed, 557 insertions(+), 293 deletions(-)
> >> create mode 100644 arch/arm64/lib/copy_template_user.S
> >> create mode 100644 arch/arm64/lib/copy_user_fixup.S
> >
> > Do you have any performance data with this patch ?
> > I see these patches are still not pushed to mainline, any reasons ?
>
> Funny you should pick up on the 6-month-old thread days after I've been
> posting new versions of the relevant parts[1] :)

Hmm.. I searched by the subject and didn't find any newer versions of the
patches. It seems you changed the patch subject, which I didn't anticipate.

>
> I think this series mostly stalled on the complexity of the usercopy
> parts, which then turned into even more of a moving target anyway, hence
> why I decided to split it up.
>
> > Also curious to know why 128bit registers are not considered, similar to
> > https://android.googlesource.com/platform/bionic.git/+/a71b4c3f144a516826e8ac5b262099b920c49ce0/libc/arch-arm64/generic-neon/bionic/memcpy.S
>
> The overhead of kernel_neon_begin() etc. is significant, and usually
> only worth it in places like the crypto routines where there's enough
> benefit from actual ASIMD computation to outweigh the save/restore cost.
> On smaller cores where the L1 interface is only 128 bits wide anyway
> there is no possible gain in memcpy() throughput to ever offset that
> cost, and even for wider microarchitectures it's only likely to start
> breaking even at relatively large copy sizes. Plus we can't necessarily
> assume the ASIMD registers are even present (apparently the lack of a
> soft-float ABI hasn't stopped people from wanting to run Linux on such
> systems...)
>
> Robin.

Thanks for the info.

>
> [1]
> https://lore.kernel.org/linux-arm-kernel/[email protected]/

2021-06-03 08:48:20

by David Laight

Subject: RE: [PATCH v5 08/14] arm64: Import latest optimization of memcpy

From: Robin Murphy
> Sent: 01 June 2021 13:07
>
> On 2021-06-01 11:03, Sunil Kovvuri wrote:
> > On Mon, Sep 14, 2020 at 8:44 PM Oliver Swede <[email protected]> wrote:
> >>
> >> From: Sam Tebbs <[email protected]>
> >>
> >> Import the latest memcpy implementation into memcpy,
> >> copy_{from, to and in}_user.
> >> The implementation of the user routines is separated into two forms:
> >> one for when UAO is enabled and one for when UAO is disabled, with
> >> the two being chosen between with a runtime patch.
> >> This avoids executing the many NOPs emitted when UAO is disabled.
> >>
> >> The project containing optimized implementations for various library
> >> functions has now been renamed from 'cortex-strings' to
> >> 'optimized-routines', and the new upstream source is
> >> string/aarch64/memcpy.S as of commit 4c175c8be12 in
> >> https://github.com/ARM-software/optimized-routines.
> >>
...
> >
> > Do you have any performance data with this patch ?
> > I see these patches are still not pushed to mainline, any reasons ?
>
> Funny you should pick up on the 6-month-old thread days after I've been
> posting new versions of the relevant parts[1] :)
>
> I think this series mostly stalled on the complexity of the usercopy
> parts, which then turned into even more of a moving target anyway, hence
> why I decided to split it up.

It is also worth checking what kind of copy lengths the 'optimized'
routines are actually optimised for.
For instance, a sendmsg() system call is likely to do at least 3 short
copy_from_user() requests before even thinking about reading the data buffer.
Even the costs of the comparisons to select between short/long copy
requests become significant on short copies.

I'm not sure you want to be calling
https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy.S
for 3 bytes!
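
To put rough sizes on those leading copies, here's a simplified sketch
(an invented helper, not the real net/socket.c code) of what sendmsg()
has to pull in before it looks at the payload:

#include <linux/errno.h>
#include <linux/socket.h>
#include <linux/uaccess.h>
#include <linux/uio.h>

/* Hypothetical condensation of sendmsg()'s header copies. */
static int sendmsg_header_copies_sketch(struct user_msghdr __user *umsg)
{
	struct user_msghdr msg;
	struct iovec iov[UIO_FASTIOV];
	struct sockaddr_storage addr;

	if (copy_from_user(&msg, umsg, sizeof(msg)))	/* ~56 bytes */
		return -EFAULT;
	if (msg.msg_iovlen > UIO_FASTIOV)
		return -EINVAL;		/* the real code allocates a bigger array */
	if (copy_from_user(iov, msg.msg_iov,		/* msg_iovlen * 16 bytes */
			   msg.msg_iovlen * sizeof(struct iovec)))
		return -EFAULT;
	if (msg.msg_name) {
		if (msg.msg_namelen > sizeof(addr))
			return -EINVAL;
		if (copy_from_user(&addr, msg.msg_name,	/* <= 128 bytes */
				   msg.msg_namelen))
			return -EFAULT;
	}
	/* ...only now does the payload copy through the iovecs start. */
	return 0;
}

That's three copies of well under 128 bytes each before the data buffer
is even touched, which is exactly where the entry comparisons start to
show up.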

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)