2021-06-17 18:27:53

by Matteo Croce

[permalink] [raw]
Subject: [PATCH v3 1/3] riscv: optimized memcpy

From: Matteo Croce <[email protected]>

Write a C version of memcpy() which uses the biggest data size allowed,
without generating unaligned accesses.

The procedure is made of three steps:
First, copy data one byte at a time until the destination buffer is aligned
to a long boundary.
Then copy the data one long at a time, shifting the current and the next
long to compose a long at every cycle.
Finally, copy the remainder one byte at a time.
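
As a standalone illustration of the second step (not part of the patch),
this is how one destination word is composed from two aligned reads on a
64 bit little endian machine, once the source pointer has been pulled back
to its alignment boundary:

#include <stdint.h>

/* Illustration only: bytes_long = 8, little endian, 0 < distance < 8. */
static uint64_t compose_word(uint64_t last, uint64_t next, unsigned int distance)
{
	/*
	 * Drop the `distance` bytes of `last` that precede the wanted data,
	 * then fill the top `distance` bytes from the next aligned word.
	 */
	return last >> (distance * 8) | next << ((8 - distance) * 8);
}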

On a BeagleV, the TCP RX throughput increased by 45%:

before:

$ iperf3 -c beaglev
Connecting to host beaglev, port 5201
[ 5] local 192.168.85.6 port 44840 connected to 192.168.85.48 port 5201
[ ID] Interval Transfer Bitrate Retr Cwnd
[ 5] 0.00-1.00 sec 76.4 MBytes 641 Mbits/sec 27 624 KBytes
[ 5] 1.00-2.00 sec 72.5 MBytes 608 Mbits/sec 0 708 KBytes
[ 5] 2.00-3.00 sec 73.8 MBytes 619 Mbits/sec 10 451 KBytes
[ 5] 3.00-4.00 sec 72.5 MBytes 608 Mbits/sec 0 564 KBytes
[ 5] 4.00-5.00 sec 73.8 MBytes 619 Mbits/sec 0 658 KBytes
[ 5] 5.00-6.00 sec 73.8 MBytes 619 Mbits/sec 14 522 KBytes
[ 5] 6.00-7.00 sec 73.8 MBytes 619 Mbits/sec 0 621 KBytes
[ 5] 7.00-8.00 sec 72.5 MBytes 608 Mbits/sec 0 706 KBytes
[ 5] 8.00-9.00 sec 73.8 MBytes 619 Mbits/sec 20 580 KBytes
[ 5] 9.00-10.00 sec 73.8 MBytes 619 Mbits/sec 0 672 KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bitrate Retr
[ 5] 0.00-10.00 sec 736 MBytes 618 Mbits/sec 71 sender
[ 5] 0.00-10.01 sec 733 MBytes 615 Mbits/sec receiver

after:

$ iperf3 -c beaglev
Connecting to host beaglev, port 5201
[ 5] local 192.168.85.6 port 44864 connected to 192.168.85.48 port 5201
[ ID] Interval Transfer Bitrate Retr Cwnd
[ 5] 0.00-1.00 sec 109 MBytes 912 Mbits/sec 48 559 KBytes
[ 5] 1.00-2.00 sec 108 MBytes 902 Mbits/sec 0 690 KBytes
[ 5] 2.00-3.00 sec 106 MBytes 891 Mbits/sec 36 396 KBytes
[ 5] 3.00-4.00 sec 108 MBytes 902 Mbits/sec 0 567 KBytes
[ 5] 4.00-5.00 sec 106 MBytes 891 Mbits/sec 0 699 KBytes
[ 5] 5.00-6.00 sec 106 MBytes 891 Mbits/sec 32 414 KBytes
[ 5] 6.00-7.00 sec 106 MBytes 891 Mbits/sec 0 583 KBytes
[ 5] 7.00-8.00 sec 106 MBytes 891 Mbits/sec 0 708 KBytes
[ 5] 8.00-9.00 sec 106 MBytes 891 Mbits/sec 28 433 KBytes
[ 5] 9.00-10.00 sec 108 MBytes 902 Mbits/sec 0 591 KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval Transfer Bitrate Retr
[ 5] 0.00-10.00 sec 1.04 GBytes 897 Mbits/sec 144 sender
[ 5] 0.00-10.01 sec 1.04 GBytes 894 Mbits/sec receiver

The decreased CPU time spent in memcpy() is also visible with perf top.
This is the `perf top -Ue task-clock` output while running the test:

before:

Overhead Shared O Symbol
42.22% [kernel] [k] memcpy
35.00% [kernel] [k] __asm_copy_to_user
3.50% [kernel] [k] sifive_l2_flush64_range
2.30% [kernel] [k] stmmac_napi_poll_rx
1.11% [kernel] [k] memset

after:

Overhead Shared O Symbol
45.69% [kernel] [k] __asm_copy_to_user
29.06% [kernel] [k] memcpy
4.09% [kernel] [k] sifive_l2_flush64_range
2.77% [kernel] [k] stmmac_napi_poll_rx
1.24% [kernel] [k] memset

Signed-off-by: Matteo Croce <[email protected]>
---
arch/riscv/include/asm/string.h | 8 ++-
arch/riscv/kernel/riscv_ksyms.c | 2 -
arch/riscv/lib/Makefile | 2 +-
arch/riscv/lib/memcpy.S | 108 --------------------------------
arch/riscv/lib/string.c | 91 +++++++++++++++++++++++++++
5 files changed, 98 insertions(+), 113 deletions(-)
delete mode 100644 arch/riscv/lib/memcpy.S
create mode 100644 arch/riscv/lib/string.c

diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 909049366555..6b5d6fc3eab4 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -12,9 +12,13 @@
#define __HAVE_ARCH_MEMSET
extern asmlinkage void *memset(void *, int, size_t);
extern asmlinkage void *__memset(void *, int, size_t);
+
+#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
#define __HAVE_ARCH_MEMCPY
-extern asmlinkage void *memcpy(void *, const void *, size_t);
-extern asmlinkage void *__memcpy(void *, const void *, size_t);
+extern void *memcpy(void *dest, const void *src, size_t count);
+extern void *__memcpy(void *dest, const void *src, size_t count);
+#endif
+
#define __HAVE_ARCH_MEMMOVE
extern asmlinkage void *memmove(void *, const void *, size_t);
extern asmlinkage void *__memmove(void *, const void *, size_t);
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
index 5ab1c7e1a6ed..3f6d512a5b97 100644
--- a/arch/riscv/kernel/riscv_ksyms.c
+++ b/arch/riscv/kernel/riscv_ksyms.c
@@ -10,8 +10,6 @@
* Assembly functions that may be used (directly or indirectly) by modules
*/
EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memmove);
EXPORT_SYMBOL(__memset);
-EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(__memmove);
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 25d5c9664e57..2ffe85d4baee 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,9 +1,9 @@
# SPDX-License-Identifier: GPL-2.0-only
lib-y += delay.o
-lib-y += memcpy.o
lib-y += memset.o
lib-y += memmove.o
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
+lib-$(CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE) += string.o

obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
deleted file mode 100644
index 51ab716253fa..000000000000
--- a/arch/riscv/lib/memcpy.S
+++ /dev/null
@@ -1,108 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 Regents of the University of California
- */
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-/* void *memcpy(void *, const void *, size_t) */
-ENTRY(__memcpy)
-WEAK(memcpy)
- move t6, a0 /* Preserve return value */
-
- /* Defer to byte-oriented copy for small sizes */
- sltiu a3, a2, 128
- bnez a3, 4f
- /* Use word-oriented copy only if low-order bits match */
- andi a3, t6, SZREG-1
- andi a4, a1, SZREG-1
- bne a3, a4, 4f
-
- beqz a3, 2f /* Skip if already aligned */
- /*
- * Round to nearest double word-aligned address
- * greater than or equal to start address
- */
- andi a3, a1, ~(SZREG-1)
- addi a3, a3, SZREG
- /* Handle initial misalignment */
- sub a4, a3, a1
-1:
- lb a5, 0(a1)
- addi a1, a1, 1
- sb a5, 0(t6)
- addi t6, t6, 1
- bltu a1, a3, 1b
- sub a2, a2, a4 /* Update count */
-
-2:
- andi a4, a2, ~((16*SZREG)-1)
- beqz a4, 4f
- add a3, a1, a4
-3:
- REG_L a4, 0(a1)
- REG_L a5, SZREG(a1)
- REG_L a6, 2*SZREG(a1)
- REG_L a7, 3*SZREG(a1)
- REG_L t0, 4*SZREG(a1)
- REG_L t1, 5*SZREG(a1)
- REG_L t2, 6*SZREG(a1)
- REG_L t3, 7*SZREG(a1)
- REG_L t4, 8*SZREG(a1)
- REG_L t5, 9*SZREG(a1)
- REG_S a4, 0(t6)
- REG_S a5, SZREG(t6)
- REG_S a6, 2*SZREG(t6)
- REG_S a7, 3*SZREG(t6)
- REG_S t0, 4*SZREG(t6)
- REG_S t1, 5*SZREG(t6)
- REG_S t2, 6*SZREG(t6)
- REG_S t3, 7*SZREG(t6)
- REG_S t4, 8*SZREG(t6)
- REG_S t5, 9*SZREG(t6)
- REG_L a4, 10*SZREG(a1)
- REG_L a5, 11*SZREG(a1)
- REG_L a6, 12*SZREG(a1)
- REG_L a7, 13*SZREG(a1)
- REG_L t0, 14*SZREG(a1)
- REG_L t1, 15*SZREG(a1)
- addi a1, a1, 16*SZREG
- REG_S a4, 10*SZREG(t6)
- REG_S a5, 11*SZREG(t6)
- REG_S a6, 12*SZREG(t6)
- REG_S a7, 13*SZREG(t6)
- REG_S t0, 14*SZREG(t6)
- REG_S t1, 15*SZREG(t6)
- addi t6, t6, 16*SZREG
- bltu a1, a3, 3b
- andi a2, a2, (16*SZREG)-1 /* Update count */
-
-4:
- /* Handle trailing misalignment */
- beqz a2, 6f
- add a3, a1, a2
-
- /* Use word-oriented copy if co-aligned to word boundary */
- or a5, a1, t6
- or a5, a5, a3
- andi a5, a5, 3
- bnez a5, 5f
-7:
- lw a4, 0(a1)
- addi a1, a1, 4
- sw a4, 0(t6)
- addi t6, t6, 4
- bltu a1, a3, 7b
-
- ret
-
-5:
- lb a4, 0(a1)
- addi a1, a1, 1
- sb a4, 0(t6)
- addi t6, t6, 1
- bltu a1, a3, 5b
-6:
- ret
-END(__memcpy)
diff --git a/arch/riscv/lib/string.c b/arch/riscv/lib/string.c
new file mode 100644
index 000000000000..e48a79a045d8
--- /dev/null
+++ b/arch/riscv/lib/string.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * String functions optimized for hardware which doesn't
+ * handle unaligned memory accesses efficiently.
+ *
+ * Copyright (C) 2021 Matteo Croce
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+
+/* Minimum size for a word copy to be convenient */
+#define MIN_THRESHOLD (BITS_PER_LONG / 8 * 2)
+
+/* convenience union to avoid cast between different pointer types */
+union types {
+ u8 *u8;
+ unsigned long *ulong;
+ uintptr_t uptr;
+};
+
+union const_types {
+ const u8 *u8;
+ unsigned long *ulong;
+};
+
+void *__memcpy(void *dest, const void *src, size_t count)
+{
+ const int bytes_long = BITS_PER_LONG / 8;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ const int mask = bytes_long - 1;
+ const int distance = (src - dest) & mask;
+#endif
+ union const_types s = { .u8 = src };
+ union types d = { .u8 = dest };
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ if (count < MIN_THRESHOLD)
+ goto copy_remainder;
+
+ /* copy a byte at time until destination is aligned */
+ for (; count && d.uptr & mask; count--)
+ *d.u8++ = *s.u8++;
+
+ if (distance) {
+ unsigned long last, next;
+
+ /* move s backward to the previous alignment boundary */
+ s.u8 -= distance;
+
+ /* 32/64 bit wide copy from s to d.
+ * d is aligned now but s is not, so read s alignment wise,
+ * and do proper shift to get the right value.
+ * Works only on Little Endian machines.
+ */
+ for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
+ last = next;
+ next = s.ulong[1];
+
+ d.ulong[0] = last >> (distance * 8) |
+ next << ((bytes_long - distance) * 8);
+
+ d.ulong++;
+ s.ulong++;
+ }
+
+ /* restore s with the original offset */
+ s.u8 += distance;
+ } else
+#endif
+ {
+ /* if the source and dest lower bits are the same, do a simple
+ * 32/64 bit wide copy.
+ */
+ for (; count >= bytes_long; count -= bytes_long)
+ *d.ulong++ = *s.ulong++;
+ }
+
+ /* suppress warning when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y */
+ goto copy_remainder;
+
+copy_remainder:
+ while (count--)
+ *d.u8++ = *s.u8++;
+
+ return dest;
+}
+EXPORT_SYMBOL(__memcpy);
+
+void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
+EXPORT_SYMBOL(memcpy);
--
2.31.1


2021-06-18 17:38:21

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] riscv: optimized memcpy

Hi Matteo,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on linus/master v5.13-rc6 next-20210617]
[cannot apply to atish-riscv-linux/topo_v3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Matteo-Croce/riscv-optimized-mem-functions/20210617-232934
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: riscv-randconfig-r031-20210618 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 64720f57bea6a6bf033feef4a5751ab9c0c3b401)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install riscv cross compiling tool for clang build
# apt-get install binutils-riscv64-linux-gnu
# https://github.com/0day-ci/linux/commit/c35a3474b6782645ff0695db1f30be0e8123c131
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Matteo-Croce/riscv-optimized-mem-functions/20210617-232934
git checkout c35a3474b6782645ff0695db1f30be0e8123c131
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=riscv

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

>> arch/riscv/lib/string.c:90:57: warning: attribute declaration must precede definition [-Wignored-attributes]
void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
^
include/linux/compiler_attributes.h:291:56: note: expanded from macro '__weak'
#define __weak __attribute__((__weak__))
^
include/linux/fortify-string.h:178:24: note: previous definition is here
__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
^
1 warning generated.

Kconfig warnings: (for reference only)
WARNING: unmet direct dependencies detected for LOCKDEP
Depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT && (FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86)
Selected by
- PROVE_LOCKING && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
- LOCK_STAT && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
- DEBUG_LOCK_ALLOC && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT


vim +90 arch/riscv/lib/string.c

89
> 90 void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2021-06-21 14:30:50

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] riscv: optimized memcpy

On Thu, Jun 17, 2021 at 05:27:52PM +0200, Matteo Croce wrote:
> +extern void *memcpy(void *dest, const void *src, size_t count);
> +extern void *__memcpy(void *dest, const void *src, size_t count);

No need for externs.

> +++ b/arch/riscv/lib/string.c

Nothing in here looks RISC-V specific. Why doesn't this go into lib/ so
that other architectures can use it as well?

> +#include <linux/module.h>

I think you only need export.h.

> +void *__memcpy(void *dest, const void *src, size_t count)
> +{
> + const int bytes_long = BITS_PER_LONG / 8;
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> + const int mask = bytes_long - 1;
> + const int distance = (src - dest) & mask;
> +#endif
> + union const_types s = { .u8 = src };
> + union types d = { .u8 = dest };
> +
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> + if (count < MIN_THRESHOLD)

Using IS_ENABLED we can avoid a lot of the mess in this
function.

	int distance = 0;

	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
		if (count < MIN_THRESHOLD)
			goto copy_remainder;

		/* copy a byte at time until destination is aligned */
		for (; count && d.uptr & mask; count--)
			*d.u8++ = *s.u8++;
		distance = (src - dest) & mask;
	}

	if (distance) {
		...

> + /* 32/64 bit wide copy from s to d.
> + * d is aligned now but s is not, so read s alignment wise,
> + * and do proper shift to get the right value.
> + * Works only on Little Endian machines.
> + */

Normal kernel comment style always starts with:

/*
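
For example, the comment block quoted above would simply be rewrapped as:

/*
 * 32/64 bit wide copy from s to d.
 * d is aligned now but s is not, so read s alignment wise,
 * and do proper shift to get the right value.
 * Works only on Little Endian machines.
 */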


> + for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {

Please avoid the pointlessly overlong line. And (just as a matter of
personal preference) I find for loops that don't actually use a single
iterator rather confusing. Why not simply:

	next = s.ulong[0];
	while (count >= bytes_long + mask) {
		...
		count -= bytes_long;
	}

2021-06-22 00:16:36

by Nick Kossifidis

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] riscv: optimized memcpy

Hello Matteo and thanks for the patch,

On 2021-06-17 18:27, Matteo Croce wrote:
>
> @@ -0,0 +1,91 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * String functions optimized for hardware which doesn't
> + * handle unaligned memory accesses efficiently.
> + *
> + * Copyright (C) 2021 Matteo Croce
> + */
> +
> +#include <linux/types.h>
> +#include <linux/module.h>
> +
> +/* Minimum size for a word copy to be convenient */
> +#define MIN_THRESHOLD (BITS_PER_LONG / 8 * 2)
> +
> +/* convenience union to avoid cast between different pointer types */
> +union types {
> + u8 *u8;

You are using a type as a name; I'd go with as_bytes/as_ulong/as_uptr,
which makes it easier for the reader to understand what you are trying
to do.
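
Something along these lines, for instance (names are just a suggestion):

union types {
	u8 *as_bytes;
	unsigned long *as_ulong;
	uintptr_t as_uptr;
};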

> + unsigned long *ulong;
> + uintptr_t uptr;
> +};
> +
> +union const_types {
> + const u8 *u8;
> + unsigned long *ulong;
> +};
> +

I suggest you define those unions inside the function body, no one else
is using them.

> +void *__memcpy(void *dest, const void *src, size_t count)
> +{
> + const int bytes_long = BITS_PER_LONG / 8;
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> + const int mask = bytes_long - 1;
> + const int distance = (src - dest) & mask;

Why not unsigned ints?

> +#endif
> + union const_types s = { .u8 = src };
> + union types d = { .u8 = dest };
> +
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

If you want to be compliant with memcpy you should check for overlapping
regions here since "The memory areas must not overlap", and do nothing
about it because according to POSIX this leads to undefined behavior.
That's why recent libc implementations use memmove in any case (memcpy
is an alias to memmove), which is the suggested approach.

> + if (count < MIN_THRESHOLD)
> + goto copy_remainder;
> +
> + /* copy a byte at time until destination is aligned */
> + for (; count && d.uptr & mask; count--)
> + *d.u8++ = *s.u8++;
> +

You should check for !IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) here.

> + if (distance) {
> + unsigned long last, next;
> +
> + /* move s backward to the previous alignment boundary */
> + s.u8 -= distance;

It'd help here to explain that since s is distance bytes ahead relative
to d, and d reached the alignment boundary above, s is now aligned but
the data needs to be shifted to compensate for distance, in order to do
word-by-word copy.

> +
> + /* 32/64 bit wide copy from s to d.
> + * d is aligned now but s is not, so read s alignment wise,
> + * and do proper shift to get the right value.
> + * Works only on Little Endian machines.
> + */

This comment is misleading because s is aligned, or else s.ulong[0]/[1]
below would result in an unaligned access.

> + for (next = s.ulong[0]; count >= bytes_long + mask; count -=
> bytes_long) {
> + last = next;
> + next = s.ulong[1];
> +
> + d.ulong[0] = last >> (distance * 8) |
> + next << ((bytes_long - distance) * 8);
> +
> + d.ulong++;
> + s.ulong++;
> + }
> +
> + /* restore s with the original offset */
> + s.u8 += distance;
> + } else
> +#endif
> + {
> + /* if the source and dest lower bits are the same, do a simple
> + * 32/64 bit wide copy.
> + */

A while() loop would make more sense here.

> + for (; count >= bytes_long; count -= bytes_long)
> + *d.ulong++ = *s.ulong++;
> + }
> +
> + /* suppress warning when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y */
> + goto copy_remainder;
> +
> +copy_remainder:
> + while (count--)
> + *d.u8++ = *s.u8++;
> +
> + return dest;
> +}
> +EXPORT_SYMBOL(__memcpy);
> +
> +void *memcpy(void *dest, const void *src, size_t count) __weak
> __alias(__memcpy);
> +EXPORT_SYMBOL(memcpy);

2021-06-22 08:19:59

by David Laight

[permalink] [raw]
Subject: RE: [PATCH v3 1/3] riscv: optimized memcpy

From: Christoph Hellwig
> Sent: 21 June 2021 15:27
...
> > + for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
>
> Please avoid the pointlessly overlong line. And (just as a matter of
> personal preference) I find for loop that don't actually use a single
> iterator rather confusing. Wjy not simply:
>
> next = s.ulong[0];
> while (count >= bytes_long + mask) {
> ...
> count -= bytes_long;
> }

My first attack on long 'for' statements is just to move the
initialisation to the previous line.
Then make sure there is nothing in the comparison that needs
to be calculated every iteration.
I suspect you can subtract 'mask' from 'count'.
Giving:

	count -= mask;
	next = s.ulong[0];
	for (; count > bytes_long; count -= bytes_long) {

Next is to shorten the variable names!

David


2021-06-22 22:02:17

by Matteo Croce

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] riscv: optimized memcpy

On Mon, Jun 21, 2021 at 4:26 PM Christoph Hellwig <[email protected]> wrote:
>
> On Thu, Jun 17, 2021 at 05:27:52PM +0200, Matteo Croce wrote:
> > +extern void *memcpy(void *dest, const void *src, size_t count);
> > +extern void *__memcpy(void *dest, const void *src, size_t count);
>
> No need for externs.
>

Right.

> > +++ b/arch/riscv/lib/string.c
>
> Nothing in her looks RISC-V specific. Why doesn't this go into lib/ so
> that other architectures can use it as well.
>

Technically it could go into lib/ and be generic.
If you think it's worth it, I just have to handle the different
left/right shifts depending on endianness.
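
For reference, a rough, untested sketch of how the shifts would swap on a
big endian machine (just to show the idea):

	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
		d.ulong[0] = last << (distance * 8) |
			     next >> ((bytes_long - distance) * 8);
	else
		d.ulong[0] = last >> (distance * 8) |
			     next << ((bytes_long - distance) * 8);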

> > +#include <linux/module.h>
>
> I think you only need export.h.
>

Nice.

> > +void *__memcpy(void *dest, const void *src, size_t count)
> > +{
> > + const int bytes_long = BITS_PER_LONG / 8;
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > + const int mask = bytes_long - 1;
> > + const int distance = (src - dest) & mask;
> > +#endif
> > + union const_types s = { .u8 = src };
> > + union types d = { .u8 = dest };
> > +
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > + if (count < MIN_THRESHOLD)
>
> Using IS_ENABLED we can avoid a lot of the mess in this
> function.
>
> int distance = 0;
>
> if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
> if (count < MIN_THRESHOLD)
> goto copy_remainder;
>
> /* copy a byte at time until destination is aligned */
> for (; count && d.uptr & mask; count--)
> *d.u8++ = *s.u8++;
> distance = (src - dest) & mask;
> }
>

Cool. What about putting this check at the very start:

	if (count < MIN_THRESHOLD)
		goto copy_remainder;

And since count is at least twice bytes_long, remove count from the check below?
Also, setting distance after d is aligned is as simple as getting the
lower bits of s:

	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
		/* Copy a byte at a time until destination is aligned. */
		for (; d.uptr & mask; count--)
			*d.u8++ = *s.u8++;

		distance = s.uptr & mask;
	}

> if (distance) {
> ...
>
> > + /* 32/64 bit wide copy from s to d.
> > + * d is aligned now but s is not, so read s alignment wise,
> > + * and do proper shift to get the right value.
> > + * Works only on Little Endian machines.
> > + */
>
> Normal kernel comment style always start with a:
>

Right, I was used to netdev ones :)

> /*
>
>
> > + for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
>
> Please avoid the pointlessly overlong line. And (just as a matter of
> personal preference) I find for loop that don't actually use a single
> iterator rather confusing. Wjy not simply:
>
> next = s.ulong[0];
> while (count >= bytes_long + mask) {
> ...
> count -= bytes_long;
> }

My fault, in a previous version it was:

next = s.ulong[0];
for (; count >= bytes_long + mask; count -= bytes_long) {

So to have a single `count` counter for the loop.

Regards,
--
per aspera ad upstream

2021-06-22 22:58:01

by Matteo Croce

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] riscv: optimized memcpy

On Tue, Jun 22, 2021 at 10:19 AM David Laight <[email protected]> wrote:
>
> From: Christoph Hellwig
> > Sent: 21 June 2021 15:27
> ...
> > > + for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
> >
> > Please avoid the pointlessly overlong line. And (just as a matter of
> > personal preference) I find for loop that don't actually use a single
> > iterator rather confusing. Wjy not simply:
> >
> > next = s.ulong[0];
> > while (count >= bytes_long + mask) {
> > ...
> > count -= bytes_long;
> > }
>
> My fist attack on long 'for' statements is just to move the
> initialisation to the previous line.
> Then make sure there is nothing in the comparison that needs
> to be calculated every iteration.
> I suspect you can subtract 'mask' from 'count'.
> Giving:
> count -= mask;
> next = s.ulong[0];
> for (; count > bytes_long; count -= bytes_long) {
>

This way we'd lose the remainder, as count is used at the end to copy
the leftover bytes.
Anyway, both bytes_long and mask are constants; I doubt their sum is
recomputed at every iteration.
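
For instance, with illustrative numbers bytes_long = 8 and mask = 7: starting
from count = 20, the current loop copies one word and leaves count = 12 for
the byte-wise tail (8 + 12 = 20). With `count -= mask` done first, the loop
still copies one word but ends with count = 5, so only 13 bytes get copied
and 7 trailing bytes are silently dropped.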

> Next is to shorten the variable names!
>
> David
>

--
per aspera ad upstream

2021-06-22 23:40:37

by Matteo Croce

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] riscv: optimized memcpy

On Tue, Jun 22, 2021 at 2:15 AM Nick Kossifidis <[email protected]> wrote:
>
> Hello Matteo and thanks for the patch,
>
On 2021-06-17 18:27, Matteo Croce wrote:
> >
> > @@ -0,0 +1,91 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * String functions optimized for hardware which doesn't
> > + * handle unaligned memory accesses efficiently.
> > + *
> > + * Copyright (C) 2021 Matteo Croce
> > + */
> > +
> > +#include <linux/types.h>
> > +#include <linux/module.h>
> > +
> > +/* Minimum size for a word copy to be convenient */
> > +#define MIN_THRESHOLD (BITS_PER_LONG / 8 * 2)
> > +
> > +/* convenience union to avoid cast between different pointer types */
> > +union types {
> > + u8 *u8;
>
> You are using a type as a name, I'd go with as_bytes/as_ulong/as_uptr
> which makes it easier for the reader to understand what you are trying
> to do.
>

Makes sense

> > + unsigned long *ulong;
> > + uintptr_t uptr;
> > +};
> > +
> > +union const_types {
> > + const u8 *u8;
> > + unsigned long *ulong;
> > +};
> > +
>
> I suggest you define those unions inside the function body, no one else
> is using them.
>

They will be used in memset(), in patch 3/3

> > +void *__memcpy(void *dest, const void *src, size_t count)
> > +{
> > + const int bytes_long = BITS_PER_LONG / 8;
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > + const int mask = bytes_long - 1;
> > + const int distance = (src - dest) & mask;
>
> Why not unsigned ints ?
>

Ok.

> > +#endif
> > + union const_types s = { .u8 = src };
> > + union types d = { .u8 = dest };
> > +
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>
> If you want to be compliant with memcpy you should check for overlapping
> regions here since "The memory areas must not overlap", and do nothing
> about it because according to POSIX this leads to undefined behavior.
> That's why recent libc implementations use memmove in any case (memcpy
> is an alias to memmove), which is the suggested approach.
>

Mmm, which arch memcpy implementation does this check?
I guess that no one is currently doing it.

> > + if (count < MIN_THRESHOLD)
> > + goto copy_remainder;
> > +
> > + /* copy a byte at time until destination is aligned */
> > + for (; count && d.uptr & mask; count--)
> > + *d.u8++ = *s.u8++;
> > +
>
> You should check for !IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) here.
>

I thought that only little-endian RISC-V machines were supported in Linux.
Should I add a BUILD_BUG_ON()?
Anyway, if this is going in generic lib/, I'll take care of the endianness.

> > + if (distance) {
> > + unsigned long last, next;
> > +
> > + /* move s backward to the previous alignment boundary */
> > + s.u8 -= distance;
>
> It'd help here to explain that since s is distance bytes ahead relative
> to d, and d reached the alignment boundary above, s is now aligned but
> the data needs to be shifted to compensate for distance, in order to do
> word-by-word copy.
>
> > +
> > + /* 32/64 bit wide copy from s to d.
> > + * d is aligned now but s is not, so read s alignment wise,
> > + * and do proper shift to get the right value.
> > + * Works only on Little Endian machines.
> > + */
>
> This commend is misleading because s is aligned or else s.ulong[0]/[1]
> below would result an unaligned access.
>

Yes, those two comments should be rephrased, merged and put above.

> > + for (next = s.ulong[0]; count >= bytes_long + mask; count -=
> > bytes_long) {
> > + last = next;
> > + next = s.ulong[1];
> > +
> > + d.ulong[0] = last >> (distance * 8) |
> > + next << ((bytes_long - distance) * 8);
> > +
> > + d.ulong++;
> > + s.ulong++;
> > + }
> > +
> > + /* restore s with the original offset */
> > + s.u8 += distance;
> > + } else
> > +#endif
> > + {
> > + /* if the source and dest lower bits are the same, do a simple
> > + * 32/64 bit wide copy.
> > + */
>
> A while() loop would make more sense here.
>

Ok.

> > + for (; count >= bytes_long; count -= bytes_long)
> > + *d.ulong++ = *s.ulong++;
> > + }
> > +
> > + /* suppress warning when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y */
> > + goto copy_remainder;
> > +
> > +copy_remainder:
> > + while (count--)
> > + *d.u8++ = *s.u8++;
> > +
> > + return dest;
> > +}
> > +EXPORT_SYMBOL(__memcpy);
> > +
> > +void *memcpy(void *dest, const void *src, size_t count) __weak
> > __alias(__memcpy);
> > +EXPORT_SYMBOL(memcpy);

Regards,
--
per aspera ad upstream

2021-06-23 09:50:34

by Nick Kossifidis

[permalink] [raw]
Subject: Re: [PATCH v3 1/3] riscv: optimized memcpy

On 2021-06-23 02:35, Matteo Croce wrote:
>>
>> If you want to be compliant with memcpy you should check for
>> overlapping
>> regions here since "The memory areas must not overlap", and do nothing
>> about it because according to POSIX this leads to undefined behavior.
>> That's why recent libc implementations use memmove in any case (memcpy
>> is an alias to memmove), which is the suggested approach.
>>
>
> Mmm which memcpy arch implementation does this check?
> I guess that noone is currently doing it.
>

Yup, because even if they did, they wouldn't know what to do about it since
POSIX leaves this as undefined behavior. So instead of using memcpy it's
suggested that people use memmove, which can handle overlapping regions,
and because we can't patch the rest of the kernel to only use memmove
(or the rest of the programs if we were a libc), the idea is to just
alias memcpy to memmove (BTW Torvalds also thinks this is a good idea:
https://bugzilla.redhat.com/show_bug.cgi?id=638477#c132).
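
For what it's worth, a minimal standalone sketch of that approach (hypothetical
names, plain C outside the kernel) is a single overlap-safe routine with the
memcpy entry point aliased to it:

#include <stddef.h>

void *xmemmove(void *dest, const void *src, size_t count)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	if (d == s || count == 0)
		return dest;

	if (d < s || d >= s + count) {
		/* no destructive overlap: copy forward */
		while (count--)
			*d++ = *s++;
	} else {
		/* dest overlaps the tail of src: copy backward */
		d += count;
		s += count;
		while (count--)
			*--d = *--s;
	}
	return dest;
}

/* memcpy becomes just another name for the overlap-safe routine */
void *xmemcpy(void *dest, const void *src, size_t count)
	__attribute__((alias("xmemmove")));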