2006-12-13 18:31:36

by Bryan O'Sullivan

Subject: [PATCH 0 of 2] Add memcpy_uncached_read, a memcpy that doesn't cache reads

Hi, Andrew -

Here's a suitably renamed uncached-read memcpy. I hope the name is now
self-explanatory.



2006-12-13 18:31:35

by Bryan O'Sullivan

Subject: [PATCH 2 of 2] IB/ipath - use memcpy_uncached_read in RDMA interrupt handler to reduce packet loss

In cases where a large incoming RDMA is being received, we have to
copy data inside the interrupt handler before we can ACK each packet.
The source buffer is written by hardware DMA, which means that the CPU
won't have it cached. We read the source only once; using normal load
instructions pollutes the dcache with useless data, reducing performance
to the point where we can lose a significant number of packets.

We use memcpy_uncached_read to avoid filling the dcache with data we
will never touch again. Avoiding the cache refill penalty lets us keep
up better with the sender, resulting in many fewer dropped packets.
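
As a concrete illustration of the pattern (a sketch only; the function
and variable names here are hypothetical, not the actual ipath receive
path):

	static void rx_copy_packet(void *dst, const void *dma_buf,
				   size_t len)
	{
		/*
		 * dma_buf was just written by the device, so it is not
		 * in the CPU cache.  Read it exactly once without
		 * polluting the dcache; dst stays cache-warm for the
		 * work that follows.
		 */
		memcpy_uncached_read(dst, dma_buf, len);
	}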

Signed-off-by: Bryan O'Sullivan <[email protected]>

diff -r e7c3b265254b -r f25d77f76998 drivers/infiniband/hw/ipath/ipath_verbs.c
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c Wed Dec 13 09:51:09 2006 -0800
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c Wed Dec 13 09:51:09 2006 -0800
@@ -167,7 +167,7 @@ void ipath_copy_sge(struct ipath_sge_sta
BUG_ON(len == 0);
if (len > length)
len = length;
- memcpy(sge->vaddr, data, len);
+ memcpy_uncached_read(sge->vaddr, data, len);
sge->vaddr += len;
sge->length -= len;
sge->sge_length -= len;

2006-12-13 18:31:55

by Bryan O'Sullivan

Subject: [PATCH 1 of 2] Add memcpy_uncached_read, a memcpy that tries to reduce cache pressure

This copy routine is memcpy-compatible, but on some architectures will use
cache-bypassing loads to avoid bringing the source data into the cache.

One case where this is useful is when a device issues a DMA to a memory
region, and the CPU must copy the DMAed data elsewhere before doing any
work with it. Since the source data is read-once, write-never from the
CPU's perspective, caching the data at those addresses can only evict
potentially useful data.

We provide an x86_64 implementation that uses the SSE non-temporal
prefetch instruction (prefetchnta) to keep the source data out of the
cache, and a generic version that falls back to plain memcpy.

Implementors for other arches should not use cache-bypassing stores to
the destination, as in most cases, the destination is accessed almost
immediately after a copy finishes.
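
As a rough illustration of that intent for other arches (a sketch only,
not part of this patch): non-temporal prefetch hints on the source,
ordinary cached stores to the destination.  GCC's __builtin_prefetch
takes (addr, rw, locality), so rw=0, locality=0 asks for a read with no
temporal locality:

	void *memcpy_uncached_read_sketch(void *to, const void *from,
					  size_t n)
	{
		const char *s = from;
		char *d = to;
		size_t i;

		for (i = 0; i < n; i++) {
			/* hint one cacheline ahead, non-temporally */
			if ((i & 63) == 0)
				__builtin_prefetch(s + i + 64, 0, 0);
			d[i] = s[i];	/* normal stores, as above */
		}
		return to;
	}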

Signed-off-by: Bryan O'Sullivan <[email protected]>

diff -r 4a0c3ede5076 -r e7c3b265254b arch/x86_64/lib/Makefile
--- a/arch/x86_64/lib/Makefile Tue Dec 12 10:43:21 2006 -0800
+++ b/arch/x86_64/lib/Makefile Wed Dec 13 09:51:09 2006 -0800
@@ -9,4 +9,5 @@ lib-y := csum-partial.o csum-copy.o csum
lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o \
+ memcpy_uncached_read.o
diff -r 4a0c3ede5076 -r e7c3b265254b arch/x86_64/lib/memcpy_uncached_read.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arch/x86_64/lib/memcpy_uncached_read.S Wed Dec 13 09:51:09 2006 -0800
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 QLogic Corporation. All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * memcpy_uncached_read - memcpy-compatible copy routine, using streaming loads
+ * @dest: destination address
+ * @src: source address (will not be cached)
+ * @count: number of bytes to copy
+ *
+ * Use streaming loads and normal stores for a special-case copy where
+ * we know we won't be reading the source again, but will be reading the
+ * destination again soon.
+ */
+ .text
+ .p2align 4,,15
+ /* rdi destination, rsi source, rdx count */
+ .globl memcpy_uncached_read
+ .type memcpy_uncached_read, @function
+memcpy_uncached_read:
+ movq %rdi, %rax
+.L5:
+ cmpq $15, %rdx
+ ja .L34
+.L3:
+ cmpl $8, %edx /* rdx is 0..15 */
+ jbe .L9
+.L6:
+ testb $8, %dl /* rdx is 3,5,6,7,9..15 */
+ je .L13
+ movq (%rsi), %rcx
+ addq $8, %rsi
+ movq %rcx, (%rdi)
+ addq $8, %rdi
+.L13:
+ testb $4, %dl
+ je .L15
+ movl (%rsi), %ecx
+ addq $4, %rsi
+ movl %ecx, (%rdi)
+ addq $4, %rdi
+.L15:
+ testb $2, %dl
+ je .L17
+ movzwl (%rsi), %ecx
+ addq $2, %rsi
+ movw %cx, (%rdi)
+ addq $2, %rdi
+.L17:
+ testb $1, %dl
+ je .L33
+.L1:
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+.L33:
+ ret
+.L34:
+ cmpq $63, %rdx /* rdx is > 15 */
+ ja .L64
+ movl $16, %ecx /* rdx is 16..63 */
+.L25:
+ movq 8(%rsi), %r8
+ movq (%rsi), %r9
+ addq %rcx, %rsi
+ movq %r8, 8(%rdi)
+ movq %r9, (%rdi)
+ addq %rcx, %rdi
+ subq %rcx, %rdx
+ cmpl %edx, %ecx /* is rdx >= 16? */
+ jbe .L25
+ jmp .L3 /* rdx is 0..15 */
+ .p2align 4,,7
+.L64:
+ movl $64, %ecx
+.L42:
+ prefetchnta 128(%rsi)
+ movq (%rsi), %r8
+ movq 8(%rsi), %r9
+ movq 16(%rsi), %r10
+ movq 24(%rsi), %r11
+ subq %rcx, %rdx
+ movq %r8, (%rdi)
+ movq 32(%rsi), %r8
+ movq %r9, 8(%rdi)
+ movq 40(%rsi), %r9
+ movq %r10, 16(%rdi)
+ movq 48(%rsi), %r10
+ movq %r11, 24(%rdi)
+ movq 56(%rsi), %r11
+ addq %rcx, %rsi
+ movq %r8, 32(%rdi)
+ movq %r9, 40(%rdi)
+ movq %r10, 48(%rdi)
+ movq %r11, 56(%rdi)
+ addq %rcx, %rdi
+ cmpq %rdx, %rcx /* is rdx >= 64? */
+ jbe .L42
+ sfence
+ orl %edx, %edx
+ je .L33
+ jmp .L5
+.L9:
+ jmp *.L12(,%rdx,8) /* rdx is 0..8 */
+ .section .rodata
+ .align 8
+ .align 4
+.L12:
+ .quad .L33
+ .quad .L1
+ .quad .L2
+ .quad .L6
+ .quad .L4
+ .quad .L6
+ .quad .L6
+ .quad .L6
+ .quad .L8
+ .text
+.L2:
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+ ret
+.L4:
+ movl (%rsi), %ecx
+ movl %ecx, (%rdi)
+ ret
+.L8:
+ movq (%rsi), %rcx
+ movq %rcx, (%rdi)
+ ret
diff -r 4a0c3ede5076 -r e7c3b265254b include/asm-x86_64/string.h
--- a/include/asm-x86_64/string.h Tue Dec 12 10:43:21 2006 -0800
+++ b/include/asm-x86_64/string.h Wed Dec 13 09:51:09 2006 -0800
@@ -39,6 +39,8 @@ extern void *__memcpy(void *to, const vo
__ret = __builtin_memcpy((dst),(src),__len); \
__ret; })

+#define __HAVE_ARCH_MEMCPY_UNCACHED_READ
+extern void *memcpy_uncached_read(void *to, const void *from, size_t len);

#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
diff -r 4a0c3ede5076 -r e7c3b265254b include/linux/string.h
--- a/include/linux/string.h Tue Dec 12 10:43:21 2006 -0800
+++ b/include/linux/string.h Wed Dec 13 09:51:09 2006 -0800
@@ -85,6 +85,9 @@ extern void * memset(void *,int,__kernel
#ifndef __HAVE_ARCH_MEMCPY
extern void * memcpy(void *,const void *,__kernel_size_t);
#endif
+#ifndef __HAVE_ARCH_MEMCPY_UNCACHED_READ
+#define memcpy_uncached_read(dest, src, count) memcpy((dest), (src), (count))
+#endif
#ifndef __HAVE_ARCH_MEMMOVE
extern void * memmove(void *,const void *,__kernel_size_t);
#endif