Provide support as far as the hardware capabilities of the x86 cpus
allow.
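
An illustrative use of the size specific operations added here (nr_foo is
a made up per cpu counter; real callers would normally go through the
generic this_cpu_cmpxchg() wrapper from the accompanying generic percpu
work, which dispatches to the macros defined below):

	static DEFINE_PER_CPU(int, nr_foo);

	int old, new;

	/* lockless increment of a per cpu statistics counter */
	do {
		old = this_cpu_read(nr_foo);
		new = old + 1;
	} while (this_cpu_cmpxchg_4(nr_foo, old, new) != old);
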
V1->V2:
- Mark %rdx as clobbered by cmpxchg16b
- Provide emulation of cmpxchg16b for early AMD processors
Signed-off-by: Christoph Lameter <[email protected]>
---
arch/x86/Kconfig.cpu | 3
arch/x86/include/asm/percpu.h | 183 +++++++++++++++++++++++++++++++++++++++++-
arch/x86/lib/Makefile | 1
arch/x86/lib/cmpxchg16b_emu.S | 55 ++++++++++++
4 files changed, 238 insertions(+), 4 deletions(-)
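
Illustration of the intended use of the double word operation (all names
below are made up for the example; as the percpu_cmpxchg16b macro further
down assumes, the first argument is the per cpu address of a 16 byte
aligned pair):

	struct pcp_pair {
		void *ptr;
		unsigned long seq;
	} __attribute__((__aligned__(16)));	/* cmpxchg16b needs 16 byte alignment */

	static DEFINE_PER_CPU(struct pcp_pair, foo_pair);

	void *old_ptr, *new_ptr;
	unsigned long old_seq, new_seq;
	char ok;

	/* old_* / new_* filled in by the caller */
	ok = this_cpu_cmpxchg_double_8(&foo_pair,
			old_ptr, old_seq, new_ptr, new_seq);
	/* ok is 1 if both words were replaced, 0 otherwise */
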
Index: linux-2.6/arch/x86/lib/Makefile
===================================================================
--- linux-2.6.orig/arch/x86/lib/Makefile 2010-11-30 15:14:05.000000000 -0600
+++ linux-2.6/arch/x86/lib/Makefile 2010-12-01 09:50:55.000000000 -0600
@@ -42,4 +42,5 @@ else
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+ lib-y += cmpxchg16b_emu.o
endif
Index: linux-2.6/arch/x86/lib/cmpxchg16b_emu.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/arch/x86/lib/cmpxchg16b_emu.S 2010-12-01 09:50:55.000000000 -0600
@@ -0,0 +1,55 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/dwarf2.h>
+
+
+.text
+
+/*
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ * %al  : set to 1 if the exchange was performed, 0 otherwise
+ */
+ENTRY(cmpxchg16b_local_emu)
+CFI_STARTPROC
+
+#
+# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in
+# %al, not via the ZF. The caller will access %al to get the result.
+#
+
+ pushf
+ cli
+
+ cmpq %gs:(%rsi), %rax
+ jne not_same
+ cmpq %gs:8(%rsi), %rdx
+ jne not_same
+
+ movq %rbx, %gs:(%rsi)
+ movq %rcx, %gs:8(%rsi)
+
+ popf
+ mov $1, %al
+ ret
+
+ not_same:
+ popf
+ xor %al,%al
+ ret
+
+CFI_ENDPROC
+ENDPROC(cmpxchg16b_local_emu)
Index: linux-2.6/arch/x86/include/asm/percpu.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/percpu.h 2010-12-01 09:47:44.000000000 -0600
+++ linux-2.6/arch/x86/include/asm/percpu.h 2010-12-01 09:50:55.000000000 -0600
@@ -212,6 +212,83 @@ do { \
ret__; \
})
+/*
+ * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
+ * full lock semantics even though they are not needed.
+ */
+#define percpu_xchg_op(var, nval) \
+({ \
+ typeof(var) __ret; \
+ typeof(var) __new = (nval); \
+ switch (sizeof(var)) { \
+	case 1:							\
+		asm("xchgb %0, "__percpu_arg(1)			\
+			    : "=q" (__ret), "+m" (var)		\
+			    : "0" (__new)			\
+			    : "memory");			\
+		break;						\
+	case 2:							\
+		asm("xchgw %0, "__percpu_arg(1)			\
+			    : "=r" (__ret), "+m" (var)		\
+			    : "0" (__new)			\
+			    : "memory");			\
+		break;						\
+	case 4:							\
+		asm("xchgl %0, "__percpu_arg(1)			\
+			    : "=r" (__ret), "+m" (var)		\
+			    : "0" (__new)			\
+			    : "memory");			\
+		break;						\
+	case 8:							\
+		asm("xchgq %0, "__percpu_arg(1)			\
+			    : "=r" (__ret), "+m" (var)		\
+			    : "0" (__new)			\
+			    : "memory");			\
+		break;						\
+ default: __bad_percpu_size(); \
+ } \
+ __ret; \
+})
+
+/*
+ * cmpxchg has no such implied lock semantics. As a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval) \
+({ \
+ typeof(var) __ret; \
+ typeof(var) __old = (oval); \
+ typeof(var) __new = (nval); \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm("cmpxchgb %2, "__percpu_arg(1) \
+ : "=a" (__ret), "+m" (var) \
+ : "q" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm("cmpxchgw %2, "__percpu_arg(1) \
+ : "=a" (__ret), "+m" (var) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm("cmpxchgl %2, "__percpu_arg(1) \
+ : "=a" (__ret), "+m" (var) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm("cmpxchgq %2, "__percpu_arg(1) \
+ : "=a" (__ret), "+m" (var) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ __ret; \
+})
+
#define percpu_from_op(op, var, constraint) \
({ \
typeof(var) pfo_ret__; \
@@ -335,14 +412,76 @@ do { \
#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+#define __this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+#define __this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+#define __this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
+
#ifndef CONFIG_M386
#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
-#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
-#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
-#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
-#endif
+#define this_cpu_add_return_1(pcp, val) percpu_add_return_op((pcp), val)
+#define this_cpu_add_return_2(pcp, val) percpu_add_return_op((pcp), val)
+#define this_cpu_add_return_4(pcp, val) percpu_add_return_op((pcp), val)
+
+#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#endif /* !CONFIG_M386 */
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define percpu_cmpxchg8b_double(pcp, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ typeof(o1) __o1 = o1; \
+ typeof(o1) __n1 = n1; \
+ typeof(o2) __o2 = o2; \
+ typeof(o2) __n2 = n2; \
+ typeof(o2) __dummy = n2; \
+ asm("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
+ : "=a"(__ret), "=m" (*pcp), "=d"(__dummy) \
+ : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \
+ __ret; \
+})
+
+#define __this_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2) percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2) percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_4(pcp, o1, o2, n1, n2) percpu_cmpxchg8b_double((pcp), o1, o2, n1, n2)
+#endif /* CONFIG_X86_CMPXCHG64 */
+
+#ifndef CONFIG_X86_64
+#ifdef CONFIG_X86_CMPXCHG64
+/* We can support an 8 byte cmpxchg with a special instruction on 32 bit */
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)			\
+({									\
+	typeof(pcp) __ret;						\
+	typeof(pcp) __old = (oval);					\
+	typeof(pcp) __new = (nval);					\
+	asm("cmpxchg8b "__percpu_arg(1)				\
+		    : "=A" (__ret), "+m" (pcp)				\
+		    : "b" ((u32)__new), "c" ((u32)(__new >> 32)), "0" (__old) \
+		    : "memory");					\
+	__ret;								\
+})
+
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	__this_cpu_cmpxchg_8(pcp, oval, nval)
+
+#endif /* CONFIG_X86_CMPXCHG64 */
+#endif /* !CONFIG_X86_64 */
+
/*
* Per cpu atomic 64 bit operations are only available under 64 bit.
* 32 bit must fall back to generic operations.
@@ -370,6 +509,42 @@ do { \
#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+
+#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op((pcp), oval, nval)
+
+/*
+ * Pretty complex macro to generate the cmpxchg16b instruction. The instruction
+ * is not supported on early AMD64 processors, so we must be able to emulate
+ * it in software. The address used in the cmpxchg16b instruction must be
+ * aligned to a 16 byte boundary.
+ */
+#define percpu_cmpxchg16b(pcp, o1, o2, n1, n2) \
+({ \
+ char __ret; \
+ typeof(o1) __o1 = o1; \
+ typeof(o1) __n1 = n1; \
+ typeof(o2) __o2 = o2; \
+ typeof(o2) __n2 = n2; \
+ typeof(o2) __dummy; \
+ VM_BUG_ON(((unsigned long)pcp) % 16); \
+ alternative_io("call cmpxchg16b_local_emu\n\t" P6_NOP4, \
+ "cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t", \
+ X86_FEATURE_CX16, \
+ ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
+ "S" (pcp), "b"(__n1), "c"(__n2), \
+ "a"(__o1), "d"(__o2)); \
+ __ret; \
+})
+
+#define __this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2) percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2) percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_8(pcp, o1, o2, n1, n2) percpu_cmpxchg16b((pcp), o1, o2, n1, n2)
+
#endif
/* This is not atomic against other CPUs -- CPU preemption needs to be off */
Index: linux-2.6/arch/x86/Kconfig.cpu
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig.cpu 2010-12-01 10:04:37.000000000 -0600
+++ linux-2.6/arch/x86/Kconfig.cpu 2010-12-01 10:05:18.000000000 -0600
@@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT
config X86_CMPXCHG
def_bool X86_64 || (X86_32 && !M386)
+config CMPXCHG_LOCAL
+ def_bool X86_64 || (X86_32 && !M386)
+
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
On 12/02/2010 11:53 PM, Christoph Lameter wrote:
> Provide support as far as the hardware capabilities of the x86 cpus
> allow.
>
>
>
> +/*
> + * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
> + * full lock semantics even though they are not needed.
> + */
Perhaps we can use cmpxchg instead of xchg to avoid this? costs one more
instruction but may be worth it.
--
error compiling committee.c: too many arguments to function
On Mon, 6 Dec 2010, Avi Kivity wrote:
> On 12/02/2010 11:53 PM, Christoph Lameter wrote:
> > Provide support as far as the hardware capabilities of the x86 cpus
> > allow.
> >
> >
> >
> > +/*
> > + * Beware: xchg on x86 has an implied lock prefix. There will be the cost
> > of
> > + * full lock semantics even though they are not needed.
> > + */
>
> Perhaps we can use cmpxchg instead of xchg to avoid this? costs one more
> instruction but may be worth it.
Hmmm... Maybe good since I also need an xchg_double. And xchg_double can
only be realized with cmpxchg16b. Using cmpxchg would make it consistent.
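
For reference, with the cmpxchg16b based primitive from the patch an
xchg_double would presumably end up as a retry loop along these lines
(sketch only; foo_pair and the old_*/new_* variables are invented
pointer/sequence pair names, as in the patch notes above):

	do {
		old_ptr = this_cpu_read(foo_pair.ptr);
		old_seq = this_cpu_read(foo_pair.seq);
	} while (!this_cpu_cmpxchg_double_8(&foo_pair,
			old_ptr, old_seq, new_ptr, new_seq));
	/* old_ptr/old_seq now hold the pair that was replaced */
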
On 12/06/2010 07:35 PM, Christoph Lameter wrote:
> On Mon, 6 Dec 2010, Avi Kivity wrote:
>
> > On 12/02/2010 11:53 PM, Christoph Lameter wrote:
> > > Provide support as far as the hardware capabilities of the x86 cpus
> > > allow.
> > >
> > >
> > >
> > > +/*
> > > + * Beware: xchg on x86 has an implied lock prefix. There will be the cost
> > > of
> > > + * full lock semantics even though they are not needed.
> > > + */
> >
> > Perhaps we can use cmpxchg instead of xchg to avoid this? costs one more
> > instruction but may be worth it.
>
> Hmmm... Maybe good since I also need an xchg_double. And xchg_double can
> only be realized with cmpxchg16b. Using cmpxchg would make it consistent.
I don't think we need to worry about consistency; this is an
implementation, not an interface.
We have three choices:
	xchg %1, %0

atomic, one instruction

1:	cmpxchg %2, %0
	jnz 1b

two non-atomic instructions, potential mispredicted jump, extra clobber
(%1 == "=a")

	mov %0, %1
1:	cmpxchg %2, %0
	jnz 1b

three non-atomic instructions, no mispredict, extra clobber
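
For concreteness, the third variant shaped like the patch's
percpu_cmpxchg_op (4 byte case only, untested sketch; __new is the
macro-local copy of the new value as in percpu_xchg_op, and percpu_read()
provides the initial load):

	typeof(var) __ret, __old = percpu_read(var);	/* the initial mov */
	for (;;) {
		asm("cmpxchgl %2, "__percpu_arg(1)
			    : "=a" (__ret), "+m" (var)
			    : "r" (__new), "0" (__old)
			    : "memory");
		if (__ret == __old)	/* exchange happened */
			break;
		__old = __ret;		/* cmpxchg left the current value in %eax */
	}
	__ret;
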
--
error compiling committee.c: too many arguments to function