This patchset improves the atomic64_t functions on x86-32.
It also includes a testsuite that has been used to test this functionality
and can test any atomic64_t implementation.
It offers the following improvements:
1. Better code due to hand-written assembly (e.g. use of the ZF flag)
2. All atomic64 functions implemented, efficiently
3. Support for 386/486, thanks to the ability to select either the
   cmpxchg8b-based implementation or the generic implementation
4. Use of SSE, if available, for atomic64_read and atomic64_set
The first patches add functionality to the alternatives system to support
the new atomic64_t code.
A patch that improves cmpxchg64() using that functionality is also included.
To test this code, enable CONFIG_ATOMIC64_SELFTEST, compile for 386 and
boot three times: normally, with "clearcpuid=25" (no SSE) and with
"clearcpuid=8 clearcpuid=25" (neither cmpxchg8b nor SSE).
Each boot should print a message stating that the atomic64 test passed,
along with the configuration that was exercised.
Signed-off-by: Luca Barbieri <[email protected]>
This patch modifies the x86 alternative macros to allow more than one
alternative code sequence.
This is done by simply emitting multiple alternative patch entries for the
same original code; they are applied in sequence, with later entries
overwriting the effect of earlier ones.
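As an illustration only (not something this patch adds), a call site could
then choose between three sequences; since later entries overwrite earlier
ones, the X86_FEATURE_XMM variant wins when both features are present
(generic_impl, cx8_impl and sse_impl are placeholder names):

	/* oldinstr is what 386/486 will run; each matching feature patches over it */
	asm volatile(ALTERNATIVE3("call generic_impl",
				  "call cx8_impl", X86_FEATURE_CX8,
				  "call sse_impl", X86_FEATURE_XMM)
		     : : : "memory");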
Signed-off-by: Luca Barbieri <[email protected]>
---
arch/x86/include/asm/alternative.h | 27 ++++++++++++++++++++-------
1 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 69b74a7..42d41ac 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -73,23 +73,36 @@ static inline void alternatives_smp_module_del(struct module *mod) {}
static inline void alternatives_smp_switch(int smp) {}
#endif /* CONFIG_SMP */
-/* alternative assembly primitive: */
-#define ALTERNATIVE(oldinstr, newinstr, feature) \
- \
- "661:\n\t" oldinstr "\n662:\n" \
+#define ALTERNATIVE_PATCH(oldstart, oldend, newinstr, feature) \
".section .altinstructions,\"a\"\n" \
_ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
+ _ASM_PTR oldstart "\n" /* label */ \
_ASM_PTR "663f\n" /* new instruction */ \
" .byte " __stringify(feature) "\n" /* feature bit */ \
- " .byte 662b-661b\n" /* sourcelen */ \
+ " .byte (" oldend ")-(" oldstart ")\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
- " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
+ " .byte 0xff + (664f-663f) - ((" oldend ")-(" oldstart "))\n" /* rlen <= slen */ \
".previous\n" \
".section .altinstr_replacement, \"ax\"\n" \
"663:\n\t" newinstr "\n664:\n" /* replacement */ \
".previous"
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, newinstr, feature) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ALTERNATIVE_PATCH("661b", "662b", newinstr, feature)
+
+#define ALTERNATIVE3(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ALTERNATIVE_PATCH("661b", "662b", newinstr1, feature1) "\n" \
+ ALTERNATIVE_PATCH("661b", "662b", newinstr2, feature2)
+
+#define ALTERNATIVE4(oldinstr, newinstr1, feature1, newinstr2, feature2, newinstr3, feature3) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ALTERNATIVE_PATCH("661b", "662b", newinstr1, feature1) "\n" \
+ ALTERNATIVE_PATCH("661b", "662b", newinstr2, feature2) "\n" \
+ ALTERNATIVE_PATCH("661b", "662b", newinstr3, feature3)
+
/*
* Alternative instructions for different CPU types or capabilities.
*
--
1.6.6.1.476.g01ddb
This patch makes atomic64 use either the generic implementation or the
rewritten cmpxchg8b one just introduced, by emitting a "call" to one of
them and using the alternatives system to dynamically switch between the
calls.
This allows atomic64_t to be used on 386/486, which lack cmpxchg8b.
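For example, with the ATOMIC64_ALTERNATIVE() macro introduced below, a
kernel built without CONFIG_X86_CMPXCHG64 effectively turns the
atomic64_xchg() wrapper into the following (condensed from the diff, shown
only as a sketch):

	asm volatile(ALTERNATIVE("call generic_atomic64_xchg_cx8call",
				 "call cx8_atomic64_xchg_cx8call",
				 X86_FEATURE_CX8)
		     : "=A" (o), "+b" (low), "+c" (high)
		     : "S" (v)
		     : "memory");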
Signed-off-by: Luca Barbieri <[email protected]>
---
arch/x86/include/asm/atomic_32.h | 56 +++++++++++++++++-----
arch/x86/lib/atomic64_32.c | 94 ++++++++++++++++++++++++++++++++++++++
2 files changed, 137 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 50a6d4c..1ab431c 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -279,6 +279,12 @@ long long cx8_atomic64_dec_if_positive_cx8call(atomic64_t *v);
int cx8_atomic64_inc_not_zero_cx8call(atomic64_t *v);
int cx8_atomic64_add_unless(atomic64_t *v, long long a, long long u);
+#ifdef CONFIG_X86_CMPXCHG64
+#define ATOMIC64_ALTERNATIVE(f) "call cx8_atomic64_" #f
+#else
+#define ATOMIC64_ALTERNATIVE(f) ALTERNATIVE("call generic_atomic64_" #f, "call cx8_atomic64_" #f, X86_FEATURE_CX8)
+#endif
+
/**
* atomic64_cmpxchg - cmpxchg atomic64 variable
* @p: pointer to type atomic64_t
@@ -288,11 +294,25 @@ int cx8_atomic64_add_unless(atomic64_t *v, long long a, long long u);
* Atomically sets @v to @n if it was equal to @o and returns
* the old value.
*/
+
static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
{
- asm volatile(LOCK_PREFIX "cmpxchg8b %1"
- : "+A" (o)
- : "m" (v->counter), "b" ((unsigned)n), "c" ((unsigned)(n >> 32))
+ unsigned high = (unsigned)(n >> 32);
+ unsigned low = (unsigned)n;
+ asm volatile(
+#ifdef CONFIG_X86_CMPXCHG64
+ LOCK_PREFIX "cmpxchg8b (%%esi)"
+#else
+#ifdef CONFIG_SMP
+ LOCK_PREFIX_ALTERNATIVE_PATCH "\n\t"
+ ALTERNATIVE("call generic_atomic64_cmpxchg_cx8call", "lock; cmpxchg8b (%%esi)\n\tnop", X86_FEATURE_CX8)
+#else
+ ALTERNATIVE("call generic_atomic64_cmpxchg_cx8call", "cmpxchg8b (%%esi)\n\t" ASM_NOP2, X86_FEATURE_CX8)
+#endif
+#endif
+ : "+A" (o), "+b" (low), "+c" (high)
+ : "S" (v)
+ : "memory"
);
return o;
}
@@ -310,7 +330,7 @@ static inline long long atomic64_xchg(atomic64_t *v, long long n)
long long o;
unsigned high = (unsigned)(n >> 32);
unsigned low = (unsigned)n;
- asm volatile("call cx8_atomic64_xchg_cx8call"
+ asm volatile(ATOMIC64_ALTERNATIVE(xchg_cx8call)
: "=A" (o), "+b" (low), "+c" (high)
: "S" (v)
: "memory"
@@ -329,7 +349,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
{
unsigned high = (unsigned)(i >> 32);
unsigned low = (unsigned)i;
- asm volatile("call cx8_atomic64_set_cx8call"
+ asm volatile(ATOMIC64_ALTERNATIVE(set_cx8call)
: "+b" (low), "+c" (high)
: "S" (v)
: "eax", "edx", "memory"
@@ -345,7 +365,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
static inline long long atomic64_read(atomic64_t *v)
{
long long r;
- asm volatile("call cx8_atomic64_read_cx8call"
+ asm volatile(ATOMIC64_ALTERNATIVE(read_cx8call)
: "=A" (r), "+c" (v)
: : "memory"
);
@@ -361,7 +381,11 @@ static inline long long atomic64_read(atomic64_t *v)
*/
static inline long long atomic64_add_return(long long a, atomic64_t *v)
{
- return cx8_atomic64_add_return(a, v);
+ asm volatile(ATOMIC64_ALTERNATIVE(add_return)
+ : "+A" (a), "+c" (v)
+ : : "memory"
+ );
+ return a;
}
/*
@@ -369,13 +393,17 @@ static inline long long atomic64_add_return(long long a, atomic64_t *v)
*/
static inline long long atomic64_sub_return(long long a, atomic64_t *v)
{
- return cx8_atomic64_sub_return(a, v);
+ asm volatile(ATOMIC64_ALTERNATIVE(sub_return)
+ : "+A" (a), "+c" (v)
+ : : "memory"
+ );
+ return a;
}
static inline long long atomic64_inc_return(atomic64_t *v)
{
long long a;
- asm volatile("call cx8_atomic64_inc_return_cx8call"
+ asm volatile(ATOMIC64_ALTERNATIVE(inc_return_cx8call)
: "=A" (a)
: "S" (v)
: "memory", "ecx"
@@ -386,7 +414,7 @@ static inline long long atomic64_inc_return(atomic64_t *v)
static inline long long atomic64_dec_return(atomic64_t *v)
{
long long a;
- asm volatile("call cx8_atomic64_dec_return_cx8call"
+ asm volatile(ATOMIC64_ALTERNATIVE(dec_return_cx8call)
: "=A" (a)
: "S" (v)
: "memory", "ecx"
@@ -402,7 +430,7 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
asm volatile(
"pushl %4\n\t"
"pushl %3\n\t"
- "call cx8_atomic64_add_unless\n\t"
+ ATOMIC64_ALTERNATIVE(add_unless) "\n\t"
"addl $8, %%esp\n\t"
: "+a" (r), "+d" (low), "+c" (high)
: "g" ((unsigned)u), "g" ((unsigned)(u >> 32))
@@ -413,7 +441,7 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
static inline long long atomic64_dec_if_positive(atomic64_t *v)
{
long long r;
- asm volatile("call cx8_atomic64_dec_if_positive_cx8call"
+ asm volatile(ATOMIC64_ALTERNATIVE(dec_if_positive_cx8call)
: "=A" (r)
: "S" (v)
: "ecx", "memory"
@@ -424,7 +452,7 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
static inline int atomic64_inc_not_zero(atomic64_t *v)
{
int r;
- asm volatile("call cx8_atomic64_inc_not_zero_cx8call"
+ asm volatile(ATOMIC64_ALTERNATIVE(inc_not_zero_cx8call)
: "=a" (r)
: "S" (v)
: "ecx", "edx", "memory"
@@ -441,5 +469,7 @@ static inline int atomic64_inc_not_zero(atomic64_t *v)
#define atomic64_dec(v) atomic64_dec_return(v)
#define atomic64_dec_and_test(v) (atomic64_dec_return((v)) == 0)
+#undef ATOMIC64_ALTERNATIVE
+
#include <asm-generic/atomic-long.h>
#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 18544b3..b7edbb3 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -27,3 +27,97 @@ EXPORT_SYMBOL(cx8_atomic64_inc_return_cx8call);
EXPORT_SYMBOL(cx8_atomic64_dec_return_cx8call);
EXPORT_SYMBOL(cx8_atomic64_dec_if_positive_cx8call);
EXPORT_SYMBOL(cx8_atomic64_inc_not_zero_cx8call);
+
+
+#ifndef CONFIG_X86_CMPXCHG64
+
+/* inline these since we expose variants with a different calling convention */
+#define generic_atomic64_read static inline generic_atomic64_read
+#define generic_atomic64_set static inline generic_atomic64_set
+#define generic_atomic64_xchg static inline generic_atomic64_xchg
+#define generic_atomic64_cmpxchg static inline generic_atomic64_cmpxchg
+#define generic_atomic64_dec_if_positive static inline generic_atomic64_dec_if_positive
+#include <asm-generic/atomic64-impl.h>
+#undef generic_atomic64_read
+#undef generic_atomic64_set
+#undef generic_atomic64_xchg
+#undef generic_atomic64_cmpxchg
+#undef generic_atomic64_dec_if_positive
+
+union generic_atomic64_lock generic_atomic64_lock[ATOMIC64_NR_LOCKS] __cacheline_aligned_in_smp;
+pure_initcall(init_generic_atomic64_lock);
+
+EXPORT_SYMBOL(generic_atomic64_add);
+EXPORT_SYMBOL(generic_atomic64_add_return);
+EXPORT_SYMBOL(generic_atomic64_sub);
+EXPORT_SYMBOL(generic_atomic64_sub_return);
+EXPORT_SYMBOL(generic_atomic64_add_unless);
+
+long long generic_atomic64_read_cx8call(long long dummy, const atomic64_t *v)
+{
+ return generic_atomic64_read(v);
+}
+EXPORT_SYMBOL(generic_atomic64_read_cx8call);
+
+#endif /* CONFIG_X86_CMPXCHG64 */
+
+register unsigned low asm("ebx");
+register atomic64_t *v asm("esi");
+
+#ifndef CONFIG_X86_CMPXCHG64
+
+long long generic_atomic64_cmpxchg_cx8call(long long o, unsigned high)
+{
+ return generic_atomic64_cmpxchg(v, o, ((long long)high << 32) | low);
+}
+EXPORT_SYMBOL(generic_atomic64_cmpxchg_cx8call);
+
+long long generic_atomic64_xchg_cx8call(long long dummy, unsigned high)
+{
+ return generic_atomic64_xchg(v, ((long long)high << 32) | low);
+}
+EXPORT_SYMBOL(generic_atomic64_xchg_cx8call);
+
+void generic_atomic64_set_cx8call(long long dummy, unsigned high)
+{
+ return generic_atomic64_set(v, ((long long)high << 32) | low);
+}
+EXPORT_SYMBOL(generic_atomic64_set_cx8call);
+
+long long generic_atomic64_inc_return_cx8call(void)
+{
+ return generic_atomic64_add_return(1, v);
+}
+EXPORT_SYMBOL(generic_atomic64_inc_return_cx8call);
+
+long long generic_atomic64_dec_return_cx8call(void)
+{
+ return generic_atomic64_sub_return(1, v);
+}
+EXPORT_SYMBOL(generic_atomic64_dec_return_cx8call);
+
+void generic_atomic64_inc_cx8call(void)
+{
+ return generic_atomic64_add(1, v);
+}
+EXPORT_SYMBOL(generic_atomic64_inc_cx8call);
+
+void generic_atomic64_dec_cx8call(void)
+{
+ return generic_atomic64_sub(1, v);
+}
+EXPORT_SYMBOL(generic_atomic64_dec_cx8call);
+
+long long generic_atomic64_dec_if_positive_cx8call(void)
+{
+ return generic_atomic64_dec_if_positive(v);
+}
+EXPORT_SYMBOL(generic_atomic64_dec_if_positive_cx8call);
+
+int generic_atomic64_inc_not_zero_cx8call(void)
+{
+ return generic_atomic64_add_unless(v, 1LL, 0LL);
+}
+EXPORT_SYMBOL(generic_atomic64_inc_not_zero_cx8call);
+
+#endif /* CONFIG_X86_CMPXCHG64 */
--
1.6.6.1.476.g01ddb
No known CPU has SSE but lacks cmpxchg8b, and future ones are very
unlikely to.
However, should such a CPU exist, we would generate code that runs but is
not atomic, so panic instead.
---
arch/x86/lib/atomic64_32.c | 11 +++++++++++
1 files changed, 11 insertions(+), 0 deletions(-)
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 9ff8589..35dbd12 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -47,6 +47,17 @@ EXPORT_SYMBOL(cx8_atomic64_inc_not_zero_cx8call);
union generic_atomic64_lock generic_atomic64_lock[ATOMIC64_NR_LOCKS] __cacheline_aligned_in_smp;
pure_initcall(init_generic_atomic64_lock);
+static int __init panic_on_sse_without_cx8(void)
+{
+ /* no known CPU should do this, and we generate non-atomic code in this case
+ * because we mix the generic spinlock-reliant code and the SSE code
+ */
+ if (!boot_cpu_has(X86_FEATURE_CX8) && boot_cpu_has(X86_FEATURE_XMM))
+ panic("CPUs without CX8 but with SSE are not supported\nBoot with clearcpuid=25 and report your CPU model to [email protected]\n");
+ return 0;
+}
+core_initcall(panic_on_sse_without_cx8);
+
EXPORT_SYMBOL(generic_atomic64_add);
EXPORT_SYMBOL(generic_atomic64_add_return);
EXPORT_SYMBOL(generic_atomic64_sub);
--
1.6.6.1.476.g01ddb
This patch moves the generic implementation of the atomic64 functions
from lib/atomic64.c to include/asm-generic/atomic64-impl.h.
This file will be reused by x86-32 for 386/486 support.
Signed-off-by: Luca Barbieri <[email protected]>
---
include/asm-generic/atomic64-impl.h | 167 ++++++++++++++++++++++++++++++++
include/asm-generic/atomic64.h | 31 ++++--
lib/atomic64.c | 183 +++--------------------------------
3 files changed, 203 insertions(+), 178 deletions(-)
create mode 100644 include/asm-generic/atomic64-impl.h
diff --git a/include/asm-generic/atomic64-impl.h b/include/asm-generic/atomic64-impl.h
new file mode 100644
index 0000000..a0a76f4
--- /dev/null
+++ b/include/asm-generic/atomic64-impl.h
@@ -0,0 +1,167 @@
+#ifndef _ASM_GENERIC_ATOMIC64_IMPL_H
+#define _ASM_GENERIC_ATOMIC64_IMPL_H
+
+#include <linux/spinlock.h>
+
+/*
+ * We use a hashed array of spinlocks to provide exclusive access
+ * to each atomic64_t variable. Since this is expected to be used on
+ * systems with small numbers of CPUs (<= 4 or so), we use a
+ * relatively small array of 16 spinlocks to avoid wasting too much
+ * memory on the spinlock array.
+ */
+#ifndef ATOMIC64_NR_LOCKS
+#define ATOMIC64_NR_LOCKS 16
+#endif
+
+/*
+ * Ensure each lock is in a separate cacheline.
+ */
+union generic_atomic64_lock {
+ spinlock_t lock;
+ char pad[L1_CACHE_BYTES];
+};
+
+extern union generic_atomic64_lock generic_atomic64_lock[ATOMIC64_NR_LOCKS] __cacheline_aligned_in_smp;
+
+static inline int init_generic_atomic64_lock(void)
+{
+ int i;
+
+ for (i = 0; i < ATOMIC64_NR_LOCKS; ++i)
+ spin_lock_init(&generic_atomic64_lock[i].lock);
+ return 0;
+}
+
+static inline spinlock_t *generic_atomic64_lock_addr(const atomic64_t *v)
+{
+ unsigned long addr = (unsigned long) v;
+
+ addr >>= L1_CACHE_SHIFT;
+ addr ^= (addr >> 8) ^ (addr >> 16);
+ return &generic_atomic64_lock[addr & (ATOMIC64_NR_LOCKS - 1)].lock;
+}
+
+long long generic_atomic64_read(const atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+void generic_atomic64_set(atomic64_t *v, long long i)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+
+ spin_lock_irqsave(lock, flags);
+ v->counter = i;
+ spin_unlock_irqrestore(lock, flags);
+}
+
+void generic_atomic64_add(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+
+ spin_lock_irqsave(lock, flags);
+ v->counter += a;
+ spin_unlock_irqrestore(lock, flags);
+}
+
+long long generic_atomic64_add_return(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter += a;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+void generic_atomic64_sub(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+
+ spin_lock_irqsave(lock, flags);
+ v->counter -= a;
+ spin_unlock_irqrestore(lock, flags);
+}
+
+long long generic_atomic64_sub_return(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter -= a;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+long long generic_atomic64_dec_if_positive(atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter - 1;
+ if (val >= 0)
+ v->counter = val;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+long long generic_atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter;
+ if (val == o)
+ v->counter = n;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+long long generic_atomic64_xchg(atomic64_t *v, long long new)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter;
+ v->counter = new;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+int generic_atomic64_add_unless(atomic64_t *v, long long a, long long u)
+{
+ unsigned long flags;
+ spinlock_t *lock = generic_atomic64_lock_addr(v);
+ int ret = 1;
+
+ spin_lock_irqsave(lock, flags);
+ if (v->counter != u) {
+ v->counter += a;
+ ret = 0;
+ }
+ spin_unlock_irqrestore(lock, flags);
+ return ret;
+}
+
+#endif /* _ASM_GENERIC_ATOMIC64_IMPL_H */
diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
index b18ce4f..d6775fd 100644
--- a/include/asm-generic/atomic64.h
+++ b/include/asm-generic/atomic64.h
@@ -18,16 +18,27 @@ typedef struct {
#define ATOMIC64_INIT(i) { (i) }
-extern long long atomic64_read(const atomic64_t *v);
-extern void atomic64_set(atomic64_t *v, long long i);
-extern void atomic64_add(long long a, atomic64_t *v);
-extern long long atomic64_add_return(long long a, atomic64_t *v);
-extern void atomic64_sub(long long a, atomic64_t *v);
-extern long long atomic64_sub_return(long long a, atomic64_t *v);
-extern long long atomic64_dec_if_positive(atomic64_t *v);
-extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
-extern long long atomic64_xchg(atomic64_t *v, long long new);
-extern int atomic64_add_unless(atomic64_t *v, long long a, long long u);
+extern long long generic_atomic64_read(const atomic64_t *v);
+extern void generic_atomic64_set(atomic64_t *v, long long i);
+extern void generic_atomic64_add(long long a, atomic64_t *v);
+extern long long generic_atomic64_add_return(long long a, atomic64_t *v);
+extern void generic_atomic64_sub(long long a, atomic64_t *v);
+extern long long generic_atomic64_sub_return(long long a, atomic64_t *v);
+extern long long generic_atomic64_dec_if_positive(atomic64_t *v);
+extern long long generic_atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
+extern long long generic_atomic64_xchg(atomic64_t *v, long long new);
+extern int generic_atomic64_add_unless(atomic64_t *v, long long a, long long u);
+
+#define atomic64_read generic_atomic64_read
+#define atomic64_set generic_atomic64_set
+#define atomic64_add generic_atomic64_add
+#define atomic64_add_return generic_atomic64_add_return
+#define atomic64_sub generic_atomic64_sub
+#define atomic64_sub_return generic_atomic64_sub_return
+#define atomic64_dec_if_positive generic_atomic64_dec_if_positive
+#define atomic64_cmpxchg generic_atomic64_cmpxchg
+#define atomic64_xchg generic_atomic64_xchg
+#define atomic64_add_unless generic_atomic64_add_unless
#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
#define atomic64_inc(v) atomic64_add(1LL, (v))
diff --git a/lib/atomic64.c b/lib/atomic64.c
index 8bee16e..2565f63 100644
--- a/lib/atomic64.c
+++ b/lib/atomic64.c
@@ -16,171 +16,18 @@
#include <linux/module.h>
#include <asm/atomic.h>
-/*
- * We use a hashed array of spinlocks to provide exclusive access
- * to each atomic64_t variable. Since this is expected to used on
- * systems with small numbers of CPUs (<= 4 or so), we use a
- * relatively small array of 16 spinlocks to avoid wasting too much
- * memory on the spinlock array.
- */
-#define NR_LOCKS 16
-
-/*
- * Ensure each lock is in a separate cacheline.
- */
-static union {
- spinlock_t lock;
- char pad[L1_CACHE_BYTES];
-} atomic64_lock[NR_LOCKS] __cacheline_aligned_in_smp;
-
-static inline spinlock_t *lock_addr(const atomic64_t *v)
-{
- unsigned long addr = (unsigned long) v;
-
- addr >>= L1_CACHE_SHIFT;
- addr ^= (addr >> 8) ^ (addr >> 16);
- return &atomic64_lock[addr & (NR_LOCKS - 1)].lock;
-}
-
-long long atomic64_read(const atomic64_t *v)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
- long long val;
-
- spin_lock_irqsave(lock, flags);
- val = v->counter;
- spin_unlock_irqrestore(lock, flags);
- return val;
-}
-EXPORT_SYMBOL(atomic64_read);
-
-void atomic64_set(atomic64_t *v, long long i)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
-
- spin_lock_irqsave(lock, flags);
- v->counter = i;
- spin_unlock_irqrestore(lock, flags);
-}
-EXPORT_SYMBOL(atomic64_set);
-
-void atomic64_add(long long a, atomic64_t *v)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
-
- spin_lock_irqsave(lock, flags);
- v->counter += a;
- spin_unlock_irqrestore(lock, flags);
-}
-EXPORT_SYMBOL(atomic64_add);
-
-long long atomic64_add_return(long long a, atomic64_t *v)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
- long long val;
-
- spin_lock_irqsave(lock, flags);
- val = v->counter += a;
- spin_unlock_irqrestore(lock, flags);
- return val;
-}
-EXPORT_SYMBOL(atomic64_add_return);
-
-void atomic64_sub(long long a, atomic64_t *v)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
-
- spin_lock_irqsave(lock, flags);
- v->counter -= a;
- spin_unlock_irqrestore(lock, flags);
-}
-EXPORT_SYMBOL(atomic64_sub);
-
-long long atomic64_sub_return(long long a, atomic64_t *v)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
- long long val;
-
- spin_lock_irqsave(lock, flags);
- val = v->counter -= a;
- spin_unlock_irqrestore(lock, flags);
- return val;
-}
-EXPORT_SYMBOL(atomic64_sub_return);
-
-long long atomic64_dec_if_positive(atomic64_t *v)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
- long long val;
-
- spin_lock_irqsave(lock, flags);
- val = v->counter - 1;
- if (val >= 0)
- v->counter = val;
- spin_unlock_irqrestore(lock, flags);
- return val;
-}
-EXPORT_SYMBOL(atomic64_dec_if_positive);
-
-long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
- long long val;
-
- spin_lock_irqsave(lock, flags);
- val = v->counter;
- if (val == o)
- v->counter = n;
- spin_unlock_irqrestore(lock, flags);
- return val;
-}
-EXPORT_SYMBOL(atomic64_cmpxchg);
-
-long long atomic64_xchg(atomic64_t *v, long long new)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
- long long val;
-
- spin_lock_irqsave(lock, flags);
- val = v->counter;
- v->counter = new;
- spin_unlock_irqrestore(lock, flags);
- return val;
-}
-EXPORT_SYMBOL(atomic64_xchg);
-
-int atomic64_add_unless(atomic64_t *v, long long a, long long u)
-{
- unsigned long flags;
- spinlock_t *lock = lock_addr(v);
- int ret = 1;
-
- spin_lock_irqsave(lock, flags);
- if (v->counter != u) {
- v->counter += a;
- ret = 0;
- }
- spin_unlock_irqrestore(lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(atomic64_add_unless);
-
-static int init_atomic64_lock(void)
-{
- int i;
-
- for (i = 0; i < NR_LOCKS; ++i)
- spin_lock_init(&atomic64_lock[i].lock);
- return 0;
-}
-
-pure_initcall(init_atomic64_lock);
+#include <asm-generic/atomic64-impl.h>
+
+union generic_atomic64_lock generic_atomic64_lock[ATOMIC64_NR_LOCKS] __cacheline_aligned_in_smp;
+pure_initcall(init_generic_atomic64_lock);
+
+EXPORT_SYMBOL(generic_atomic64_read);
+EXPORT_SYMBOL(generic_atomic64_set);
+EXPORT_SYMBOL(generic_atomic64_add);
+EXPORT_SYMBOL(generic_atomic64_add_return);
+EXPORT_SYMBOL(generic_atomic64_sub);
+EXPORT_SYMBOL(generic_atomic64_sub_return);
+EXPORT_SYMBOL(generic_atomic64_dec_if_positive);
+EXPORT_SYMBOL(generic_atomic64_cmpxchg);
+EXPORT_SYMBOL(generic_atomic64_xchg);
+EXPORT_SYMBOL(generic_atomic64_add_unless);
--
1.6.6.1.476.g01ddb
This patch uses SSE movlps to perform 64-bit atomic reads and writes.
According to Intel manuals, all aligned 64-bit reads and writes are
atomic, which should include movlps.
To do this, we need to disable preemption, execute clts if TS is set, and
restore TS afterwards.
If we don't need to change TS, using SSE is much faster.
Otherwise, it should be essentially even, with the fastest method
depending on the specific architecture.
Another important point is that with SSE atomic64_read can keep the
cacheline in shared state.
If we could keep TS off and reenable it when returning to userspace,
this would be even faster, but this is left for a later patch.
We use SSE because we only need to save the low part of %xmm0, whereas
using the FPU or MMX requires at least saving the FPU environment, which
does not seem possible to do fast.
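Condensed from the code below, the read path has the following shape (a
sketch, not the exact implementation):

	unsigned long cr0 = 0;

	preempt_disable();
	if (!(current_thread_info()->status & TS_USEDFPU)) {
		cr0 = read_cr0();
		if (cr0 & X86_CR0_TS)
			clts();			/* allow SSE use without a #NM fault */
	}
	/* movlps-based 64-bit copy, saving and restoring the low half of %xmm0 */
	if (cr0 & X86_CR0_TS)
		write_cr0(cr0);			/* put TS back the way we found it */
	preempt_enable();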
Signed-off-by: Luca Barbieri <[email protected]>
---
arch/x86/include/asm/atomic_32.h | 10 ++++-
arch/x86/lib/atomic64_32.c | 67 ++++++++++++++++++++++++++++++++++++++
2 files changed, 75 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 1ab431c..d03e471 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -268,6 +268,9 @@ typedef struct {
#define ATOMIC64_INIT(val) { (val) }
+long long sse_atomic64_read_cx8call(long long, const atomic64_t *v);
+void sse_atomic64_set_cx8call(long long, unsigned high);
+
long long cx8_atomic64_read_cx8call(long long, const atomic64_t *v);
long long cx8_atomic64_set_cx8call(long long, const atomic64_t *v);
long long cx8_atomic64_xchg_cx8call(long long, unsigned high);
@@ -281,8 +284,10 @@ int cx8_atomic64_add_unless(atomic64_t *v, long long a, long long u);
#ifdef CONFIG_X86_CMPXCHG64
#define ATOMIC64_ALTERNATIVE(f) "call cx8_atomic64_" #f
+#define ATOMIC64_ALTERNATIVE_XMM(f) ALTERNATIVE("call cx8_atomic64_" #f, "call sse_atomic64_" #f, X86_FEATURE_XMM)
#else
#define ATOMIC64_ALTERNATIVE(f) ALTERNATIVE("call generic_atomic64_" #f, "call cx8_atomic64_" #f, X86_FEATURE_CX8)
+#define ATOMIC64_ALTERNATIVE_XMM(f) ALTERNATIVE3("call generic_atomic64_" #f, "call cx8_atomic64_" #f, X86_FEATURE_CX8, "call sse_atomic64_" #f, X86_FEATURE_XMM)
#endif
/**
@@ -349,7 +354,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
{
unsigned high = (unsigned)(i >> 32);
unsigned low = (unsigned)i;
- asm volatile(ATOMIC64_ALTERNATIVE(set_cx8call)
+ asm volatile(ATOMIC64_ALTERNATIVE_XMM(set_cx8call)
: "+b" (low), "+c" (high)
: "S" (v)
: "eax", "edx", "memory"
@@ -365,7 +370,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
static inline long long atomic64_read(atomic64_t *v)
{
long long r;
- asm volatile(ATOMIC64_ALTERNATIVE(read_cx8call)
+ asm volatile(ATOMIC64_ALTERNATIVE_XMM(read_cx8call)
: "=A" (r), "+c" (v)
: : "memory"
);
@@ -470,6 +475,7 @@ static inline int atomic64_inc_not_zero(atomic64_t *v)
#define atomic64_dec_and_test(v) (atomic64_dec_return((v)) == 0)
#undef ATOMIC64_ALTERNATIVE
+#undef ATOMIC64_ALTERNATIVE_XMM
#include <asm-generic/atomic-long.h>
#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index b7edbb3..9ff8589 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -61,6 +61,47 @@ EXPORT_SYMBOL(generic_atomic64_read_cx8call);
#endif /* CONFIG_X86_CMPXCHG64 */
+struct sse_atomic64_percpu {
+ long long xmm0_low;
+ long low;
+ long high;
+};
+
+/* we actually only need 8-byte alignment, but using cacheline alignment is the only simple way to do this */
+/* we use a per-CPU variable because we need to disable preemption anyway and this is faster than
+ * aligning the stack pointer to 8 bytes
+ */
+DEFINE_PER_CPU_ALIGNED(struct sse_atomic64_percpu, sse_atomic64_percpu);
+
+/* using the fpu/mmx looks infeasible due to the need to save the FPU environment, which is very slow
+ * SSE2 is slightly slower on Core 2 and less compatible, so avoid it for now
+ */
+long long sse_atomic64_read_cx8call(long long dummy, const atomic64_t *v)
+{
+ long long res;
+ unsigned long cr0 = 0;
+ struct thread_info *me = current_thread_info();
+ preempt_disable();
+ if (!(me->status & TS_USEDFPU)) {
+ cr0 = read_cr0();
+ if (cr0 & X86_CR0_TS)
+ clts();
+ }
+ asm volatile(
+ "movlps %%xmm0, " __percpu_arg(0) "\n\t"
+ "movlps %3, %%xmm0\n\t"
+ "movlps %%xmm0, " __percpu_arg(1) "\n\t"
+ "movlps " __percpu_arg(0) ", %%xmm0\n\t"
+ : "+m" (per_cpu__sse_atomic64_percpu.xmm0_low), "=m" (per_cpu__sse_atomic64_percpu.low), "=m" (per_cpu__sse_atomic64_percpu.high)
+ : "m" (v->counter));
+ if (cr0 & X86_CR0_TS)
+ write_cr0(cr0);
+ res = (long long)(unsigned)percpu_read(sse_atomic64_percpu.low) | ((long long)(unsigned)percpu_read(sse_atomic64_percpu.high) << 32);
+ preempt_enable();
+ return res;
+}
+EXPORT_SYMBOL(sse_atomic64_read_cx8call);
+
register unsigned low asm("ebx");
register atomic64_t *v asm("esi");
@@ -121,3 +162,29 @@ int generic_atomic64_inc_not_zero_cx8call(void)
EXPORT_SYMBOL(generic_atomic64_inc_not_zero_cx8call);
#endif /* CONFIG_X86_CMPXCHG64 */
+
+/* put this here because we need access to the global register variables */
+void sse_atomic64_set_cx8call(long long dummy, unsigned high)
+{
+ struct thread_info *me = current_thread_info();
+ unsigned long cr0 = 0;
+ preempt_disable();
+ percpu_write(sse_atomic64_percpu.low, low);
+ percpu_write(sse_atomic64_percpu.high, high);
+ if (!(me->status & TS_USEDFPU)) {
+ cr0 = read_cr0();
+ if (cr0 & X86_CR0_TS)
+ clts();
+ }
+ asm volatile(
+ "movlps %%xmm0, " __percpu_arg(0) "\n\t"
+ "movlps " __percpu_arg(2) ", %%xmm0\n\t"
+ "movlps %%xmm0, %1\n\t"
+ "movlps " __percpu_arg(0) ", %%xmm0\n\t"
+ : "+m" (per_cpu__sse_atomic64_percpu.xmm0_low), "=m" (v->counter)
+ : "m" (per_cpu__sse_atomic64_percpu.low), "m" (per_cpu__sse_atomic64_percpu.high));
+ if (cr0 & X86_CR0_TS)
+ write_cr0(cr0);
+ preempt_enable();
+}
+EXPORT_SYMBOL(sse_atomic64_set_cx8call);
--
1.6.6.1.476.g01ddb
This patch replaces atomic64_32.c with an assembly implementation.
It provides the following advantages:
1. Implements atomic64_add_unless, atomic64_dec_if_positive and
atomic64_inc_not_zero
2. Uses the ZF flag changed by cmpxchg8b instead of doing a comparison
3. Uses custom register calling conventions that reduce or eliminate
register moves to suit cmpxchg8b
4. Reads the initial value instead of using cmpxchg8b to do that.
Currently we use lock xaddl and movl, which seems the fastest.
5. Does not use the lock prefix for atomic64_set:
64-bit writes are already atomic, so we don't need it.
We still need it for atomic64_read, since cmpxchg8b always performs the
write-back and would otherwise overwrite a value changed in the meantime.
6. Allocates registers as well or better than gcc
A pure assembly implementation is required due to the custom calling
conventions and the desire to use %ebp in atomic64_add_return (we need
7 registers...).
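For reference, the retry loop the assembly implements for the arithmetic
operations corresponds roughly to this C sketch (add_return_sketch is just
an illustrative name, not a symbol added by the patch):

	/* conceptual equivalent of cx8_atomic64_add_return() */
	static long long add_return_sketch(long long a, atomic64_t *v)
	{
		long long old = atomic64_read(v);	/* read the initial value (point 4) */
		for (;;) {
			long long prev = atomic64_cmpxchg(v, old, old + a);
			if (prev == old)	/* the asm tests ZF from cmpxchg8b here (point 2) */
				return old + a;
			old = prev;		/* cmpxchg8b has already fetched the current value */
		}
	}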
Signed-off-by: Luca Barbieri <[email protected]>
---
arch/x86/include/asm/atomic_32.h | 244 +++++++++++++++++++++-----------------
arch/x86/lib/Makefile | 2 +-
arch/x86/lib/atomic64_32.c | 243 ++++----------------------------------
arch/x86/lib/atomic64_asm_32.S | 231 ++++++++++++++++++++++++++++++++++++
4 files changed, 390 insertions(+), 330 deletions(-)
create mode 100644 arch/x86/lib/atomic64_asm_32.S
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index dc5a667..50a6d4c 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -268,56 +268,89 @@ typedef struct {
#define ATOMIC64_INIT(val) { (val) }
-extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
+long long cx8_atomic64_read_cx8call(long long, const atomic64_t *v);
+long long cx8_atomic64_set_cx8call(long long, const atomic64_t *v);
+long long cx8_atomic64_xchg_cx8call(long long, unsigned high);
+long long cx8_atomic64_add_return(long long a, atomic64_t *v);
+long long cx8_atomic64_sub_return(long long a, atomic64_t *v);
+long long cx8_atomic64_inc_return_cx8call(long long a, atomic64_t *v);
+long long cx8_atomic64_dec_return_cx8call(long long a, atomic64_t *v);
+long long cx8_atomic64_dec_if_positive_cx8call(atomic64_t *v);
+int cx8_atomic64_inc_not_zero_cx8call(atomic64_t *v);
+int cx8_atomic64_add_unless(atomic64_t *v, long long a, long long u);
+
+/**
+ * atomic64_cmpxchg - cmpxchg atomic64 variable
+ * @p: pointer to type atomic64_t
+ * @o: expected value
+ * @n: new value
+ *
+ * Atomically sets @v to @n if it was equal to @o and returns
+ * the old value.
+ */
+static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
+{
+ asm volatile(LOCK_PREFIX "cmpxchg8b %1"
+ : "+A" (o)
+ : "m" (v->counter), "b" ((unsigned)n), "c" ((unsigned)(n >> 32))
+ );
+ return o;
+}
/**
* atomic64_xchg - xchg atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
+ * @v: pointer to type atomic64_t
+ * @n: value to assign
*
- * Atomically xchgs the value of @ptr to @new_val and returns
+ * Atomically xchgs the value of @v to @n and returns
* the old value.
*/
-extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
+static inline long long atomic64_xchg(atomic64_t *v, long long n)
+{
+ long long o;
+ unsigned high = (unsigned)(n >> 32);
+ unsigned low = (unsigned)n;
+ asm volatile("call cx8_atomic64_xchg_cx8call"
+ : "=A" (o), "+b" (low), "+c" (high)
+ : "S" (v)
+ : "memory"
+ );
+ return o;
+}
/**
* atomic64_set - set atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
+ * @v: pointer to type atomic64_t
+ * @n: value to assign
*
- * Atomically sets the value of @ptr to @new_val.
+ * Atomically sets the value of @v to @n.
*/
-extern void atomic64_set(atomic64_t *ptr, u64 new_val);
+static inline void atomic64_set(atomic64_t *v, long long i)
+{
+ unsigned high = (unsigned)(i >> 32);
+ unsigned low = (unsigned)i;
+ asm volatile("call cx8_atomic64_set_cx8call"
+ : "+b" (low), "+c" (high)
+ : "S" (v)
+ : "eax", "edx", "memory"
+ );
+}
/**
* atomic64_read - read atomic64 variable
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
*
- * Atomically reads the value of @ptr and returns it.
+ * Atomically reads the value of @v and returns it.
*/
-static inline u64 atomic64_read(atomic64_t *ptr)
+static inline long long atomic64_read(atomic64_t *v)
{
- u64 res;
-
- /*
- * Note, we inline this atomic64_t primitive because
- * it only clobbers EAX/EDX and leaves the others
- * untouched. We also (somewhat subtly) rely on the
- * fact that cmpxchg8b returns the current 64-bit value
- * of the memory location we are touching:
- */
- asm volatile(
- "mov %%ebx, %%eax\n\t"
- "mov %%ecx, %%edx\n\t"
- LOCK_PREFIX "cmpxchg8b %1\n"
- : "=&A" (res)
- : "m" (*ptr)
- );
-
- return res;
-}
-
-extern u64 atomic64_read(atomic64_t *ptr);
+ long long r;
+ asm volatile("call cx8_atomic64_read_cx8call"
+ : "=A" (r), "+c" (v)
+ : : "memory"
+ );
+ return r;
+ }
/**
* atomic64_add_return - add and return
@@ -326,90 +359,87 @@ extern u64 atomic64_read(atomic64_t *ptr);
*
* Atomically adds @delta to @ptr and returns @delta + *@ptr
*/
-extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_add_return(long long a, atomic64_t *v)
+{
+ return cx8_atomic64_add_return(a, v);
+}
/*
* Other variants with different arithmetic operators:
*/
-extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
-extern u64 atomic64_inc_return(atomic64_t *ptr);
-extern u64 atomic64_dec_return(atomic64_t *ptr);
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr.
- */
-extern void atomic64_add(u64 delta, atomic64_t *ptr);
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr.
- */
-extern void atomic64_sub(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_sub_return(long long a, atomic64_t *v)
+{
+ return cx8_atomic64_sub_return(a, v);
+}
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_inc_return(atomic64_t *v)
+{
+ long long a;
+ asm volatile("call cx8_atomic64_inc_return_cx8call"
+ : "=A" (a)
+ : "S" (v)
+ : "memory", "ecx"
+ );
+ return a;
+}
-/**
- * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1.
- */
-extern void atomic64_inc(atomic64_t *ptr);
+static inline long long atomic64_dec_return(atomic64_t *v)
+{
+ long long a;
+ asm volatile("call cx8_atomic64_dec_return_cx8call"
+ : "=A" (a)
+ : "S" (v)
+ : "memory", "ecx"
+ );
+ return a;
+}
-/**
- * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1.
- */
-extern void atomic64_dec(atomic64_t *ptr);
+static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
+{
+ int r = (int)v;
+ unsigned low = (unsigned)a;
+ unsigned high = (unsigned)(a >> 32);
+ asm volatile(
+ "pushl %4\n\t"
+ "pushl %3\n\t"
+ "call cx8_atomic64_add_unless\n\t"
+ "addl $8, %%esp\n\t"
+ : "+a" (r), "+d" (low), "+c" (high)
+ : "g" ((unsigned)u), "g" ((unsigned)(u >> 32))
+ : "memory");
+ return r;
+}
-/**
- * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-extern int atomic64_dec_and_test(atomic64_t *ptr);
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
+{
+ long long r;
+ asm volatile("call cx8_atomic64_dec_if_positive_cx8call"
+ : "=A" (r)
+ : "S" (v)
+ : "ecx", "memory"
+ );
+ return r;
+}
-/**
- * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-extern int atomic64_inc_and_test(atomic64_t *ptr);
+static inline int atomic64_inc_not_zero(atomic64_t *v)
+{
+ int r;
+ asm volatile("call cx8_atomic64_inc_not_zero_cx8call"
+ : "=a" (r)
+ : "S" (v)
+ : "ecx", "edx", "memory"
+ );
+ return r;
+}
-/**
- * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
+#define atomic64_add(i, v) atomic64_add_return(i, v)
+#define atomic64_sub(i, v) atomic64_sub_return(i, v)
+#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
+#define atomic64_inc(v) atomic64_inc_return(v)
+#define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
+#define atomic64_sub_and_test(a, v) (atomic64_sub_return((a), (v)) == 0)
+#define atomic64_dec(v) atomic64_dec_return(v)
+#define atomic64_dec_and_test(v) (atomic64_dec_return((v)) == 0)
#include <asm-generic/atomic-long.h>
#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754..b42fdeb 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -25,7 +25,7 @@ lib-$(CONFIG_KPROBES) += insn.o inat.o
obj-y += msr.o msr-reg.o msr-reg-export.o
ifeq ($(CONFIG_X86_32),y)
- obj-y += atomic64_32.o
+ obj-y += atomic64_32.o atomic64_asm_32.o
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += semaphore_32.o string_32.o
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 824fa0b..18544b3 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -1,3 +1,14 @@
+/*
+ * atomic64_t for x86-32
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -6,225 +17,13 @@
#include <asm/cmpxchg.h>
#include <asm/atomic.h>
-static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
-{
- u32 low = new;
- u32 high = new >> 32;
-
- asm volatile(
- LOCK_PREFIX "cmpxchg8b %1\n"
- : "+A" (old), "+m" (*ptr)
- : "b" (low), "c" (high)
- );
- return old;
-}
-
-u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
-{
- return cmpxchg8b(&ptr->counter, old_val, new_val);
-}
-EXPORT_SYMBOL(atomic64_cmpxchg);
-
-/**
- * atomic64_xchg - xchg atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically xchgs the value of @ptr to @new_val and returns
- * the old value.
- */
-u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
-{
- /*
- * Try first with a (possibly incorrect) assumption about
- * what we have there. We'll do two loops most likely,
- * but we'll get an ownership MESI transaction straight away
- * instead of a read transaction followed by a
- * flush-for-ownership transaction:
- */
- u64 old_val, real_val = 0;
-
- do {
- old_val = real_val;
-
- real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
- } while (real_val != old_val);
-
- return old_val;
-}
-EXPORT_SYMBOL(atomic64_xchg);
-
-/**
- * atomic64_set - set atomic64 variable
- * @ptr: pointer to type atomic64_t
- * @new_val: value to assign
- *
- * Atomically sets the value of @ptr to @new_val.
- */
-void atomic64_set(atomic64_t *ptr, u64 new_val)
-{
- atomic64_xchg(ptr, new_val);
-}
-EXPORT_SYMBOL(atomic64_set);
-
-/**
-EXPORT_SYMBOL(atomic64_read);
- * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
- */
-noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
-{
- /*
- * Try first with a (possibly incorrect) assumption about
- * what we have there. We'll do two loops most likely,
- * but we'll get an ownership MESI transaction straight away
- * instead of a read transaction followed by a
- * flush-for-ownership transaction:
- */
- u64 old_val, new_val, real_val = 0;
-
- do {
- old_val = real_val;
- new_val = old_val + delta;
-
- real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
- } while (real_val != old_val);
-
- return new_val;
-}
-EXPORT_SYMBOL(atomic64_add_return);
-
-u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
-{
- return atomic64_add_return(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub_return);
-
-u64 atomic64_inc_return(atomic64_t *ptr)
-{
- return atomic64_add_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_return);
-
-u64 atomic64_dec_return(atomic64_t *ptr)
-{
- return atomic64_sub_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_return);
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr.
- */
-void atomic64_add(u64 delta, atomic64_t *ptr)
-{
- atomic64_add_return(delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_add);
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr.
- */
-void atomic64_sub(u64 delta, atomic64_t *ptr)
-{
- atomic64_add(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub);
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr: pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
-{
- u64 new_val = atomic64_sub_return(delta, ptr);
-
- return new_val == 0;
-}
-EXPORT_SYMBOL(atomic64_sub_and_test);
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1.
- */
-void atomic64_inc(atomic64_t *ptr)
-{
- atomic64_add(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc);
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1.
- */
-void atomic64_dec(atomic64_t *ptr)
-{
- atomic64_sub(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec);
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-int atomic64_dec_and_test(atomic64_t *ptr)
-{
- return atomic64_sub_and_test(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_and_test);
-
-/**
- * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_inc_and_test(atomic64_t *ptr)
-{
- return atomic64_sub_and_test(-1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_and_test);
-
-/**
- * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr: pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-int atomic64_add_negative(u64 delta, atomic64_t *ptr)
-{
- s64 new_val = atomic64_add_return(delta, ptr);
-
- return new_val < 0;
-}
-EXPORT_SYMBOL(atomic64_add_negative);
+EXPORT_SYMBOL(cx8_atomic64_read_cx8call);
+EXPORT_SYMBOL(cx8_atomic64_set_cx8call);
+EXPORT_SYMBOL(cx8_atomic64_xchg_cx8call);
+EXPORT_SYMBOL(cx8_atomic64_add_return);
+EXPORT_SYMBOL(cx8_atomic64_sub_return);
+EXPORT_SYMBOL(cx8_atomic64_add_unless);
+EXPORT_SYMBOL(cx8_atomic64_inc_return_cx8call);
+EXPORT_SYMBOL(cx8_atomic64_dec_return_cx8call);
+EXPORT_SYMBOL(cx8_atomic64_dec_if_positive_cx8call);
+EXPORT_SYMBOL(cx8_atomic64_inc_not_zero_cx8call);
diff --git a/arch/x86/lib/atomic64_asm_32.S b/arch/x86/lib/atomic64_asm_32.S
new file mode 100644
index 0000000..3e98118
--- /dev/null
+++ b/arch/x86/lib/atomic64_asm_32.S
@@ -0,0 +1,231 @@
+/*
+ * atomic64_t for x86-32
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+.macro SAVE reg
+ pushl %\reg
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET \reg, 0
+.endm
+
+.macro RESTORE reg
+ popl %\reg
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE \reg
+.endm
+
+.macro read64_for_write reg
+ xorl %eax, %eax
+ LOCK_PREFIX
+ xaddl %eax, (\reg)
+ movl 4(\reg), %edx
+.endm
+
+ENTRY(cx8_atomic64_read_cx8call)
+ CFI_STARTPROC
+
+ movl %ebx, %eax
+ movl %ecx, %edx
+
+/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
+ LOCK_PREFIX
+ cmpxchg8b (%ecx)
+
+ ret
+ CFI_ENDPROC
+ENDPROC(cx8_atomic64_read_cx8call)
+
+ENTRY(cx8_atomic64_set_cx8call)
+ CFI_STARTPROC
+
+ read64_for_write %esi
+
+1:
+/* we don't need LOCK_PREFIX since aligned 64-bit writes are atomic on 586 and newer */
+ cmpxchg8b (%esi)
+ jne 1b
+
+ ret
+ CFI_ENDPROC
+ENDPROC(cx8_atomic64_set_cx8call)
+
+ENTRY(cx8_atomic64_xchg_cx8call)
+ CFI_STARTPROC
+
+ read64_for_write %esi
+
+1:
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ ret
+ CFI_ENDPROC
+ENDPROC(cx8_atomic64_xchg_cx8call)
+
+.macro addsub_return func ins insc
+ENTRY(cx8_atomic64_\func\()_return)
+ CFI_STARTPROC
+ SAVE ebp
+ SAVE ebx
+ SAVE esi
+ SAVE edi
+
+ movl %eax, %esi
+ movl %edx, %edi
+ movl %ecx, %ebp
+
+ read64_for_write %ebp
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l %esi, %ebx
+ \insc\()l %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%ebp)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ RESTORE edi
+ RESTORE esi
+ RESTORE ebx
+ RESTORE ebp
+ ret
+ CFI_ENDPROC
+ENDPROC(cx8_atomic64_\func\()_return)
+.endm
+
+addsub_return add add adc
+addsub_return sub sub sbb
+
+.macro incdec_return func ins insc
+ENTRY(cx8_atomic64_\func\()_return_cx8call)
+ CFI_STARTPROC
+ SAVE ebx
+
+ read64_for_write %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l $1, %ebx
+ \insc\()l $0, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ RESTORE ebx
+ ret
+ CFI_ENDPROC
+ENDPROC(cx8_atomic64_\func\()_return_cx8call)
+.endm
+
+incdec_return inc add adc
+incdec_return dec sub sbb
+
+ENTRY(cx8_atomic64_dec_if_positive_cx8call)
+ CFI_STARTPROC
+ SAVE ebx
+
+ read64_for_write %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+/* dec doesn't set the carry flag */
+ subl $1, %ebx
+ sbb $0, %ecx
+ js 2f
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+2:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ RESTORE ebx
+ ret
+ CFI_ENDPROC
+ENDPROC(cx8_atomic64_dec_if_positive_cx8call)
+
+ENTRY(cx8_atomic64_add_unless)
+ CFI_STARTPROC
+ SAVE ebp
+ SAVE ebx
+ SAVE esi
+ SAVE edi
+
+ movl %eax, %ebp
+ movl %edx, %esi
+ movl %ecx, %edi
+
+ read64_for_write %ebp
+1:
+ cmpl %eax, 0x14(%esp)
+ je 4f
+2:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ addl %esi, %ebx
+ adcl %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%ebp)
+ jne 1b
+
+ xorl %eax, %eax
+3:
+ RESTORE edi
+ RESTORE esi
+ RESTORE ebx
+ RESTORE ebp
+ ret
+4:
+ cmpl %edx, 0x18(%esp)
+ jne 2b
+ movl $1, %eax
+ jmp 3b
+ CFI_ENDPROC
+ENDPROC(cx8_atomic64_add_unless)
+
+ENTRY(cx8_atomic64_inc_not_zero_cx8call)
+ CFI_STARTPROC
+ SAVE ebx
+
+ read64_for_write %esi
+1:
+ testl %eax, %eax
+ je 4f
+2:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ addl $1, %ebx
+ adcl $0, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ xorl %eax, %eax
+3:
+ RESTORE ebx
+ ret
+4:
+ testl %edx, %edx
+ jne 2b
+ movl $1, %eax
+ jmp 3b
+ CFI_ENDPROC
+ENDPROC(cx8_atomic64_inc_not_zero_cx8call)
--
1.6.6.1.476.g01ddb
This patch adds boot-time self-test code for atomic64_t.
This has been used to test the later changes in this patchset.
Signed-off-by: Luca Barbieri <[email protected]>
---
lib/Kconfig.debug | 7 ++
lib/Makefile | 2 +
lib/atomic64_test.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 167 insertions(+), 0 deletions(-)
create mode 100644 lib/atomic64_test.c
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 25c3ed5..3676c51 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1054,6 +1054,13 @@ config DMA_API_DEBUG
This option causes a performance degredation. Use only if you want
to debug device drivers. If unsure, say N.
+config ATOMIC64_SELFTEST
+ bool "Perform an atomic64_t self-test at boot"
+ help
+ Enable this option to test the atomic64_t functions at boot.
+
+ If unsure, say N.
+
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
diff --git a/lib/Makefile b/lib/Makefile
index 3b0b4a6..ab333e8 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -100,6 +100,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o
obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
+obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
+
hostprogs-y := gen_crc32table
clean-files := crc32table.h
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
new file mode 100644
index 0000000..4ff649e
--- /dev/null
+++ b/lib/atomic64_test.c
@@ -0,0 +1,158 @@
+/*
+ * Testsuite for atomic64_t functions
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/init.h>
+#include <asm/atomic.h>
+
+#define INIT(c) do { atomic64_set(&v, c); r = c; } while (0)
+static __init int test_atomic64(void)
+{
+ long long v0 = 0xaaa31337c001d00dLL;
+ long long v1 = 0xdeadbeefdeafcafeLL;
+ long long v2 = 0xfaceabadf00df001LL;
+ long long onestwos = 0x1111111122222222LL;
+ long long one = 1LL;
+
+ atomic64_t v = ATOMIC64_INIT(v0);
+ long long r = v0;
+ BUG_ON(v.counter != r);
+
+ atomic64_set(&v, v1);
+ r = v1;
+ BUG_ON(v.counter != r);
+ BUG_ON(atomic64_read(&v) != r);
+
+ INIT(v0);
+ atomic64_add(onestwos, &v);
+ r += onestwos;
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ atomic64_add(-one, &v);
+ r += -one;
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ r += onestwos;
+ BUG_ON(atomic64_add_return(onestwos, &v) != r);
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ r += -one;
+ BUG_ON(atomic64_add_return(-one, &v) != r);
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ atomic64_sub(onestwos, &v);
+ r -= onestwos;
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ atomic64_sub(-one, &v);
+ r -= -one;
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ r -= onestwos;
+ BUG_ON(atomic64_sub_return(onestwos, &v) != r);
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ r -= -one;
+ BUG_ON(atomic64_sub_return(-one, &v) != r);
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ atomic64_inc(&v);
+ r += one;
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ r += one;
+ BUG_ON(atomic64_inc_return(&v) != r);
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ atomic64_dec(&v);
+ r -= one;
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ r -= one;
+ BUG_ON(atomic64_dec_return(&v) != r);
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ BUG_ON(atomic64_xchg(&v, v1) != v0);
+ r = v1;
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0);
+ r = v1;
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0);
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ BUG_ON(!atomic64_add_unless(&v, one, v0));
+ BUG_ON(v.counter != r);
+
+ INIT(v0);
+ BUG_ON(atomic64_add_unless(&v, one, v1));
+ r += one;
+ BUG_ON(v.counter != r);
+
+ INIT(onestwos);
+ BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1));
+ r -= one;
+ BUG_ON(v.counter != r);
+
+ INIT(0);
+ BUG_ON(atomic64_dec_if_positive(&v) != -one);
+ BUG_ON(v.counter != r);
+
+ INIT(-one);
+ BUG_ON(atomic64_dec_if_positive(&v) != (-one - one));
+ BUG_ON(v.counter != r);
+
+ INIT(onestwos);
+ BUG_ON(atomic64_inc_not_zero(&v));
+ r += one;
+ BUG_ON(v.counter != r);
+
+ INIT(0);
+ BUG_ON(!atomic64_inc_not_zero(&v));
+ BUG_ON(v.counter != r);
+
+ INIT(-one);
+ BUG_ON(atomic64_inc_not_zero(&v));
+ r += one;
+ BUG_ON(v.counter != r);
+
+#ifdef CONFIG_X86
+ printk(KERN_INFO "atomic64 test passed for %s+ platform %s CX8 and %s SSE\n",
+#ifdef CONFIG_X86_CMPXCHG64
+ "586",
+#else
+ "386",
+#endif
+ boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without",
+ boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without");
+#else
+ printk(KERN_INFO "atomic64 test passed\n");
+#endif
+
+ return 0;
+}
+
+core_initcall(test_atomic64);
--
1.6.6.1.476.g01ddb
Use the functionality just introduced in the previous patch.
---
arch/x86/include/asm/cmpxchg_32.h | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ffb9bb6..3c267e0 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -271,7 +271,8 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
__typeof__(*(ptr)) __ret; \
__typeof__(*(ptr)) __old = (o); \
__typeof__(*(ptr)) __new = (n); \
- alternative_io("call cmpxchg8b_emu", \
+ alternative_io(LOCK_PREFIX_ALTERNATIVE_PATCH \
+ "call cmpxchg8b_emu", \
"lock; cmpxchg8b (%%esi)" , \
X86_FEATURE_CX8, \
"=A" (__ret), \
--
1.6.6.1.476.g01ddb
Currently CALL and JMP cannot be used in alternatives because the
relative offset would be wrong.
This patch uses the existing x86 instruction parser to parse the
alternative sequence and fix up the displacements.
This allows the feature to be implemented with minimal code.
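The displacement adjustment itself is simple; conceptually, for each rel32
CALL/JMP found in the replacement buffer (using the names from the patch
below):

	/* a rel32 CALL/JMP encodes (target - address of the next instruction);
	 * the replacement was laid out at a->replacement but will execute at
	 * instr, so each displacement must grow by their distance:
	 */
	*(unsigned *)(p + disp_off) += a->replacement - instr;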
Signed-off-by: Luca Barbieri <[email protected]>
---
arch/x86/kernel/alternative.c | 22 ++++++++++++++++++++++
1 files changed, 22 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index de7353c..7464c7e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -17,6 +17,7 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/fixmap.h>
+#include <asm/insn.h>
#define MAX_PATCH_LEN (255-1)
@@ -195,6 +196,26 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];
static void *text_poke_early(void *addr, const void *opcode, size_t len);
+/* Fix call instruction displacements */
+static void __init_or_module fixup_relative_addresses(char *buf, size_t size, size_t adj)
+{
+ struct insn insn;
+ char *p = buf;
+ char *end = p + size;
+ while (p < end) {
+ kernel_insn_init(&insn, p);
+ insn_get_opcode(&insn);
+
+ /* CALL or JMP near32 */
+ if (insn.opcode.bytes[0] == 0xe8 || insn.opcode.bytes[0] == 0xe9) {
+ size_t disp_off = insn.next_byte - insn.kaddr;
+ *(unsigned *)(p + disp_off) += adj;
+ }
+ insn_get_length(&insn);
+ p += insn.length;
+ }
+}
+
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
self modifying code. This implies that assymetric systems where
@@ -223,6 +244,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
}
#endif
memcpy(insnbuf, a->replacement, a->replacementlen);
+ fixup_relative_addresses(insnbuf, a->replacementlen, a->replacement - instr);
add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);
text_poke_early(instr, insnbuf, a->instrlen);
--
1.6.6.1.476.g01ddb
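To see why adding "a->replacement - instr" is the right adjustment, here is a
worked example with made-up addresses (an illustration only, not part of the patch):

/* A 5-byte CALL assembled in .altinstr_replacement and later copied
 * over the original instruction site; all addresses are invented. */
u8 *instr       = (u8 *)0xc1000100;	/* original instruction site     */
u8 *replacement = (u8 *)0xc1200000;	/* copy in .altinstr_replacement */
u8 *target      = (u8 *)0xc1300000;	/* e.g. cmpxchg8b_emu            */

s32 disp     = target - (replacement + 5);	/* as emitted by the assembler */
s32 new_disp = disp + (replacement - instr);	/* the fix-up applied above    */

/* instr + 5 + new_disp == target, so the relocated CALL still reaches
 * the intended function. */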
The current lock prefix UP/SMP alternative code doesn't allow
LOCK_PREFIX to be used in alternatives code.
This patch solves the problem by adding a new LOCK_PREFIX_ALTERNATIVE_PATCH
macro that only records the lock prefix location but does not emit
the prefix.
The user of this macro can then start any alternative sequence with
"lock" and have it UP/SMP patched.
To make this work, the UP/SMP alternative code is changed to do the
lock/DS prefix switching only if the byte actually contains a lock or
DS prefix.
Thus, if an alternative without the "lock" is selected, it will now do
nothing instead of clobbering the code.
Signed-off-by: Luca Barbieri <[email protected]>
---
arch/x86/include/asm/alternative.h | 6 ++++--
arch/x86/kernel/alternative.c | 6 ++++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 42d41ac..b31af51 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -28,12 +28,14 @@
*/
#ifdef CONFIG_SMP
-#define LOCK_PREFIX \
+#define LOCK_PREFIX_ALTERNATIVE_PATCH \
".section .smp_locks,\"a\"\n" \
_ASM_ALIGN "\n" \
_ASM_PTR "661f\n" /* address */ \
".previous\n" \
- "661:\n\tlock; "
+ "661:"
+
+#define LOCK_PREFIX LOCK_PREFIX_ALTERNATIVE_PATCH "\n\tlock; "
#else /* ! CONFIG_SMP */
#define LOCK_PREFIX ""
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7464c7e..b4ff6cb 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -264,7 +264,8 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
if (*ptr > text_end)
continue;
/* turn DS segment override prefix into lock prefix */
- text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+ if (**ptr == 0x3e)
+ text_poke(*ptr, ((unsigned char []){0xf0}), 1);
};
mutex_unlock(&text_mutex);
}
@@ -283,7 +284,8 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
if (*ptr > text_end)
continue;
/* turn lock prefix into DS segment override prefix */
- text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+ if (**ptr == 0xf0)
+ text_poke(*ptr, ((unsigned char []){0x3E}), 1);
};
mutex_unlock(&text_mutex);
}
--
1.6.6.1.476.g01ddb
On 02/17/2010 01:42 PM, Luca Barbieri wrote:
> This patch modifies the x86 alternative macros to allow more than one
> alternative code sequence.
>
> This is done by simply adding multiple alternative patches, which are
> applied in sequence, overwriting previous ones.
>
> +/* alternative assembly primitive: */
> +#define ALTERNATIVE(oldinstr, newinstr, feature) \
> + "661:\n\t" oldinstr "\n662:\n" \
> + ALTERNATIVE_PATCH("661b", "662b", newinstr, feature)
> +
> +#define ALTERNATIVE3(oldinstr, newinstr1, feature1, newinstr2, feature2) \
> + "661:\n\t" oldinstr "\n662:\n" \
> + ALTERNATIVE_PATCH("661b", "662b", newinstr1, feature1) "\n" \
> + ALTERNATIVE_PATCH("661b", "662b", newinstr2, feature2)
> +
> +#define ALTERNATIVE4(oldinstr, newinstr1, feature1, newinstr2, feature2, newinstr3, feature3) \
> + "661:\n\t" oldinstr "\n662:\n" \
> + ALTERNATIVE_PATCH("661b", "662b", newinstr1, feature1) "\n" \
> + ALTERNATIVE_PATCH("661b", "662b", newinstr2, feature2) "\n" \
> + ALTERNATIVE_PATCH("661b", "662b", newinstr3, feature3)
> +
>
Suggest documenting the precedence of alternatives: if both feature1 and
feature2 are present, which newinstr is patched in?
--
error compiling committee.c: too many arguments to function
On 02/17/2010 03:42 AM, Luca Barbieri wrote:
> No known CPU should have this combination, and future ones are very
> unlikely to.
>
> However, should this happen, we would generate working but non-atomic
> code, so panic instead.
> ---
> arch/x86/lib/atomic64_32.c | 11 +++++++++++
> 1 files changed, 11 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
> index 9ff8589..35dbd12 100644
> --- a/arch/x86/lib/atomic64_32.c
> +++ b/arch/x86/lib/atomic64_32.c
> @@ -47,6 +47,17 @@ EXPORT_SYMBOL(cx8_atomic64_inc_not_zero_cx8call);
> union generic_atomic64_lock generic_atomic64_lock[ATOMIC64_NR_LOCKS] __cacheline_aligned_in_smp;
> pure_initcall(init_generic_atomic64_lock);
>
> +static int __init panic_on_sse_without_cx8(void)
> +{
> + /* no known CPU should do this, and we generate non-atomic code in this case
> + * because we mix the generic spinlock-reliant code and the SSE code
> + */
> + if (!boot_cpu_has(X86_FEATURE_CX8) && boot_cpu_has(X86_FEATURE_XMM))
> + panic("CPUs without CX8 but with SSE are not supported\nBoot with clearcpuid=25 and report your CPU model to [email protected]\n");
> + return 0;
> +}
> +core_initcall(panic_on_sse_without_cx8);
> +
> EXPORT_SYMBOL(generic_atomic64_add);
> EXPORT_SYMBOL(generic_atomic64_add_return);
> EXPORT_SYMBOL(generic_atomic64_sub);
NAK in the extreme.
This is not how we deal with these kinds of stuff. If this really
matters, we explicitly clear the CPU feature in the feature detect code.
-hpa
On 02/17/2010 03:42 AM, Luca Barbieri wrote:
> This patch uses SSE movlps to perform 64-bit atomic reads and writes.
>
> According to the Intel manuals, all aligned 64-bit reads and writes are
> atomic, which should include movlps.
>
> To do this, we need to disable preemption, clear TS with clts if it was set,
> and restore TS afterwards.
>
> If we don't need to change TS, using SSE is much faster.
>
> Otherwise, it should be essentially even, with the fastest method
> depending on the specific architecture.
>
> Another important point is that with SSE atomic64_read can keep the
> cacheline in shared state.
>
> If we could keep TS off and reenable it when returning to userspace,
> this would be even faster, but this is left for a later patch.
>
> We use SSE because we can just save the low part of %xmm0, whereas using
> the FPU or MMX requires at least saving the environment, and seems
> impossible to do fast.
>
> Signed-off-by: Luca Barbieri <[email protected]>
I'm a bit unhappy about this patch. It seems to violate the assumption
that we only ever use the FPU state guarded by
kernel_fpu_begin()..kernel_fpu_end() and instead it uses a local hack,
which seems like a recipe for all kinds of very subtle problems down the
line.
Unless the performance advantage is provably very compelling, I'm
inclined to say that this is not worth it.
-hpa
> NAK in the extreme.
>
> This is not how we deal with these kinds of stuff. If this really
> matters, we explicitly clear the CPU feature in the feature detect code.
Also, don't forget the NT4 CX8 bug with non-Intel/AMD/Cyrix vendors:
http://www.geoffchappell.com/studies/windows/km/cpu/cx8.htm
If a processor supports SSE (or PAE or MMX, for that matter), you can pretty much assume CX8 exists even if the feature bit is cleared because of this bug.
Yuhong Bao
On 02/17/2010 03:00 PM, Yuhong Bao wrote:
>
>> NAK in the extreme.
>>
>> This is not how we deal with these kinds of stuff. If this really
>> matters, we explicitly clear the CPU feature in the feature detect code.
> Also, don't forget the NT4 CX8 bug with non-Intel/AMD/Cyrix vendors:
> http://www.geoffchappell.com/studies/windows/km/cpu/cx8.htm
> If a processor supports SSE (or PAE or MMX, for that matter), you can pretty much assume CX8 exists even if the feature bit is cleared because of this bug.
>
Those kinds of things we *also* deal with by adjusting the CPU features
in the detection code.
-hpa
> I'm a bit unhappy about this patch. It seems to violate the assumption
> that we only ever use the FPU state guarded by
> kernel_fpu_begin()..kernel_fpu_end() and instead it uses a local hack,
> which seems like a recipe for all kinds of very subtle problems down the
> line.
kernel_fpu_begin saves the whole FPU state, but to use SSE we don't
really need that, since we can just save the %xmm registers we need,
which is much faster.
This is why SSE is used instead of just using an FPU double read.
We could however add a kernel_sse_begin_nosave/kernel_sse_end_nosave to do this.
> Unless the performance advantage is provably very compelling, I'm
> inclined to say that this is not worth it.
There is the advantage of not taking the cacheline for writing in atomic64_read.
Also locked cmpxchg8b is slow and if we were to restore the TS flag
lazily on userspace return, it would significantly improve the
function in all cases (with the current code, it depends on how fast
the architecture does clts/stts vs lock cmpxchg8b).
Of course the big-picture impact depends on the users of the interface.
Anyway, feel free to ignore this patch for now (and the next one as well).
On 02/17/2010 04:41 PM, Luca Barbieri wrote:
>> I'm a bit unhappy about this patch. It seems to violate the assumption
>> that we only ever use the FPU state guarded by
>> kernel_fpu_begin()..kernel_fpu_end() and instead it uses a local hack,
>> which seems like a recipe for all kinds of very subtle problems down the
>> line.
>
> kernel_fpu_begin saves the whole FPU state, but to use SSE we don't
> really need that, since we can just save the %xmm registers we need,
> which is much faster.
> This is why SSE is used instead of just using an FPU double read.
> We could however add a kernel_sse_begin_nosave/kernel_sse_end_nosave to do this.
>
We could, and that would definitely better than open-coding the operation.
>> Unless the performance advantage is provably very compelling, I'm
>> inclined to say that this is not worth it.
> There is the advantage of not taking the cacheline for writing in atomic64_read.
> Also locked cmpxchg8b is slow and if we were to restore the TS flag
> lazily on userspace return, it would significantly improve the
> function in all cases (with the current code, it depends on how fast
> the architecture does clts/stts vs lock cmpxchg8b).
> Of course the big-picture impact depends on the users of the interface.
It does, and I would prefer to not take it until there is a user of the
interface which motivates the performance. Ingo, do you have a feel for
how performance-critical this actually is?
-hpa
> NAK in the extreme.
>
> This is not how we deal with these kinds of stuff. If this really
> matters, we explicitly clear the CPU feature in the feature detect code.
The rationale is that a panic will likely be reported as a bug and
this lets us know that such CPUs exist, which we may otherwise not
hear about.
The user may bypass the panic by altering the command line.
To avoid doing the panic, we could define an artificial "CX8_XMM"
flag, set it if CX8 && XMM and use that in the alternatives code in
the previous patch; this is a bit more intrusive.
We already manually set the CX8 flag on VIA CPUs that mistakenly do not
report it, so that shouldn't be a problem.
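For comparison, a minimal sketch of the feature-clearing alternative (the hook
point, function name and message are illustrative assumptions, not part of the
posted patches):

/* Sketch only: clear the SSE bit on a (hypothetical) CPU that reports
 * XMM but not CX8, so the alternatives never select the SSE path. */
static void __cpuinit filter_sse_without_cx8(struct cpuinfo_x86 *c)
{
	if (!cpu_has(c, X86_FEATURE_CX8) && cpu_has(c, X86_FEATURE_XMM)) {
		pr_warning("CPU reports SSE but not CX8, disabling SSE\n");
		clear_cpu_cap(c, X86_FEATURE_XMM);
	}
}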
>> Also, don't forget the NT4 CX8 bug with non-Intel/AMD/Cyrix vendors:
>> http://www.geoffchappell.com/studies/windows/km/cpu/cx8.htm
>> If a processor supports SSE (or PAE or MMX, for that matter), you can pretty much assume CX8 exists even if the feature bit is cleared because of this bug.
>>
>
> Those kinds of things we *also* deal with by adjusting the CPU features
> in the detection code.
And in fact, the Linux setup code already has code similar to the code used in XP to disable this NT4 workaround.
Yuhong Bao
Luca Barbieri <[email protected]> writes:
> This patch uses SSE movlps to perform 64-bit atomic reads and writes.
You seem to have forgotten to add benchmark results that show this is
actually worth while? And is there really any user on 32bit
that needs 64bit atomic_t?
I have to agree with Peter that it is very likely a bad idea
to mess with FPU state like this.
I'm also suspicious of your use of global register variables.
This means they won't be saved on entry/exit of the functions.
Does that really work?
It's also not a widely used gcc feature and those are always
somewhat risky.
-Andi
--
[email protected] -- Speaking for myself only.
> You seem to have forgotten to add benchmark results that show this is
> actually worth while? And is there really any user on 32bit
> that needs 64bit atomic_t?
perf is currently the main user.
On Core2, lock cmpxchg8b takes about 24 cycles and writes the
cacheline, while movlps takes 1 cycle.
clts/stts probably wipes out the savings if we need to use it, but we
can keep TS off and restore it lazily on return to userspace.
According to http://turkish_rational.tripod.com/trdos/pentium.txt
> I'm also suspicious of your use of global register variables.
> This means they won't be saved on entry/exit of the functions.
> Does that really work?
I think it does.
The functions never change the global register variables, and thus
they are preserved.
Calls are done in inline assembly, which saves the variables if they
are actually used as parameters (the global register variables are
only visible in a portion of the C file, of course).
Anyway, how about the previous patches?
The SSE part is not essential and can be delayed.
On 02/18/2010 02:47 AM, H. Peter Anvin wrote:
>
>>> Unless the performance advantage is provably very compelling, I'm
>>> inclined to say that this is not worth it.
>>>
>> There is the advantage of not taking the cacheline for writing in atomic64_read.
>> Also locked cmpxchg8b is slow and if we were to restore the TS flag
>> lazily on userspace return, it would significantly improve the
>> function in all cases (with the current code, it depends on how fast
>> the architecture does clts/stts vs lock cmpxchg8b).
>> Of course the big-picture impact depends on the users of the interface.
>>
> It does, and I would prefer to not take it until there is a user of the
> interface which motivates the performance. Ingo, do you have a feel for
> how performance-critical this actually is?
>
One heavy user is set_64() in the pagetable code. That's already in an
expensive operation due to the page fault so the impact will be quite
low, probably.
--
error compiling committee.c: too many arguments to function
> One heavy user is set_64() in the pagetable code. ?That's already in an
> expensive operation due to the page fault so the impact will be quite low,
> probably.
It currently does not use the atomic64_t infrastructure and thus won't
be affected, but it can very easily be converted to cast the
pointer to atomic64_t* and use atomic64_set.
I think we set ptes in other places than the page fault handler.
Is any of them performance critical?
On Thu, Feb 18, 2010 at 10:53:06AM +0100, Luca Barbieri wrote:
> > You seem to have forgotten to add benchmark results that show this is
> > actually worth while? And is there really any user on 32bit
> > that needs 64bit atomic_t?
> perf is currently the main user.
> On Core2, lock cmpxchg8b takes about 24 cycles and writes the
> cacheline, while movlps takes 1 cycle.
> clts/stts probably wipes out the savings if we need to use it, but we
> can keep TS off and restore it lazily on return to userspace.
s/probably/very likely/
CR changes are slow and synchronize the CPU. The latter is always slow.
It sounds like you didn't time it?
> > I'm also suspicious of your use of global register variables.
> > This means they won't be saved on entry/exit of the functions.
> > Does that really work?
> I think it does.
> The functions never change the global register variables, and thus
> they are preserved.
Sounds fragile.
It'll generate worse code because gcc can't use these registers
at all in the C code. Some gcc versions also tend to give up when they run
out of registers too badly.
> Calls are done in inline assembly, which saves the variables if they
> are actually used as parameters (the global register variables are
> only visible in a portion of the C file, of course).
So why don't you simply use normal asm inputs/outputs?
-Andi
--
[email protected] -- Speaking for myself only.
On Thu, 2010-02-18 at 10:53 +0100, Luca Barbieri wrote:
> perf is currently the main user.
> On Core2, lock cmpxchg8b takes about 24 cycles and writes the
> cacheline, while movlps takes 1 cycle.
Then run a 64-bit kernel already; then it's a simple 1-cycle read.
The only platform this might possibly be worth the effort for is Atom,
the rest of the world has moved on to 64bit a long time ago.
There might still be a few pentium-m users out there that might
appreciate this too, but still..
That said, _iff_ this can be done nicely there's no objection.
On Wed, 2010-02-17 at 12:42 +0100, Luca Barbieri wrote:
> +DEFINE_PER_CPU_ALIGNED(struct sse_atomic64_percpu, sse_atomic64_percpu);
> +
> +/* using the fpu/mmx looks infeasible due to the need to save the FPU environment, which is very slow
> + * SSE2 is slightly slower on Core 2 and less compatible, so avoid it for now
> + */
> +long long sse_atomic64_read_cx8call(long long dummy, const atomic64_t *v)
> +{
> + long long res;
> + unsigned long cr0 = 0;
> + struct thread_info *me = current_thread_info();
> + preempt_disable();
> + if (!(me->status & TS_USEDFPU)) {
> + cr0 = read_cr0();
> + if (cr0 & X86_CR0_TS)
> + clts();
> + }
> + asm volatile(
> + "movlps %%xmm0, " __percpu_arg(0) "\n\t"
> + "movlps %3, %%xmm0\n\t"
> + "movlps %%xmm0, " __percpu_arg(1) "\n\t"
> + "movlps " __percpu_arg(0) ", %%xmm0\n\t"
> + : "+m" (per_cpu__sse_atomic64_percpu.xmm0_low), "=m" (per_cpu__sse_atomic64_percpu.low), "=m" (per_cpu__sse_atomic64_percpu.high)
> + : "m" (v->counter));
> + if (cr0 & X86_CR0_TS)
> + write_cr0(cr0);
> + res = (long long)(unsigned)percpu_read(sse_atomic64_percpu.low) | ((long long)(unsigned)percpu_read(sse_atomic64_percpu.high) << 32);
> + preempt_enable();
> + return res;
> +}
> +EXPORT_SYMBOL(sse_atomic64_read_cx8call);
Care to explain how this is IRQ and NMI safe?
On Wed, 2010-02-17 at 12:42 +0100, Luca Barbieri wrote:
> This patch makes atomic64 use either the generic implementation or
> the rewritten cmpxchg8b one just introduced by inserting a "call" to
> either, using the alternatives system to dynamically switch the calls.
>
>> This allows atomic64_t to be used on 386/486, which lack cmpxchg8b
IIRC we dropped <i586 SMP support, and since we don't have a PMU on
those chips atomic64_t doesn't need to be NMI safe, so a simple
UP-IRQ-disable implementation should suffice.
> CR changes are slow and synchronize the CPU. The later is always slow.
>
> It sounds like you didn't time it?
I didn't, because I think it strongly depends on the microarchitecture
and I don't have a comprehensive set of machines to test on, so it
would just be a single data point.
The lock prefix on cmpxchg8b is also serializing so it might be as bad.
Anyway, if we use this, we should keep TS cleared in kernel mode and
lazily restore it on return to userspace.
This would make clts/stts performance mostly moot.
I agree that this feature would need to added too before putting the
SSE atomic64 code in a released kernel.
> It'll generate worse code because gcc can't use these registers
> at all in the C code. Some gcc versions also tend to give up when they run
> out of registers too badly.
Yes, but the C implementations are small and simple, and are only used
on 386/486.
Furthermore, the data in the global register variables is the main
input to the computation.
> So why don't you simply use normal asm inputs/outputs?
I do, on the caller side.
In the callee, I don't see any other robust way to implement parameter
passing in ebx/esi other than global register variables (without
resorting to pure assembly, which would prevent reusing the generic
atomic64 implementation).
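As an illustration of that calling convention, a stripped-down sketch (the
exact registers, names and argument layout are assumptions, not copied from
the posted patches):

/* File-scope register variables pin the incoming arguments to fixed
 * registers, so a plain C function can be the target of a "call" emitted
 * from the inline-asm wrappers with a custom calling convention. */
register u32 a_low asm("ebx");		/* low half of the 64-bit operand   */
register u32 a_high asm("ecx");		/* high half                        */
register atomic64_t *a_ptr asm("esi");	/* the atomic64_t being operated on */

long long generic_atomic64_add_return_cx8call(void)
{
	/* forwards to the generic implementation; returns in %eax:%edx */
	return generic_atomic64_add_return(((u64)a_high << 32) | a_low, a_ptr);
}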
On Thu, Feb 18, 2010 at 11:25 AM, Peter Zijlstra <[email protected]> wrote:
> On Wed, 2010-02-17 at 12:42 +0100, Luca Barbieri wrote:
>> +DEFINE_PER_CPU_ALIGNED(struct sse_atomic64_percpu, sse_atomic64_percpu);
>> +
>> +/* using the fpu/mmx looks infeasible due to the need to save the FPU environment, which is very slow
>> + * SSE2 is slightly slower on Core 2 and less compatible, so avoid it for now
>> + */
>> +long long sse_atomic64_read_cx8call(long long dummy, const atomic64_t *v)
>> +{
>> +	long long res;
>> +	unsigned long cr0 = 0;
>> +	struct thread_info *me = current_thread_info();
>> +	preempt_disable();
>> +	if (!(me->status & TS_USEDFPU)) {
>> +		cr0 = read_cr0();
>> +		if (cr0 & X86_CR0_TS)
>> +			clts();
>> +	}
>> +	asm volatile(
>> +			"movlps %%xmm0, " __percpu_arg(0) "\n\t"
>> +			"movlps %3, %%xmm0\n\t"
>> +			"movlps %%xmm0, " __percpu_arg(1) "\n\t"
>> +			"movlps " __percpu_arg(0) ", %%xmm0\n\t"
>> +			: "+m" (per_cpu__sse_atomic64_percpu.xmm0_low), "=m" (per_cpu__sse_atomic64_percpu.low), "=m" (per_cpu__sse_atomic64_percpu.high)
>> +			: "m" (v->counter));
>> +	if (cr0 & X86_CR0_TS)
>> +		write_cr0(cr0);
>> +	res = (long long)(unsigned)percpu_read(sse_atomic64_percpu.low) | ((long long)(unsigned)percpu_read(sse_atomic64_percpu.high) << 32);
>> +	preempt_enable();
>> +	return res;
>> +}
>> +EXPORT_SYMBOL(sse_atomic64_read_cx8call);
>
> Care to explain how this is IRQ and NMI safe?
Unfortunately it isn't, due to the per-CPU variables, and thus needs
to be fixed to align the stack and use it instead
(__attribute__((force_align_arg_pointer)) should do the job).
Sorry for this, I initially used the stack and later changed it to
guarantee alignment without rechecking the IRQ/NMI safety.
If we use the stack instead of per-CPU variables, all IRQs and NMIs
preserve CR0 and the SSE registers, so this would be safe, right?
kernel_fpu_begin/end cannot be used in interrupts, so that shouldn't
be a concern.
> IIRC we dropped <i586 SMP support, and since we don't have a PMU on
> those chips atomic64_t doesn't need to be NMI safe, so a simple
> UP-IRQ-disable implementation should suffice.
We need the generic version with spinlocks for other architectures,
and reusing it is the cheapest way to support 386/486.
We thus get 386/486 SMP for free, and on UP the spinlocks simplify to
just IRQ disabling.
The only thing we could do is to #ifdef out the hashed spinlock array
in the generic implementation on UP builds, which would save about 1KB
of memory.
That is independent from this patch though.
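For reference, the generic implementation being reused has roughly this shape
(a from-memory sketch in the spirit of lib/atomic64.c; the lock count, the hash
and the exact names may differ):

#define NR_LOCKS 16

static struct {
	spinlock_t lock;
} atomic64_locks[NR_LOCKS] __cacheline_aligned_in_smp;

static inline spinlock_t *lock_addr(const atomic64_t *v)
{
	unsigned long addr = (unsigned long)v >> L1_CACHE_SHIFT;

	return &atomic64_locks[addr & (NR_LOCKS - 1)].lock;
}

long long generic_atomic64_read(const atomic64_t *v)
{
	unsigned long flags;
	spinlock_t *lock = lock_addr(v);
	long long val;

	spin_lock_irqsave(lock, flags);	/* reduces to IRQ disabling on UP */
	val = v->counter;
	spin_unlock_irqrestore(lock, flags);
	return val;
}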
On Thu, 2010-02-18 at 11:50 +0100, Luca Barbieri wrote:
> If we use the stack instead of per-CPU variables, all IRQs and NMIs
> preserve CR0 and the SSE registers, so this would be safe, right?
You'd have to take special care to deal with nested IRQs I think.
>> If we use the stack instead of per-CPU variables, all IRQs and NMIs
>> preserve CR0 and the SSE registers, so this would be safe, right?
>
> You'd have to take special care to deal with nested IRQs I think.
Could you elaborate on that?
Which issue could there be with nested IRQs?
On Thu, 2010-02-18 at 13:29 +0100, Luca Barbieri wrote:
> >> If we use the stack instead of per-CPU variables, all IRQs and NMIs
> >> preserve CR0 and the SSE registers, so this would be safe, right?
> >
> > You'd have to take special care to deal with nested IRQs I think.
>
> Could you elaborate on that?
> Which issue could there be with nested IRQs?
Depends on where on the stack you're going to save things, I thought
you'd take space in the thread_info struct, but I guess if you're simply
going to push the reg onto the stack it should be fine.
> Depends on where on the stack you're going to save things, I thought
> you'd take space in the thread_info struct, but I guess if you're simply
> going to push the reg onto the stack it should be fine.
Yes, this seems the best solution.
With frame pointers enabled, it's just a single andl $-8, %esp to
align the stack (otherwise, frame pointers are forced by gcc).
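A minimal sketch of such a stack-based variant (the name and the alignment
details are illustrative assumptions, and the TS/CR0 handling is elided):

/* Hypothetical IRQ/NMI-safe variant: spill %xmm0 to an aligned slot on
 * the current stack instead of to per-CPU memory. */
__attribute__((force_align_arg_pointer))
long long sse_atomic64_read_stack(const atomic64_t *v)
{
	long long res;
	u64 xmm0_save __attribute__((aligned(8)));	/* lives on this stack */

	preempt_disable();
	asm volatile("movlps %%xmm0, %0\n\t"	/* save caller's %xmm0 low half */
		     "movlps %2, %%xmm0\n\t"	/* atomic 64-bit read           */
		     "movlps %%xmm0, %1\n\t"	/* spill the value              */
		     "movlps %0, %%xmm0"	/* restore %xmm0                */
		     : "+m" (xmm0_save), "=m" (res)
		     : "m" (v->counter));
	preempt_enable();
	return res;
}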
On 02/18/2010 02:25 AM, Peter Zijlstra wrote:
> On Wed, 2010-02-17 at 12:42 +0100, Luca Barbieri wrote:
>> This patch makes atomic64 use either the generic implementation or
>> the rewritten cmpxchg8b one just introduced by inserting a "call" to
>> either, using the alternatives system to dynamically switch the calls.
>>
>> This allows atomic64_t to be used on 386/486, which lack cmpxchg8b
>
> IIRC we dropped <i586 SMP support, and since we don't have a PMU on
> those chips atomic64_t doesn't need to be NMI safe, so a simple
> UP-IRQ-disable implementation should suffice.
... which is what we have now, with the cmpxchg8b alternates emulation.
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
On 02/18/2010 02:27 AM, Luca Barbieri wrote:
> CR changes are slow and synchronize the CPU. The latter is always slow.
>>
>> It sounds like you didn't time it?
> I didn't, because I think it strongly depends on the microarchitecture
> and I don't have a comprehensive set of machines to test on, so it
> would just be a single data point.
>
> The lock prefix on cmpxchg8b is also serializing so it might be as bad.
No. LOCK isn't serializing in the same way CRx writes are.
> Anyway, if we use this, we should keep TS cleared in kernel mode and
> lazily restore it on return to userspace.
> This would make clts/stts performance mostly moot.
This is what kernel_fpu_begin/kernel_fpu_end is all about. We
definitely cannot leave TS cleared without the user space CPU state
moved to its home location, or we have yet another complicated state to
worry about.
I really feel that without a *strong* use case for this, there is
absolutely no point.
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
On 02/18/2010 02:27 AM, Luca Barbieri wrote:
>> So why don't you simply use normal asm inputs/outputs?
> I do, on the caller side.
>
> In the callee, I don't see any other robust way to implement parameter
> passing in ebx/esi other than global register variables (without
> resorting to pure assembly, which would prevent reusing the generic
> atomic64 implementation).
This really sounds like the wrong optimization. These functions aren't
exactly all that complex in assembly (which would also allow them to be
simple cli/do stuff/sti), and instead relying on gcc features which may
or may not be well supported on x86 is inviting breakage down the line.
That is particularly damaging, since the remaining 486-class users tend
to be deeply embedded and thus we only find problems later.
-hpa
> This is what kernel_fpu_begin/kernel_fpu_end is all about. We
> definitely cannot leave TS cleared without the user space CPU state
> moved to its home location, or we have yet another complicated state to
> worry about.
It should be relatively simple to handle, since the current code
doesn't really rely on the TS flag but uses TS_USEDFPU.
It would mostly be a matter of making sure TS is restored on return to
userspace if necessary.
> I really feel that without a *strong* use case for this, there is
> absolutely no point.
For the specific 32-bit atomic64_t case, it is an improvement, but not
necessarily significant in the big picture.
Being able to efficiently use SSE in the kernel might however be more
broadly useful.
memcpy/memset/etc. (assuming SSE is the best option for these at least
on some processors) and checksums come to mind.
Also non-temporal SSE moves might be useful for things like memory
compaction without clobbering caches.
On 02/18/2010 10:14 AM, Luca Barbieri wrote:
>
>> I really feel that without a *strong* use case for this, there is
>> absolutely no point.
> For the specific 32-bit atomic64_t case, it is an improvement, but not
> necessarily significant in the big picture.
> Being able to efficiently use SSE in the kernel might however be more
> broadly useful.
> memcpy/memset/etc. (assuming SSE is the best option for these at least
> on some processors) and checksums come to mind.
> Also non-temporal SSE moves might be useful for things like memory
> compaction without clobbering caches.
We already do that kind of stuff, using
kernel_fpu_begin()..kernel_fpu_end(). We went through some pain a bit
ago to clean up "private hacks" that complicated things substantially.
-hpa
> We already do that kind of stuff, using
>> kernel_fpu_begin()..kernel_fpu_end(). We went through some pain a bit
> ago to clean up "private hacks" that complicated things substantially.
But that saves the whole FPU state on the first usage, and also
triggers a fault when userspace attempts to use it again.
Additionally, it does a clts/stts every time, which is slow for small
algorithms (like the atomic64 routines).
The first issue can be solved by using SSE and saving only the used
registers, and the second with lazy TS flag restoring.
How about something like:
static inline unsigned long kernel_sse_begin(void)
{
	struct thread_info *me = current_thread_info();

	preempt_disable();
	if (unlikely(!(me->status & TS_USEDFPU))) {
		unsigned long cr0 = read_cr0();
		if (unlikely(cr0 & X86_CR0_TS)) {
			clts();
			return cr0;
		}
	}
	return 0;
}

static inline void kernel_sse_end(unsigned long cr0)
{
	if (unlikely(cr0))
		write_cr0(cr0);
	preempt_enable();
}
to be improved with lazy TS restoring instead of the read_cr0/write_cr0?
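Hypothetical usage of the sketch above for a 64-bit store (the names and the
%xmm0 save slot are illustrative):

void sse_atomic64_set(atomic64_t *v, long long i)
{
	u64 xmm0_save __attribute__((aligned(8)));
	unsigned long cr0 = kernel_sse_begin();

	asm volatile("movlps %%xmm0, %0\n\t"	/* save %xmm0 low half */
		     "movlps %2, %%xmm0\n\t"	/* load the new value  */
		     "movlps %%xmm0, %1\n\t"	/* atomic 64-bit store */
		     "movlps %0, %%xmm0"	/* restore %xmm0       */
		     : "+m" (xmm0_save), "=m" (v->counter)
		     : "m" (i));

	kernel_sse_end(cr0);
}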
> This really sounds like the wrong optimization. These functions aren't
> exactly all that complex in assembly (which would also allow them to be
> simple cli/do stuff/sti), and instead relying on gcc features which may
> or may not be well supported on x86 is inviting breakage down the line.
>
> That is particularly damaging, since the remaining 486-class users tend
> to be deeply embedded and thus we only find problems later.
There is the downside of adding a whole 386/486-specific implementation.
It's not too hard though, and if we don't care about 486 SMP, it may
be a better option.
On 02/18/2010 10:49 AM, Luca Barbieri wrote:
>> This really sounds like the wrong optimization. These functions aren't
>> exactly all that complex in assembly (which would also allow them to be
>> simple cli/do stuff/sti), and instead relying on gcc features which may
>> or may not be well supported on x86 is inviting breakage down the line.
>>
>> That is particularly damaging, since the remaining 486-class users tend
>> to be deeply embedded and thus we only find problems later.
>
> There is the downside of adding a whole 386/486-specific implementation.
>
> It's not too hard though, and if we don't care about 486 SMP, it may
> be a better option.
We don't care about 486 SMP. And yes, I think it is a better option.
At least an assembly option, once implemented, will *stay* implemented
and won't break due to some obscure gcc change.
-hpa
On 02/18/2010 10:42 AM, Luca Barbieri wrote:
>> We already do that kind of stuff, using
>> kernel_fpu_begin()..kernel_fpu_end(). We went through some pain a bit
>> ago to clean up "private hacks" that complicated things substantially.
>
> But that saves the whole FPU state on the first usage, and also
> triggers a fault when userspace attempts to use it again.
> Additionally it does a clts/stts every time which is slow for small
> algorithms (lke the atomic64 routines).
>
> The first issue can be solved by using SSE and saving only the used
> registers, and the second with lazy TS flag restoring.
>
Again, I want to see a strong use case before even *considering* making
the rules we already have any more complex.
-hpa
On 02/17/2010 03:42 AM, Luca Barbieri wrote:
> Currently CALL and JMP cannot be used in alternatives because the
> relative offset would be wrong.
>
> This patch uses the existing x86 instruction parser to parse the
> alternative sequence and fix up the displacements.
>
> This allows the feature to be implemented with minimal code.
>
The existing instruction parser is only present if KPROBES is configured
in... this patch would make it obligatory. Your patch doesn't reflect
that. Furthermore, it is ~16K of code and data which probably will make
embedded people unhappy... although perhaps isn't out of line.
A good question, though, is if we actually need support for JMP or CALL
as anything but the first instruction (usually the *only* instruction),
which would make it a lot easier...
-hpa
> We don't care about 486 SMP. And yes, I think it is a better option.
> At least an assembly option, once implemented, will *stay* implemented
> and won't break due to some obscure gcc change.
After looking at it a bit more I fully agree: I'll resubmit with a
cli/popf assembly implementation and without the SSE code (which can
be done later if desired).
> We don't care about 486 SMP.
Yep, I remember when a previous patch broke 486 SMP support and, since then, it seemed like nobody cared. The closest to such a system I have seen were P5 systems using the 82489DX APIC. Does anybody have a dmesg of such a system?
Yuhong Bao
> memcpy/memset/etc. (assuming SSE is the best option for these at least
> on some processors) and checksums come to mind.
Typically SSE advantages only come into play with very large data sets,
and the kernel's are not big enough. I tried it some time ago.
That said you can already use kernel_fpu_begin/end of course
and it's used in some very rare cases.
> Also non-temporal SSE moves might be useful for things like memory
> compaction without clobbering caches.
You don't need SSE for non-temporal moves.
-Andi
--
[email protected] -- Speaking for myself only.
On 02/17/2010 04:47 AM, Avi Kivity wrote:
> On 02/17/2010 01:42 PM, Luca Barbieri wrote:
>> This patch modifies the x86 alternative macros to allow more than one
>> alternative code sequence.
>>
> Suggest documenting the precedence of alternatives: if both feature1 and
> feature2 are present, which newinstr is patched in?
>
Looks like the last one in the sequence takes effect, which is
consistent with the operation of the single ALTERNATIVE() macro ...
anything else would be sheer lunacy.
-hpa
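A made-up illustration of that precedence (the symbols are invented and the
operands/clobbers are omitted for brevity):

/* With both CX8 and XMM set, both patches are applied in order and the
 * XMM sequence is written last, so it is what finally runs; a CPU with
 * only CX8 gets the middle variant, and a 386/486 keeps the oldinstr. */
asm volatile(ALTERNATIVE3("call generic_atomic64_read_cx8call",
			  "call cx8_atomic64_read_cx8call", X86_FEATURE_CX8,
			  "call sse_atomic64_read_cx8call", X86_FEATURE_XMM));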
> The existing instruction parser is only present if KPROBES is configured
> in... this patch would make it obligatory. ?Your patch doesn't reflect
> that. ?Furthermore, it is ~16K of code and data which probably will make
> embedded people unhappy... although perhaps isn't out of line.
Didn't notice that.
> A good question, though, is if we actually need support for JMP or CALL
> as anything but the first instruction (usually the *only* instruction),
> which would make it a lot easier...
Probably not.
I think an even better option is to create a CALL if replacementlen is 0xff.
This would save some space for each callsite and make it clear we only
support this usage.
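A rough sketch of that idea as it might sit inside the apply_alternatives()
loop (the 0xff convention and the details here are hypothetical):

		if (a->replacementlen == 0xff) {
			/* a->replacement is a call target, not a byte
			 * sequence: synthesize "call rel32" in place.
			 * Assumes a->instrlen >= 5. */
			insnbuf[0] = 0xe8;
			*(s32 *)(insnbuf + 1) = a->replacement - (instr + 5);
			add_nops(insnbuf + 5, a->instrlen - 5);
			text_poke_early(instr, insnbuf, a->instrlen);
			continue;
		}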
On 02/18/2010 03:38 PM, Luca Barbieri wrote:
>> The existing instruction parser is only present if KPROBES is configured
>> in... this patch would make it obligatory. Your patch doesn't reflect
>> that. Furthermore, it is ~16K of code and data which probably will make
>> embedded people unhappy... although perhaps isn't out of line.
> Didn't notice that.
>
>> A good question, though, is if we actually need support for JMP or CALL
>> as anything but the first instruction (usually the *only* instruction),
>> which would make it a lot easier...
>
> Probably not.
> I think an even better option is to create a CALL if replacementlen is 0xff.
> This would save some space for each callsite and make it clear we only
> support this usage.
That is an idea.
If we're going to do a full setup, it would be nice to handle general
relocations -- in addition to JMP and CALL, this would also allow
RIP-relative memory operands. However, I think if we're going to do
that, it would be better to extract relocations at compile time (either
directly or via disassembly, which is basically what you're doing)
rather than at runtime.
-hpa
Luca Barbieri wrote:
> Currently CALL and JMP cannot be used in alternatives because the
> relative offset would be wrong.
>
> This patch uses the existing x86 instruction parser to parse the
> alternative sequence and fix up the displacements.
>
> This allows the feature to be implemented with minimal code.
>
> Signed-off-by: Luca Barbieri <[email protected]>
> ---
> arch/x86/kernel/alternative.c | 22 ++++++++++++++++++++++
> 1 files changed, 22 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index de7353c..7464c7e 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -17,6 +17,7 @@
> #include <asm/tlbflush.h>
> #include <asm/io.h>
> #include <asm/fixmap.h>
> +#include <asm/insn.h>
>
> #define MAX_PATCH_LEN (255-1)
>
> @@ -195,6 +196,26 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
> extern u8 *__smp_locks[], *__smp_locks_end[];
> static void *text_poke_early(void *addr, const void *opcode, size_t len);
>
Hmm, first of all, if those alternative code sequences are hand-written, I think
you don't need to use the insn decoder, because you already know what code
will be there (or, as Peter said, it should be done at compile time, because
the adjustment can be determined for each site...).
Anyway, I have some comments on it.
> +/* Fix call instruction displacements */
> +static void __init_or_module fixup_relative_addresses(char *buf, size_t size, size_t adj)
> +{
> + struct insn insn;
> + char *p = buf;
> + char *end = p + size;
> + while (p < end) {
> + kernel_insn_init(&insn, p);
> + insn_get_opcode(&insn);
Here, if you need to know the destination address encoded as an imm operand,
you can use insn_get_immediate() for that.
> +
> + /* CALL or JMP near32 */
> + if (insn.opcode.bytes[0] == 0xe8 || insn.opcode.bytes[0] == 0xe9) {
> + size_t disp_off = insn.next_byte - insn.kaddr;
> + *(unsigned *)(p + disp_off) += adj;
And also, you can use insn_offset_immediate() instead of disp_off, here.
Please see fix_riprel() in arch/x86/kernel/kprobes.c.
Thank you,
--
Masami Hiramatsu
e-mail: [email protected]
> And in fact, the Linux setup code already has code similar to the code used in XP to disable this NT4 workaround.
Unfortunately, in virtual machines this often doesn't work:
http://www.virtualbox.org/ticket/370
http://www.virtualbox.org/ticket/774
http://www.virtualbox.org/ticket/4874
Yuhong Bao