Actually, on the 386, cmpxchg and cmpxchg_local fall back on
cmpxchg_386_u8/16/32: these helpers disable interrupts around a non-atomic
update to mimic the cmpxchg behavior.
The comment:
/* Poor man's cmpxchg for 386. Unsuitable for SMP */
already present in cmpxchg_386_u32 says a lot about how this cmpxchg
implementation should not be used in an SMP context. However, cmpxchg_local
can safely use this fallback, since it only needs to be atomic wrt the local
CPU.
This patch adds a cmpxchg_486_u64 and uses it as a fallback for cmpxchg64
and cmpxchg64_local on 80386 and 80486.
Q:
But why is it called cmpxchg_486 when the other functions are called
cmpxchg_386?
A:
Because the standard cmpxchg is missing only on 386, but cmpxchg8b is
missing both on 386 and 486.
Citing Intel's Instruction set reference:
cmpxchg:
This instruction is not supported on Intel processors earlier than the
Intel486 processors.
cmpxchg8b:
This instruction encoding is not supported on Intel processors earlier
than the Pentium processors.
Q:
What's the reason to have cmpxchg64_local on 32-bit architectures?
Without that need, all this would just be a few simple defines.
A:
cmpxchg64_local on 32-bit architectures takes unsigned long long
parameters, whereas cmpxchg_local only takes unsigned longs. Since we have
cmpxchg8b to execute an 8-byte cmpxchg atomically on the Pentium and later,
it makes sense to provide a flavor of cmpxchg and cmpxchg_local using this
instruction.
Also, for 32-bit architectures lacking a 64-bit atomic cmpxchg, it makes
sense _not_ to define cmpxchg64 while cmpxchg could still be available.
Moreover, the cmpxchg8b fallback on i386 (needed for the 386 and 486) is a
different case from the cmpxchg fallback (which is only required for the
386). Using different code makes this easier.
However, cmpxchg64_local will be emulated by disabling interrupts on all
architectures where it is not supported atomically.
Therefore, we *could* turn cmpxchg64_local into a cmpxchg_local, but it
would make the 386/486 fallbacks ugly, make its design different from
cmpxchg/cmpxchg64 (which really depend on atomic operations and cannot
be emulated), and require __cmpxchg_local to be expressed as a macro
rather than an inline function so that the parameters would not be fixed
to unsigned long long in every case.
So I think cmpxchg64_local makes sense there, but I am open to
suggestions.
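For illustration, a user of the 64-bit local flavor could look roughly like
this (a hypothetical sketch, not part of the patch; the per-cpu variable and
function names are made up):

#include <linux/percpu.h>
#include <asm/cmpxchg.h>

/* Hypothetical 64-bit per-cpu counter, also updated from interrupt context. */
static DEFINE_PER_CPU(u64, traced_bytes);

static void account_bytes(u64 inc)
{
        u64 *p = &get_cpu_var(traced_bytes);    /* disables preemption */
        u64 old;

        /*
         * Atomic wrt interrupts on the local CPU: cmpxchg8b on 586+, the
         * interrupt-disabling cmpxchg_486_u64() fallback on 386/486.
         */
        do {
                old = *p;
        } while (cmpxchg64_local(p, old, old + inc) != old);

        put_cpu_var(traced_bytes);
}

If other CPUs could update the same location, cmpxchg64() (with the LOCK
prefix) would have to be used instead.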
Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: [email protected]
CC: [email protected]
---
arch/i386/kernel/cpu/intel.c | 17 +++++++
include/asm-i386/cmpxchg.h | 100 +++++++++++++++++++++++++++++--------------
2 files changed, 85 insertions(+), 32 deletions(-)
Index: linux-2.6-lttng/arch/i386/kernel/cpu/intel.c
===================================================================
--- linux-2.6-lttng.orig/arch/i386/kernel/cpu/intel.c 2007-08-07 11:00:43.000000000 -0400
+++ linux-2.6-lttng/arch/i386/kernel/cpu/intel.c 2007-08-07 11:11:07.000000000 -0400
@@ -329,5 +329,22 @@ unsigned long cmpxchg_386_u32(volatile v
EXPORT_SYMBOL(cmpxchg_386_u32);
#endif
+#ifndef CONFIG_X86_CMPXCHG64
+unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
+{
+ u64 prev;
+ unsigned long flags;
+
+ /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
+ local_irq_save(flags);
+ prev = *(u64 *)ptr;
+ if (prev == old)
+ *(u64 *)ptr = new;
+ local_irq_restore(flags);
+ return prev;
+}
+EXPORT_SYMBOL(cmpxchg_486_u64);
+#endif
+
// arch_initcall(intel_cpu_init);
Index: linux-2.6-lttng/include/asm-i386/cmpxchg.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-i386/cmpxchg.h 2007-08-07 11:01:17.000000000 -0400
+++ linux-2.6-lttng/include/asm-i386/cmpxchg.h 2007-08-07 11:16:42.000000000 -0400
@@ -116,6 +116,15 @@ static inline unsigned long __xchg(unsig
(unsigned long)(n),sizeof(*(ptr))))
#endif
+#ifdef CONFIG_X86_CMPXCHG64
+#define cmpxchg64(ptr,o,n)\
+ ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
+ (unsigned long long)(n)))
+#define cmpxchg64_local(ptr,o,n)\
+ ((__typeof__(*(ptr)))__cmpxchg64_local((ptr),(unsigned long long)(o),\
+ (unsigned long long)(n)))
+#endif
+
static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
unsigned long new, int size)
{
@@ -203,6 +212,34 @@ static inline unsigned long __cmpxchg_lo
return old;
}
+static inline unsigned long long __cmpxchg64(volatile void *ptr,
+ unsigned long long old, unsigned long long new)
+{
+ unsigned long long prev;
+ __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
+ : "=A"(prev)
+ : "b"((unsigned long)new),
+ "c"((unsigned long)(new >> 32)),
+ "m"(*__xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+}
+
+static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
+ unsigned long long old, unsigned long long new)
+{
+ unsigned long long prev;
+ __asm__ __volatile__("cmpxchg8b %3"
+ : "=A"(prev)
+ : "b"((unsigned long)new),
+ "c"((unsigned long)(new >> 32)),
+ "m"(*__xg(ptr)),
+ "0"(old)
+ : "memory");
+ return prev;
+}
+
#ifndef CONFIG_X86_CMPXCHG
/*
* Building a kernel capable running on 80386. It may be necessary to
@@ -252,38 +289,37 @@ static inline unsigned long cmpxchg_386(
})
#endif
-static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
- unsigned long long new)
-{
- unsigned long long prev;
- __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
- "m"(*__xg(ptr)),
- "0"(old)
- : "memory");
- return prev;
-}
+#ifndef CONFIG_X86_CMPXCHG64
+/*
+ * Building a kernel capable running on 80386 and 80486. It may be necessary
+ * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
+ */
+
+extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
+
+#define cmpxchg64(ptr,o,n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 4)) \
+ __ret = __cmpxchg64((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ else \
+ __ret = cmpxchg_486_u64((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ __ret; \
+})
+#define cmpxchg64_local(ptr,o,n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ if (likely(boot_cpu_data.x86 > 4)) \
+ __ret = __cmpxchg64_local((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ else \
+ __ret = cmpxchg_486_u64((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)); \
+ __ret; \
+})
-static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
- unsigned long long old, unsigned long long new)
-{
- unsigned long long prev;
- __asm__ __volatile__("cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
- "m"(*__xg(ptr)),
- "0"(old)
- : "memory");
- return prev;
-}
+#endif
-#define cmpxchg64(ptr,o,n)\
- ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
- (unsigned long long)(n)))
-#define cmpxchg64_local(ptr,o,n)\
- ((__typeof__(*(ptr)))__cmpxchg64_local((ptr),(unsigned long long)(o),\
- (unsigned long long)(n)))
#endif
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Mon, 20 Aug 2007, Mathieu Desnoyers wrote:
> already present in cmpxchg_386_u32 says a lot about how this cmpxchg
> implementation should not be used in an SMP context. However, cmpxchg_local
> can safely use this fallback, since it only needs to be atomic wrt the local
> CPU.
Do you have an example where the use of cmpxchg_local is a performance
benefit when actually used in code? cmpxchg is already problematic
functionality since it is not available on all platforms.
* Christoph Lameter ([email protected]) wrote:
> On Mon, 20 Aug 2007, Mathieu Desnoyers wrote:
>
> > already present in cmpxchg_386_u32 says a lot about how this cmpxchg
> > implementation should not be used in an SMP context. However, cmpxchg_local
> > can safely use this fallback, since it only needs to be atomic wrt the local
> > CPU.
>
> Do you have an example where the use of cmpxchg_local is a performance
> benefit when actually used in code? cmpxchg is already problematic
> functionality since it is not available on all platforms.
Yes, I use it as the synchronization mechanism for my buffer management
algorithm in LTTng. Since I write to per-cpu buffers and want to be as
reentrant as possible wrt other contexts (with NMIs as the worst case, but
this also applies to MCEs), I use local_cmpxchg to reserve space in my
buffers. It is faster than the standard cmpxchg.
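For the curious, the reservation fast path has roughly this shape (a
simplified sketch, not the actual LTTng code; the structure and function
names are made up):

#include <asm/local.h>

/*
 * Per-cpu buffer write position: only the local CPU updates it, possibly
 * from nested IRQ/NMI contexts, so the non-LOCK'd local_cmpxchg() is enough.
 */
struct cpu_buffer {
        local_t write_offset;   /* current write offset in the buffer */
        long size;              /* buffer size in bytes */
};

/* Returns the start offset of the reserved slot, or -1 if the buffer is full. */
static long reserve_slot(struct cpu_buffer *buf, long len)
{
        long old, new;

        do {
                old = local_read(&buf->write_offset);
                new = old + len;
                if (new > buf->size)
                        return -1;      /* caller falls back to its slow path */
        } while (local_cmpxchg(&buf->write_offset, old, new) != old);

        return old;
}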
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Mon, 20 Aug 2007, Mathieu Desnoyers wrote:
> Yes, I use it as the synchronization mechanism for my buffer management
> algorithm in LTTng. Since I write to per-cpu buffers and want to be as
> reentrant as possible wrt other contexts (with NMIs as the worst case, but
> this also applies to MCEs), I use local_cmpxchg to reserve space in my
> buffers. It is faster than the standard cmpxchg.
Ok I have seen these numbers in the OLS papers but I could not reproduce
them in SLUB.
* Christoph Lameter ([email protected]) wrote:
> On Mon, 20 Aug 2007, Mathieu Desnoyers wrote:
>
> > Yes, I use it as the synchronization mechanism for my buffer management
> > algorithm in LTTng. Since I write to per-cpu buffers and want to be as
> > reentrant as possible wrt other contexts (with NMIs as the worst case, but
> > this also applies to MCEs), I use local_cmpxchg to reserve space in my
> > buffers. It is faster than the standard cmpxchg.
>
> Ok I have seen these numbers in the OLS papers but I could not reproduce
> them in SLUB.
>
I'm digging in the slub with cmpxchg_local patch... first detail:
slab_alloc seems to have a return path that does not reenable
preemption... I'll keep you posted when I finish the 2.6.23-rc2-mm2
port.
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Mon, 20 Aug 2007, Mathieu Desnoyers wrote:
> I'm digging in the slub with cmpxchg_local patch... first detail:
> slab_alloc seems to have a return path that does not reenable
> preemption... I'll keep you posted when I finish the 2.6.23-rc2-mm2
> port.
I have a patchset here if that would help you?
* Christoph Lameter ([email protected]) wrote:
> On Mon, 20 Aug 2007, Mathieu Desnoyers wrote:
>
> > I'm digging in the slub with cmpxchg_local patch... first detail:
> > slab_alloc seems to have a return path that does not reenable
> > preemption... I'll keep you posted when I finish the 2.6.23-rc2-mm2
> > port.
>
> I have a patchset here if that would help you?
>
Sure, I'd like to give it a try.
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Mon, 20 Aug 2007, Mathieu Desnoyers wrote:
> * Christoph Lameter ([email protected]) wrote:
> > On Mon, 20 Aug 2007, Mathieu Desnoyers wrote:
> >
> > > I'm digging in the slub with cmpxchg_local patch... first detail:
> > > slab_alloc seems to have a return path that does not reenable
> > > preemption... I'll keep you posted when I finish the 2.6.23-rc2-mm2
> > > port.
> >
> > I have a patchset here if that would help you?
> >
>
> Sure, I'd like to give it a try.
This applies on top of
http://git.kernel.org/?p=linux/kernel/git/christoph/slab.git;a=shortlog;h=performance
SLUB: Single atomic instruction alloc/free using cmpxchg
A cmpxchg allows us to avoid disabling and enabling interrupts. The cmpxchg
lets us operate on the per-cpu freelist even if we may be moved to another
processor while getting to the cmpxchg, so we do not need to be pinned to a
cpu. This may be particularly useful for the RT kernel, where we currently
seem to have major SLAB issues with the per-cpu structures. And avoiding the
constant interrupt disable/enable around slab operations also increases
performance in general.
The hard binding to per cpu structures only comes into play when we enter
the slow path (__slab_alloc and __slab_free). At that point we have to disable
interrupts like before.
We have a problem determining the page struct in slab_free due to the fact
that the freelist pointer is the only data value that we can reliably
operate on. So we need to do a virt_to_page() on the freelist. This makes it
impossible to use the fastpath for a full slab and increases overhead
through a second virt_to_page for each slab_free(). We really need the
virtual memmap patchset to get slab_free to good performance for this one.
Pro:
- Dirty a single cacheline with a single instruction in
slab_alloc to accomplish the allocation.
- Critical section is also a single instruction in slab_free.
(but we need to write to the cacheline of the object too)
Con:
- Complex freelist management. __slab_alloc has to deal
with results of race conditions.
- Recalculation of the per-cpu structure address is necessary
in __slab_alloc since the process may be rescheduled while
executing in slab_alloc.
Signed-off-by: Christoph Lameter <[email protected]>
---
include/linux/slub_def.h | 10 ++--
mm/slub.c | 100 ++++++++++++++++++++++++++++++++---------------
2 files changed, 74 insertions(+), 36 deletions(-)
Index: linux-2.6.23-rc1/mm/slub.c
===================================================================
--- linux-2.6.23-rc1.orig/mm/slub.c 2007-07-27 19:58:32.000000000 -0700
+++ linux-2.6.23-rc1/mm/slub.c 2007-07-27 21:15:27.000000000 -0700
@@ -1346,34 +1346,38 @@ static void unfreeze_slab(struct kmem_ca
/*
* Remove the cpu slab
*/
-static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c,
+ void **freelist)
{
struct page *page = c->page;
+
+ c->page = NULL;
/*
* Merge cpu freelist into freelist. Typically we get here
* because both freelists are empty. So this is unlikely
* to occur.
*/
- while (unlikely(c->freelist)) {
+ while (unlikely(freelist)) {
void **object;
/* Retrieve object from cpu_freelist */
- object = c->freelist;
- c->freelist = c->freelist[c->offset];
+ object = freelist;
+ freelist = freelist[c->offset];
/* And put onto the regular freelist */
object[c->offset] = page->freelist;
page->freelist = object;
page->inuse--;
}
- c->page = NULL;
unfreeze_slab(s, page);
}
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
+ void **freelist = xchg(&c->freelist, NULL);
+
slab_lock(c->page);
- deactivate_slab(s, c);
+ deactivate_slab(s, c, freelist);
}
/*
@@ -1439,17 +1443,31 @@ static inline int node_match(struct kmem
* we need to allocate a new slab. This is slowest path since we may sleep.
*/
static void *__slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
+ gfp_t gfpflags, int node, void *addr)
{
void **object;
struct page *new;
+ struct kmem_cache_cpu *c;
+ void **freelist = NULL;
+ unsigned long flags;
+ local_irq_save(flags);
+ c = get_cpu_slab(s, smp_processor_id());
if (!c->page)
+ /* Slab was flushed */
goto new_slab;
+ freelist = xchg(&c->freelist, NULL);
+
slab_lock(c->page);
if (unlikely(!node_match(c, node)))
goto another_slab;
+
+ if (unlikely(freelist)) {
+ object = freelist;
+ goto out_object;
+ }
+
load_freelist:
object = c->page->freelist;
if (unlikely(!object))
@@ -1458,15 +1476,20 @@ load_freelist:
goto debug;
object = c->page->freelist;
- c->freelist = object[c->offset];
c->page->inuse = s->objects;
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
+out_object:
+ c->freelist = object[c->offset];
+out:
slab_unlock(c->page);
+ local_irq_restore(flags);
+ if (unlikely((gfpflags & __GFP_ZERO)))
+ memset(object, 0, c->objsize);
return object;
another_slab:
- deactivate_slab(s, c);
+ deactivate_slab(s, c, freelist);
new_slab:
new = get_partial(s, gfpflags, node);
@@ -1503,6 +1526,7 @@ new_slab:
c->page = new;
goto load_freelist;
}
+ local_irq_restore(flags);
return NULL;
debug:
object = c->page->freelist;
@@ -1512,8 +1536,7 @@ debug:
c->page->inuse++;
c->page->freelist = object[c->offset];
c->node = -1;
- slab_unlock(c->page);
- return object;
+ goto out;
}
/*
@@ -1530,25 +1553,28 @@ static void __always_inline *slab_alloc(
gfp_t gfpflags, int node, void *addr)
{
void **object;
- unsigned long flags;
struct kmem_cache_cpu *c;
- local_irq_save(flags);
- c = get_cpu_slab(s, smp_processor_id());
- if (unlikely(!c->freelist || !node_match(c, node)))
+redo:
+ c = get_cpu_slab(s, raw_smp_processor_id());
+ object = c->freelist;
+ if (unlikely(!object))
+ goto slow;
- object = __slab_alloc(s, gfpflags, node, addr, c);
+ if (unlikely(!node_match(c, node)))
+ goto slow;
- else {
- object = c->freelist;
- c->freelist = object[c->offset];
- }
- local_irq_restore(flags);
+ if (unlikely(cmpxchg(&c->freelist, object,
+ object[c->offset]) != object))
+ goto redo;
- if (unlikely((gfpflags & __GFP_ZERO) && object))
+ if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
return object;
+slow:
+ return __slab_alloc(s, gfpflags, node, addr);
+
}
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1578,7 +1604,9 @@ static void __slab_free(struct kmem_cach
{
void *prior;
void **object = (void *)x;
+ unsigned long flags;
+ local_irq_save(flags);
slab_lock(page);
if (unlikely(SlabDebug(page)))
@@ -1604,6 +1632,7 @@ checks_ok:
out_unlock:
slab_unlock(page);
+ local_irq_restore(flags);
return;
slab_empty:
@@ -1614,6 +1643,7 @@ slab_empty:
remove_partial(s, page);
slab_unlock(page);
+ local_irq_restore(flags);
discard_slab(s, page);
return;
@@ -1638,18 +1668,26 @@ static void __always_inline slab_free(st
struct page *page, void *x, void *addr)
{
void **object = (void *)x;
- unsigned long flags;
+ void **freelist;
struct kmem_cache_cpu *c;
- local_irq_save(flags);
- c = get_cpu_slab(s, smp_processor_id());
- if (likely(page == c->page && c->node >= 0)) {
- object[c->offset] = c->freelist;
- c->freelist = object;
- } else
- __slab_free(s, page, x, addr, c->offset);
+ c = get_cpu_slab(s, raw_smp_processor_id());
+ if (unlikely(c->node >= 0))
+ goto slow;
+
+redo:
+ freelist = c->freelist;
+ smp_rmb();
+ if (unlikely(page != c->page))
+ goto slow;
- local_irq_restore(flags);
+ object[c->offset] = freelist;
+
+ if (unlikely(cmpxchg_local(&c->freelist, freelist, object) != freelist))
+ goto redo;
+ return;
+slow:
+ __slab_free(s, page, x, addr, c->offset);
}
void kmem_cache_free(struct kmem_cache *s, void *x)
Index: linux-2.6.23-rc1/include/linux/slub_def.h
===================================================================
--- linux-2.6.23-rc1.orig/include/linux/slub_def.h 2007-07-27 19:30:03.000000000 -0700
+++ linux-2.6.23-rc1/include/linux/slub_def.h 2007-07-27 21:15:27.000000000 -0700
@@ -12,11 +12,11 @@
#include <linux/kobject.h>
struct kmem_cache_cpu {
- void **freelist;
- struct page *page;
- int node;
- unsigned int offset;
- unsigned int objsize;
+ void **freelist; /* Updated through atomic ops */
+ struct page *page; /* Updated with interrupts disabled */
+ int node; /* Updated with interrupts disabled */
+ unsigned int offset; /* Set up on kmem_cache_create() */
+ unsigned int objsize; /* Set up on kmem_cache_create() */
};
struct kmem_cache_node {
* Christoph Lameter ([email protected]) wrote:
> @@ -1638,18 +1668,26 @@ static void __always_inline slab_free(st
> struct page *page, void *x, void *addr)
> {
> void **object = (void *)x;
> - unsigned long flags;
> + void **freelist;
> struct kmem_cache_cpu *c;
>
> - local_irq_save(flags);
> - c = get_cpu_slab(s, smp_processor_id());
> - if (likely(page == c->page && c->node >= 0)) {
> - object[c->offset] = c->freelist;
> - c->freelist = object;
> - } else
> - __slab_free(s, page, x, addr, c->offset);
> + c = get_cpu_slab(s, raw_smp_processor_id());
> + if (unlikely(c->node >= 0))
> + goto slow;
> +
> +redo:
> + freelist = c->freelist;
> + smp_rmb();
> + if (unlikely(page != c->page))
> + goto slow;
>
> - local_irq_restore(flags);
> + object[c->offset] = freelist;
> +
> + if (unlikely(cmpxchg_local(&c->freelist, freelist, object) != freelist))
If you don't plan to disable preemption here, cmpxchg_local should
probably not be used.
> + goto redo;
> + return;
> +slow:
> + __slab_free(s, page, x, addr, c->offset);
> }
>
> void kmem_cache_free(struct kmem_cache *s, void *x)
> Index: linux-2.6.23-rc1/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6.23-rc1.orig/include/linux/slub_def.h 2007-07-27 19:30:03.000000000 -0700
> +++ linux-2.6.23-rc1/include/linux/slub_def.h 2007-07-27 21:15:27.000000000 -0700
> @@ -12,11 +12,11 @@
> #include <linux/kobject.h>
>
> struct kmem_cache_cpu {
> - void **freelist;
> - struct page *page;
> - int node;
> - unsigned int offset;
> - unsigned int objsize;
> + void **freelist; /* Updated through atomic ops */
> + struct page *page; /* Updated with interrupts disabled */
> + int node; /* Updated with interrupts disabled */
> + unsigned int offset; /* Set up on kmem_cache_create() */
> + unsigned int objsize; /* Set up on kmem_cache_create() */
> };
>
> struct kmem_cache_node {
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
SLUB Use cmpxchg() everywhere.
It applies to "SLUB: Single atomic instruction alloc/free using
cmpxchg".
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
mm/slub.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: slab/mm/slub.c
===================================================================
--- slab.orig/mm/slub.c 2007-08-20 18:42:16.000000000 -0400
+++ slab/mm/slub.c 2007-08-20 18:42:28.000000000 -0400
@@ -1682,7 +1682,7 @@ redo:
object[c->offset] = freelist;
- if (unlikely(cmpxchg_local(&c->freelist, freelist, object) != freelist))
+ if (unlikely(cmpxchg(&c->freelist, freelist, object) != freelist))
goto redo;
return;
slow:
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Ok, I played with your patch a bit, and the results are quite
interesting:
SLUB use cmpxchg_local
my changes:
- Fixed an erroneous test in slab_free() (logic was flipped from the
original code when testing for slow path. It explains the wrong
numbers you have with big free).
- Use cmpxchg_local
- Changed smp_rmb() for barrier(). We are not interested in read order
across cpus, what we want is to be ordered wrt local interrupts only.
barrier() is much cheaper than a rmb().
It applies on top of the
"SLUB Use cmpxchg() everywhere" patch.
Summary:
(tests repeated 10000 times on a 3GHz Pentium 4)
(kernel DEBUG menuconfig options are turned off)
results are in cycles per iteration
I did 2 runs of the slab.git HEAD to have an idea of errors associated
to the measurements:
| slab.git HEAD slub (min-max) | cmpxchg_local slub
kmalloc(8) | 190 - 201 | 83
kfree(8) | 351 - 351 | 363
kmalloc(64) | 224 - 245 | 115
kfree(64) | 389 - 394 | 397
kmalloc(16384)| 713 - 741 | 724
kfree(16384) | 843 - 856 | 843
Therefore, there seems to be a repeatable gain on the kmalloc fast path
(more than twice as fast). No significant performance hit for the kfree
case, but no gain either; same for large kmalloc, as expected.
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
mm/slub.c | 30 +++++++++++++++++-------------
1 file changed, 17 insertions(+), 13 deletions(-)
Index: slab/mm/slub.c
===================================================================
--- slab.orig/mm/slub.c 2007-08-21 12:59:38.000000000 -0400
+++ slab/mm/slub.c 2007-08-21 13:16:31.000000000 -0400
@@ -1554,26 +1554,26 @@ static void __always_inline *slab_alloc(
void **object;
struct kmem_cache_cpu *c;
-redo:
+ preempt_disable();
c = get_cpu_slab(s, raw_smp_processor_id());
- object = c->freelist;
- if (unlikely(!object))
- goto slow;
-
if (unlikely(!node_match(c, node)))
goto slow;
- if (unlikely(cmpxchg(&c->freelist, object,
- object[c->offset]) != object))
- goto redo;
+redo:
+ object = c->freelist;
+ if (unlikely(!object))
+ goto slow;
+ if (unlikely(cmpxchg_local(&c->freelist, object,
+ object[c->offset]) != object))
+ goto redo;
+ preempt_enable();
if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
-
return object;
slow:
+ preempt_enable();
return __slab_alloc(s, gfpflags, node, addr);
-
}
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1670,22 +1670,26 @@ static void __always_inline slab_free(st
void **freelist;
struct kmem_cache_cpu *c;
+ preempt_disable();
c = get_cpu_slab(s, raw_smp_processor_id());
- if (unlikely(c->node >= 0))
+ if (unlikely(c->node < 0))
goto slow;
redo:
freelist = c->freelist;
- smp_rmb();
+ barrier(); /* Read freelist before page, wrt local interrupts */
if (unlikely(page != c->page))
goto slow;
object[c->offset] = freelist;
- if (unlikely(cmpxchg(&c->freelist, freelist, object) != freelist))
+ if (unlikely(cmpxchg_local(&c->freelist,
+ freelist, object) != freelist))
goto redo;
+ preempt_enable();
return;
slow:
+ preempt_enable();
__slab_free(s, page, x, addr, c->offset);
}
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Hi Christoph,
If you are interested in the raw numbers:
The (very basic) test module follows. Make sure you change get_cycles()
for get_cycles_sync() if you plan to run this on x86_64.
(tests taken on a 3GHz Pentium 4)
* slub HEAD, test 1
[ 99.774699] SLUB Performance testing
[ 99.785431] ========================
[ 99.796099] 1. Kmalloc: Repeatedly allocate then free test
[ 99.813159] 10000 times kmalloc(8) =
[ 99.824072] number of loops: 10000
[ 99.834207] total time: 2019990
[ 99.843562] -> 201 cycles
[ 99.852535] 10000 times kfree =
[ 99.862167] number of loops: 10000
[ 99.872310] total time: 3519982
[ 99.881669] -> 351 cycles
[ 99.890128] 10000 times kmalloc(16) =
[ 99.901298] number of loops: 10000
[ 99.911433] total time: 1986503
[ 99.920786] -> 198 cycles
[ 99.929784] 10000 times kfree =
[ 99.939397] number of loops: 10000
[ 99.949532] total time: 3596775
[ 99.958885] -> 359 cycles
[ 99.967352] 10000 times kmalloc(32) =
[ 99.978522] number of loops: 10000
[ 99.988657] total time: 2009003
[ 99.998098] -> 200 cycles
[ 100.007171] 10000 times kfree =
[ 100.016786] number of loops: 10000
[ 100.026919] total time: 3814418
[ 100.036296] -> 381 cycles
[ 100.044844] 10000 times kmalloc(64) =
[ 100.056016] number of loops: 10000
[ 100.066150] total time: 2242620
[ 100.075504] -> 224 cycles
[ 100.084619] 10000 times kfree =
[ 100.094234] number of loops: 10000
[ 100.104369] total time: 3941348
[ 100.113722] -> 394 cycles
[ 100.122475] 10000 times kmalloc(128) =
[ 100.133914] number of loops: 10000
[ 100.144049] total time: 2857560
[ 100.153485] -> 285 cycles
[ 100.162705] 10000 times kfree =
[ 100.172332] number of loops: 10000
[ 100.182468] total time: 4241543
[ 100.191821] -> 424 cycles
[ 100.200996] 10000 times kmalloc(256) =
[ 100.212437] number of loops: 10000
[ 100.222571] total time: 4119900
[ 100.231949] -> 411 cycles
[ 100.241570] 10000 times kfree =
[ 100.251218] number of loops: 10000
[ 100.261353] total time: 5462655
[ 100.270705] -> 546 cycles
[ 100.280105] 10000 times kmalloc(512) =
[ 100.291548] number of loops: 10000
[ 100.301683] total time: 4802820
[ 100.311037] -> 480 cycles
[ 100.320899] 10000 times kfree =
[ 100.330518] number of loops: 10000
[ 100.340661] total time: 6191827
[ 100.350040] -> 619 cycles
[ 100.359917] 10000 times kmalloc(1024) =
[ 100.371633] number of loops: 10000
[ 100.381767] total time: 6235890
[ 100.391120] -> 623 cycles
[ 100.401419] 10000 times kfree =
[ 100.411034] number of loops: 10000
[ 100.421170] total time: 7504095
[ 100.430523] -> 750 cycles
[ 100.440608] 10000 times kmalloc(2048) =
[ 100.452300] number of loops: 10000
[ 100.462433] total time: 6863955
[ 100.471786] -> 686 cycles
[ 100.482287] 10000 times kfree =
[ 100.491922] number of loops: 10000
[ 100.502065] total time: 8110590
[ 100.511419] -> 811 cycles
[ 100.520824] 10000 times kmalloc(4096) =
[ 100.532537] number of loops: 10000
[ 100.542670] total time: 4824007
[ 100.552023] -> 482 cycles
[ 100.561618] 10000 times kfree =
[ 100.571255] number of loops: 10000
[ 100.581390] total time: 5387670
[ 100.590768] -> 538 cycles
[ 100.600835] 10000 times kmalloc(8192) =
[ 100.612549] number of loops: 10000
[ 100.622684] total time: 6808680
[ 100.632037] -> 680 cycles
[ 100.642285] 10000 times kfree =
[ 100.651898] number of loops: 10000
[ 100.662031] total time: 7349797
[ 100.671385] -> 734 cycles
[ 100.681563] 10000 times kmalloc(16384) =
[ 100.693523] number of loops: 10000
[ 100.703658] total time: 7133790
[ 100.713036] -> 713 cycles
[ 100.723654] 10000 times kfree =
[ 100.733299] number of loops: 10000
[ 100.743434] total time: 8431725
[ 100.752788] -> 843 cycles
[ 100.760588] 2. Kmalloc: alloc/free test
[ 100.773091] 10000 times kmalloc(8)/kfree =
[ 100.785558] number of loops: 10000
[ 100.795694] total time: 3223072
[ 100.805046] -> 322 cycles
[ 100.813904] 10000 times kmalloc(16)/kfree =
[ 100.826629] number of loops: 10000
[ 100.836763] total time: 3181702
[ 100.846116] -> 318 cycles
[ 100.854975] 10000 times kmalloc(32)/kfree =
[ 100.867725] number of loops: 10000
[ 100.877860] total time: 3183517
[ 100.887296] -> 318 cycles
[ 100.896179] 10000 times kmalloc(64)/kfree =
[ 100.908905] number of loops: 10000
[ 100.919039] total time: 3253335
[ 100.928418] -> 325 cycles
[ 100.937277] 10000 times kmalloc(128)/kfree =
[ 100.950272] number of loops: 10000
[ 100.960407] total time: 3181478
[ 100.969760] -> 318 cycles
[ 100.978652] 10000 times kmalloc(256)/kfree =
[ 100.991662] number of loops: 10000
[ 101.001796] total time: 3282810
[ 101.011149] -> 328 cycles
[ 101.020042] 10000 times kmalloc(512)/kfree =
[ 101.033025] number of loops: 10000
[ 101.043161] total time: 3286725
[ 101.052515] -> 328 cycles
[ 101.061409] 10000 times kmalloc(1024)/kfree =
[ 101.074652] number of loops: 10000
[ 101.084787] total time: 3281677
[ 101.094141] -> 328 cycles
[ 101.103033] 10000 times kmalloc(2048)/kfree =
[ 101.116277] number of loops: 10000
[ 101.126413] total time: 3281460
[ 101.135789] -> 328 cycles
[ 101.145850] 10000 times kmalloc(4096)/kfree =
[ 101.159120] number of loops: 10000
[ 101.169255] total time: 6786870
[ 101.178608] -> 678 cycles
[ 101.189783] 10000 times kmalloc(8192)/kfree =
[ 101.203030] number of loops: 10000
[ 101.213166] total time: 10135372
[ 101.222846] -> 1013 cycles
[ 101.234812] 10000 times kmalloc(16384)/kfree =
[ 101.248342] number of loops: 10000
[ 101.258476] total time: 11575222
[ 101.268090] -> 1157 cycles
* Slub HEAD, test 2
[ 225.212707] SLUB Performance testing
[ 225.223363] ========================
[ 225.234040] 1. Kmalloc: Repeatedly allocate then free test
[ 225.251038] 10000 times kmalloc(8) =
[ 225.261956] number of loops: 10000
[ 225.272089] total time: 1908952
[ 225.281444] -> 190 cycles
[ 225.290415] 10000 times kfree =
[ 225.300028] number of loops: 10000
[ 225.310164] total time: 3516172
[ 225.319527] -> 351 cycles
[ 225.327979] 10000 times kmalloc(16) =
[ 225.339174] number of loops: 10000
[ 225.349309] total time: 1955085
[ 225.358661] -> 195 cycles
[ 225.367662] 10000 times kfree =
[ 225.377275] number of loops: 10000
[ 225.387411] total time: 3602730
[ 225.396789] -> 360 cycles
[ 225.405258] 10000 times kmalloc(32) =
[ 225.416426] number of loops: 10000
[ 225.426561] total time: 2010450
[ 225.435914] -> 201 cycles
[ 225.444949] 10000 times kfree =
[ 225.454631] number of loops: 10000
[ 225.464829] total time: 3706695
[ 225.474239] -> 370 cycles
[ 225.482926] 10000 times kmalloc(64) =
[ 225.494100] number of loops: 10000
[ 225.504236] total time: 2450453
[ 225.513590] -> 245 cycles
[ 225.522689] 10000 times kfree =
[ 225.532303] number of loops: 10000
[ 225.542443] total time: 3897607
[ 225.551798] -> 389 cycles
[ 225.560546] 10000 times kmalloc(128) =
[ 225.571987] number of loops: 10000
[ 225.582122] total time: 2837782
[ 225.591475] -> 283 cycles
[ 225.600653] 10000 times kfree =
[ 225.610277] number of loops: 10000
[ 225.620412] total time: 4130122
[ 225.629789] -> 413 cycles
[ 225.638956] 10000 times kmalloc(256) =
[ 225.650398] number of loops: 10000
[ 225.660534] total time: 4093110
[ 225.669887] -> 409 cycles
[ 225.679515] 10000 times kfree =
[ 225.689135] number of loops: 10000
[ 225.699269] total time: 5478518
[ 225.708633] -> 547 cycles
[ 225.718031] 10000 times kmalloc(512) =
[ 225.729468] number of loops: 10000
[ 225.739602] total time: 4765365
[ 225.748955] -> 476 cycles
[ 225.758810] 10000 times kfree =
[ 225.768431] number of loops: 10000
[ 225.778565] total time: 6167265
[ 225.787918] -> 616 cycles
[ 225.797811] 10000 times kmalloc(1024) =
[ 225.809504] number of loops: 10000
[ 225.819639] total time: 6280335
[ 225.828991] -> 628 cycles
[ 225.839302] 10000 times kfree =
[ 225.848942] number of loops: 10000
[ 225.859078] total time: 7538393
[ 225.868456] -> 753 cycles
[ 225.878535] 10000 times kmalloc(2048) =
[ 225.890226] number of loops: 10000
[ 225.900362] total time: 6842648
[ 225.909716] -> 684 cycles
[ 225.920218] 10000 times kfree =
[ 225.929834] number of loops: 10000
[ 225.939977] total time: 8114550
[ 225.949331] -> 811 cycles
[ 225.958730] 10000 times kmalloc(4096) =
[ 225.970419] number of loops: 10000
[ 225.980554] total time: 4803345
[ 225.989933] -> 480 cycles
[ 225.999531] 10000 times kfree =
[ 226.009145] number of loops: 10000
[ 226.019278] total time: 5398282
[ 226.028656] -> 539 cycles
[ 226.038659] 10000 times kmalloc(8192) =
[ 226.050349] number of loops: 10000
[ 226.060484] total time: 6613185
[ 226.069837] -> 661 cycles
[ 226.080124] 10000 times kfree =
[ 226.089738] number of loops: 10000
[ 226.099874] total time: 7467765
[ 226.109251] -> 746 cycles
[ 226.119523] 10000 times kmalloc(16384) =
[ 226.131476] number of loops: 10000
[ 226.141611] total time: 7417830
[ 226.150963] -> 741 cycles
[ 226.161627] 10000 times kfree =
[ 226.171250] number of loops: 10000
[ 226.181386] total time: 8564873
[ 226.190765] -> 856 cycles
[ 226.198565] 2. Kmalloc: alloc/free test
[ 226.211071] 10000 times kmalloc(8)/kfree =
[ 226.223538] number of loops: 10000
[ 226.233674] total time: 3233355
[ 226.243028] -> 323 cycles
[ 226.251887] 10000 times kmalloc(16)/kfree =
[ 226.264612] number of loops: 10000
[ 226.274755] total time: 3181650
[ 226.284132] -> 318 cycles
[ 226.292991] 10000 times kmalloc(32)/kfree =
[ 226.305716] number of loops: 10000
[ 226.315851] total time: 3181140
[ 226.325228] -> 318 cycles
[ 226.334089] 10000 times kmalloc(64)/kfree =
[ 226.346839] number of loops: 10000
[ 226.356974] total time: 3183255
[ 226.366352] -> 318 cycles
[ 226.375216] 10000 times kmalloc(128)/kfree =
[ 226.388210] number of loops: 10000
[ 226.398345] total time: 3188498
[ 226.407698] -> 318 cycles
[ 226.416591] 10000 times kmalloc(256)/kfree =
[ 226.429602] number of loops: 10000
[ 226.439737] total time: 3281130
[ 226.449090] -> 328 cycles
[ 226.457983] 10000 times kmalloc(512)/kfree =
[ 226.470992] number of loops: 10000
[ 226.481127] total time: 3283387
[ 226.490480] -> 328 cycles
[ 226.499375] 10000 times kmalloc(1024)/kfree =
[ 226.512619] number of loops: 10000
[ 226.522754] total time: 3287888
[ 226.532106] -> 328 cycles
[ 226.540999] 10000 times kmalloc(2048)/kfree =
[ 226.554270] number of loops: 10000
[ 226.564405] total time: 3281467
[ 226.573757] -> 328 cycles
[ 226.583720] 10000 times kmalloc(4096)/kfree =
[ 226.596989] number of loops: 10000
[ 226.607124] total time: 6489922
[ 226.616476] -> 648 cycles
[ 226.627646] 10000 times kmalloc(8192)/kfree =
[ 226.640891] number of loops: 10000
[ 226.651027] total time: 10095772
[ 226.660639] -> 1009 cycles
[ 226.672381] 10000 times kmalloc(16384)/kfree =
[ 226.685910] number of loops: 10000
[ 226.696045] total time: 11056943
[ 226.705658] -> 1105 cycles
* cmpxchg_local Slub test
[ 119.518080] SLUB Performance testing
[ 119.528739] ========================
[ 119.539393] 1. Kmalloc: Repeatedly allocate then free test
[ 119.556032] 10000 times kmalloc(8) =
[ 119.566952] number of loops: 10000
[ 119.577087] total time: 831930
[ 119.586182] -> 83 cycles
[ 119.594933] 10000 times kfree =
[ 119.604570] number of loops: 10000
[ 119.614705] total time: 3636653
[ 119.624059] -> 363 cycles
[ 119.632142] 10000 times kmalloc(16) =
[ 119.643330] number of loops: 10000
[ 119.653464] total time: 851895
[ 119.662564] -> 85 cycles
[ 119.671341] 10000 times kfree =
[ 119.680954] number of loops: 10000
[ 119.691089] total time: 3722985
[ 119.700441] -> 372 cycles
[ 119.708551] 10000 times kmalloc(32) =
[ 119.719721] number of loops: 10000
[ 119.729855] total time: 928470
[ 119.738973] -> 92 cycles
[ 119.747770] 10000 times kfree =
[ 119.757382] number of loops: 10000
[ 119.767518] total time: 3772147
[ 119.776871] -> 377 cycles
[ 119.785065] 10000 times kmalloc(64) =
[ 119.796246] number of loops: 10000
[ 119.806380] total time: 1157535
[ 119.815734] -> 115 cycles
[ 119.824858] 10000 times kfree =
[ 119.834473] number of loops: 10000
[ 119.844609] total time: 3975780
[ 119.853962] -> 397 cycles
[ 119.862361] 10000 times kmalloc(128) =
[ 119.873801] number of loops: 10000
[ 119.883940] total time: 1792695
[ 119.893382] -> 179 cycles
[ 119.902658] 10000 times kfree =
[ 119.912282] number of loops: 10000
[ 119.922416] total time: 4385093
[ 119.931769] -> 438 cycles
[ 119.940617] 10000 times kmalloc(256) =
[ 119.952146] number of loops: 10000
[ 119.962345] total time: 3140227
[ 119.971753] -> 314 cycles
[ 119.981498] 10000 times kfree =
[ 119.991124] number of loops: 10000
[ 120.001258] total time: 5641035
[ 120.010613] -> 564 cycles
[ 120.019752] 10000 times kmalloc(512) =
[ 120.031186] number of loops: 10000
[ 120.041321] total time: 3989085
[ 120.050676] -> 398 cycles
[ 120.060528] 10000 times kfree =
[ 120.070146] number of loops: 10000
[ 120.080281] total time: 6159315
[ 120.089633] -> 615 cycles
[ 120.099344] 10000 times kmalloc(1024) =
[ 120.111033] number of loops: 10000
[ 120.121168] total time: 5733525
[ 120.130546] -> 573 cycles
[ 120.140827] 10000 times kfree =
[ 120.150444] number of loops: 10000
[ 120.160579] total time: 7450170
[ 120.169956] -> 745 cycles
[ 120.179854] 10000 times kmalloc(2048) =
[ 120.191544] number of loops: 10000
[ 120.201680] total time: 6298380
[ 120.211033] -> 629 cycles
[ 120.221555] 10000 times kfree =
[ 120.231170] number of loops: 10000
[ 120.241305] total time: 8169892
[ 120.250666] -> 816 cycles
[ 120.260044] 10000 times kmalloc(4096) =
[ 120.271733] number of loops: 10000
[ 120.281868] total time: 4739040
[ 120.291221] -> 473 cycles
[ 120.300847] 10000 times kfree =
[ 120.310483] number of loops: 10000
[ 120.320619] total time: 5484870
[ 120.329973] -> 548 cycles
[ 120.339970] 10000 times kmalloc(8192) =
[ 120.351659] number of loops: 10000
[ 120.361805] total time: 6594803
[ 120.371184] -> 659 cycles
[ 120.381465] 10000 times kfree =
[ 120.391104] number of loops: 10000
[ 120.401240] total time: 7454415
[ 120.410618] -> 745 cycles
[ 120.420834] 10000 times kmalloc(16384) =
[ 120.432797] number of loops: 10000
[ 120.442932] total time: 7245863
[ 120.452311] -> 724 cycles
[ 120.462923] 10000 times kfree =
[ 120.472545] number of loops: 10000
[ 120.482689] total time: 8431665
[ 120.492043] -> 843 cycles
[ 120.499843] 2. Kmalloc: alloc/free test
[ 120.511647] 10000 times kmalloc(8)/kfree =
[ 120.524116] number of loops: 10000
[ 120.534251] total time: 1129178
[ 120.543629] -> 112 cycles
[ 120.551773] 10000 times kmalloc(16)/kfree =
[ 120.564524] number of loops: 10000
[ 120.574658] total time: 1036778
[ 120.584036] -> 103 cycles
[ 120.592180] 10000 times kmalloc(32)/kfree =
[ 120.604905] number of loops: 10000
[ 120.615040] total time: 1031220
[ 120.624419] -> 103 cycles
[ 120.632561] 10000 times kmalloc(64)/kfree =
[ 120.645289] number of loops: 10000
[ 120.655425] total time: 1031025
[ 120.664777] -> 103 cycles
[ 120.672954] 10000 times kmalloc(128)/kfree =
[ 120.685941] number of loops: 10000
[ 120.696075] total time: 1127857
[ 120.705429] -> 112 cycles
[ 120.713599] 10000 times kmalloc(256)/kfree =
[ 120.726591] number of loops: 10000
[ 120.736724] total time: 1110885
[ 120.746101] -> 111 cycles
[ 120.754271] 10000 times kmalloc(512)/kfree =
[ 120.767256] number of loops: 10000
[ 120.777392] total time: 1110855
[ 120.786770] -> 111 cycles
[ 120.794940] 10000 times kmalloc(1024)/kfree =
[ 120.808239] number of loops: 10000
[ 120.818373] total time: 1111522
[ 120.827729] -> 111 cycles
[ 120.835932] 10000 times kmalloc(2048)/kfree =
[ 120.849175] number of loops: 10000
[ 120.859311] total time: 1217235
[ 120.868664] -> 121 cycles
[ 120.878627] 10000 times kmalloc(4096)/kfree =
[ 120.891872] number of loops: 10000
[ 120.902006] total time: 6501330
[ 120.911383] -> 650 cycles
[ 120.922653] 10000 times kmalloc(8192)/kfree =
[ 120.935922] number of loops: 10000
[ 120.946065] total time: 10424333
[ 120.955677] -> 1042 cycles
[ 120.967571] 10000 times kmalloc(16384)/kfree =
[ 120.981099] number of loops: 10000
[ 120.991233] total time: 11495947
[ 121.000871] -> 1149 cycles
* test module
Note that the kmem_cache_alloc/kmem_cache_zalloc tests using kmalloc_caches
are compiled out (#if 0) because we run at module load time, not at boot
time, and accessing kmalloc_caches directly does not work there.
/* test-slub.c
 *
 * Compare the irq disable/enable fast path with the cmpxchg_local fast path
 * for slub.
 */
#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>		/* kmalloc()/kfree() */
#include <linux/calc64.h>
#include <asm/timex.h>
#include <asm/system.h>
#define TEST_COUNT 10000
static int slub_test_init(void)
{
void **v = kmalloc(TEST_COUNT * sizeof(void *), GFP_KERNEL);
unsigned int i;
cycles_t time1, time2, time;
long rem;
int size;
printk(KERN_ALERT "test init\n");
printk(KERN_ALERT "SLUB Performance testing\n");
printk(KERN_ALERT "========================\n");
printk(KERN_ALERT "1. Kmalloc: Repeatedly allocate then free test\n");
for (size = 8; size <= PAGE_SIZE << 2; size <<= 1) {
time1 = get_cycles();
for (i = 0; i < TEST_COUNT; i++) {
v[i] = kmalloc(size, GFP_KERNEL);
}
time2 = get_cycles();
time = time2 - time1;
printk(KERN_ALERT "%i times kmalloc(%d) = \n", i, size);
printk(KERN_ALERT "number of loops: %d\n", TEST_COUNT);
printk(KERN_ALERT "total time: %llu\n", time);
time = div_long_long_rem(time, TEST_COUNT, &rem);
printk(KERN_ALERT "-> %llu cycles\n", time);
time1 = get_cycles();
for (i = 0; i < TEST_COUNT; i++) {
kfree(v[i]);
}
time2 = get_cycles();
time = time2 - time1;
printk(KERN_ALERT "%i times kfree = \n", i);
printk(KERN_ALERT "number of loops: %d\n", TEST_COUNT);
printk(KERN_ALERT "total time: %llu\n", time);
time = div_long_long_rem(time, TEST_COUNT, &rem);
printk(KERN_ALERT "-> %llu cycles\n", time);
}
printk(KERN_ALERT "2. Kmalloc: alloc/free test\n");
for (size = 8; size <= PAGE_SIZE << 2; size <<= 1) {
time1 = get_cycles();
for (i = 0; i < TEST_COUNT; i++) {
kfree(kmalloc(size, GFP_KERNEL));
}
time2 = get_cycles();
time = time2 - time1;
printk(KERN_ALERT "%i times kmalloc(%d)/kfree = \n", i, size);
printk(KERN_ALERT "number of loops: %d\n", TEST_COUNT);
printk(KERN_ALERT "total time: %llu\n", time);
time = div_long_long_rem(time, TEST_COUNT, &rem);
printk(KERN_ALERT "-> %llu cycles\n", time);
}
#if 0
printk(KERN_ALERT "3. kmem_cache_alloc: Repeatedly allocate then free test\n");
for (size = 3; size <= PAGE_SHIFT; size ++) {
time1 = get_cycles();
for (i = 0; i < TEST_COUNT; i++) {
v[i] = kmem_cache_alloc(kmalloc_caches + size, GFP_KERNEL);
}
time2 = get_cycles();
time = time2 - time1;
printk(KERN_ALERT "%d times kmem_cache_alloc(%d) = \n", i, 1 << size);
printk(KERN_ALERT "number of loops: %d\n", TEST_COUNT);
printk(KERN_ALERT "total time: %llu\n", time);
time = div_long_long_rem(time, TEST_COUNT, &rem);
printk(KERN_ALERT "-> %llu cycles\n", time);
time1 = get_cycles();
for (i = 0; i < TEST_COUNT; i++) {
kmem_cache_free(kmalloc_caches + size, v[i]);
}
time2 = get_cycles();
time = time2 - time1;
printk(KERN_ALERT "%i times kmem_cache_free = \n", i);
printk(KERN_ALERT "number of loops: %d\n", TEST_COUNT);
printk(KERN_ALERT "total time: %llu\n", time);
time = div_long_long_rem(time, TEST_COUNT, &rem);
printk(KERN_ALERT "-> %llu cycles\n", time);
}
printk(KERN_ALERT "4. kmem_cache_alloc: alloc/free test\n");
for (size = 3; size <= PAGE_SHIFT; size++) {
time1 = get_cycles();
for (i = 0; i < TEST_COUNT; i++) {
kmem_cache_free(kmalloc_caches + size,
kmem_cache_alloc(kmalloc_caches + size,
GFP_KERNEL));
}
time2 = get_cycles();
time = time2 - time1;
printk(KERN_ALERT "%d times kmem_cache_alloc(%d)/kmem_cache_free = \n", i, 1 << size);
printk(KERN_ALERT "number of loops: %d\n", TEST_COUNT);
printk(KERN_ALERT "total time: %llu\n", time);
time = div_long_long_rem(time, TEST_COUNT, &rem);
printk(KERN_ALERT "-> %llu cycles\n", time);
}
printk(KERN_ALERT "5. kmem_cache_zalloc: Repeatedly allocate then free test\n");
for (size = 3; size <= PAGE_SHIFT; size ++) {
time1 = get_cycles();
for (i = 0; i < TEST_COUNT; i++) {
v[i] = kmem_cache_zalloc(kmalloc_caches + size, GFP_KERNEL);
}
time2 = get_cycles();
time = time2 - time1;
printk(KERN_ALERT "%d times kmem_cache_zalloc(%d) = \n", i, 1 << size);
printk(KERN_ALERT "number of loops: %d\n", TEST_COUNT);
printk(KERN_ALERT "total time: %llu\n", time);
time = div_long_long_rem(time, TEST_COUNT, &rem);
printk(KERN_ALERT "-> %llu cycles\n", time);
time1 = get_cycles();
for (i = 0; i < TEST_COUNT; i++) {
kmem_cache_free(kmalloc_caches + size, v[i]);
}
time2 = get_cycles();
time = time2 - time1;
printk(KERN_ALERT "%i times kmem_cache_free = \n", i);
printk(KERN_ALERT "number of loops: %d\n", TEST_COUNT);
printk(KERN_ALERT "total time: %llu\n", time);
time = div_long_long_rem(time, TEST_COUNT, &rem);
printk(KERN_ALERT "-> %llu cycles\n", time);
}
printk(KERN_ALERT "6. kmem_cache_zalloc: alloc/free test\n");
for (size = 3; size <= PAGE_SHIFT; size++) {
time1 = get_cycles();
for (i = 0; i < TEST_COUNT; i++) {
kmem_cache_free(kmalloc_caches + size,
kmem_cache_zalloc(kmalloc_caches + size,
GFP_KERNEL));
}
time2 = get_cycles();
time = time2 - time1;
printk(KERN_ALERT "%d times kmem_cache_zalloc(%d)/kmem_cache_free = \n", i, 1 << size);
printk(KERN_ALERT "number of loops: %d\n", TEST_COUNT);
printk(KERN_ALERT "total time: %llu\n", time);
time = div_long_long_rem(time, TEST_COUNT, &rem);
printk(KERN_ALERT "-> %llu cycles\n", time);
}
#endif //0
kfree(v);
return -EAGAIN; /* Fail will directly unload the module */
}
static void slub_test_exit(void)
{
printk(KERN_ALERT "test exit\n");
}
module_init(slub_test_init)
module_exit(slub_test_exit)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mathieu Desnoyers");
MODULE_DESCRIPTION("SLUB test");
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Mathieu Desnoyers ([email protected]) wrote:
> Ok, I played with your patch a bit, and the results are quite
> interesting:
>
...
> Summary:
>
> (tests repeated 10000 times on a 3GHz Pentium 4)
> (kernel DEBUG menuconfig options are turned off)
> results are in cycles per iteration
> I did 2 runs of the slab.git HEAD to have an idea of errors associated
> to the measurements:
>
> | slab.git HEAD slub (min-max) | cmpxchg_local slub
> kmalloc(8) | 190 - 201 | 83
> kfree(8) | 351 - 351 | 363
> kmalloc(64) | 224 - 245 | 115
> kfree(64) | 389 - 394 | 397
> kmalloc(16384)| 713 - 741 | 724
> kfree(16384) | 843 - 856 | 843
>
> Therefore, there seems to be a repeatable gain on the kmalloc fast path
> (more than twice as fast). No significant performance hit for the kfree
> case, but no gain either; same for large kmalloc, as expected.
>
Having no performance improvement for kfree seems a little weird, since
we are moving from irq disable to cmpxchg_local in the fast path. A
possible explanation would be that we are always hitting the slow path.
I did a simple test, counting the number of fast vs slow paths with my
cmpxchg_local slub version:
(initial state before the test)
[ 386.359364] Fast slub free: 654982
[ 386.369507] Slow slub free: 392195
[ 386.379660] SLUB Performance testing
[ 386.390361] ========================
[ 386.401020] 1. Kmalloc: Repeatedly allocate then free test
...
(after test 1)
[ 387.366002] Fast slub free: 657338 diff (2356)
[ 387.376158] Slow slub free: 482162 diff (89967)
[ 387.386294] 2. Kmalloc: alloc/free test
...
(after test 2)
[ 387.897816] Fast slub free: 748968 diff (91630)
[ 387.907947] Slow slub free: 482584 diff (422)
Therefore, in the test where we have separate passes for slub allocation
and free, we mostly hit the slow path. Any particular reason for that?
Note that the alloc/free test (test 2) seems to hit the fast path as
expected.
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> - Fixed an erroneous test in slab_free() (logic was flipped from the
> original code when testing for slow path. It explains the wrong
> numbers you have with big free).
If you look at the numbers that I posted earlier then you will see that
even the measurements without free were not up to par.
> It applies on top of the
> "SLUB Use cmpxchg() everywhere" patch.
Which one is that?
> | slab.git HEAD slub (min-max) | cmpxchg_local slub
> kmalloc(8) | 190 - 201 | 83
> kfree(8) | 351 - 351 | 363
> kmalloc(64) | 224 - 245 | 115
> kfree(64) | 389 - 394 | 397
> kmalloc(16384)| 713 - 741 | 724
> kfree(16384) | 843 - 856 | 843
>
> Therefore, there seems to be a repeatable gain on the kmalloc fast path
> (more than twice as fast). No significant performance hit for the kfree
> case, but no gain either; same for large kmalloc, as expected.
There is a consistent loss on slab_free, it seems. The 16k numbers are
irrelevant since we do not use slab_alloc/slab_free there due to the direct
pass-through patch but call the page allocator directly. That also explains
why there is no loss there.
The kmalloc numbers look encouraging. I will check to see if I can
reproduce it once I sort out the patches.
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> If you are interested in the raw numbers:
>
> The (very basic) test module follows. Make sure you change get_cycles()
> for get_cycles_sync() if you plan to run this on x86_64.
Which test is which? Would you be able to format this in a way that we can
easily read it?
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> Therefore, in the test where we have separate passes for slub allocation
> and free, we hit mostly the slow path. Any particular reason for that ?
Maybe on SMP you are scheduled to run on a different processor? Note that
I ran my tests at early boot, where such effects do not occur.
> Note that the alloc/free test (test 2) seems to hit the fast path as
> expected.
It is much more likely in that case that the execution thread stays on one
processor.
* Christoph Lameter ([email protected]) wrote:
> On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
>
> > - Fixed an erroneous test in slab_free() (logic was flipped from the
> > original code when testing for slow path. It explains the wrong
> > numbers you have with big free).
>
> If you look at the numbers that I posted earlier then you will see that
> even the measurements without free were not up to par.
>
I seem to get a clear performance improvement in the kmalloc fast path.
> > It applies on top of the
> > "SLUB Use cmpxchg() everywhere" patch.
>
> Which one is that?
>
This one:
SLUB Use cmpxchg() everywhere.
It applies to "SLUB: Single atomic instruction alloc/free using
cmpxchg".
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
mm/slub.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: slab/mm/slub.c
===================================================================
--- slab.orig/mm/slub.c 2007-08-20 18:42:16.000000000 -0400
+++ slab/mm/slub.c 2007-08-20 18:42:28.000000000 -0400
@@ -1682,7 +1682,7 @@ redo:
object[c->offset] = freelist;
- if (unlikely(cmpxchg_local(&c->freelist, freelist, object) != freelist))
+ if (unlikely(cmpxchg(&c->freelist, freelist, object) != freelist))
goto redo;
return;
slow:
> > | slab.git HEAD slub (min-max) | cmpxchg_local slub
> > kmalloc(8) | 190 - 201 | 83
> > kfree(8) | 351 - 351 | 363
> > kmalloc(64) | 224 - 245 | 115
> > kfree(64) | 389 - 394 | 397
> > kmalloc(16384)| 713 - 741 | 724
> > kfree(16384) | 843 - 856 | 843
> >
> > Therefore, there seems to be a repeatable gain on the kmalloc fast path
> > (more than twice as fast). No significant performance hit for the kfree
> > case, but no gain either; same for large kmalloc, as expected.
>
> There is a consistent loss on slab_free it seems. The 16k numbers are
> irrelevant since we do not use slab_alloc/slab_free due to the direct pass
> through patch but call the page allocator directly. That also explains
> that there is no loss there.
>
Yes. slab_free in these tests falls mostly into the __slab_free() slow path
(I instrumented the number of slow and fast path hits to see this). The small
performance hit (~10 cycles) can be explained by the added
preempt_disable()/preempt_enable().
> The kmalloc numbers look encouraging. I will check to see if I can
> reproduce it once I sort out the patches.
Ok.
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> - Changed smp_rmb() for barrier(). We are not interested in read order
> across cpus, what we want is to be ordered wrt local interrupts only.
> barrier() is much cheaper than a rmb().
But this means a preempt disable is required. RT users do not want that.
Without disabling preemption the process can be moved to another processor
after c has been determined. That is why the smp_rmb() is there.
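Spelled out, the hazard is in the slab_free() fast path if the
preempt_disable()/preempt_enable() pair is dropped, i.e. roughly (an
annotated excerpt of the fast path posted earlier, with comments added to
mark the scenario):

        c = get_cpu_slab(s, raw_smp_processor_id());    /* c belongs to, say, CPU0 */
redo:
        freelist = c->freelist;
        barrier();
        /*
         * Without preempt disable the task can be migrated to CPU1 right
         * here, while another task on CPU0 keeps using c's fast path.
         */
        if (unlikely(page != c->page))
                goto slow;

        object[c->offset] = freelist;                   /* freelist may be stale */

        /*
         * Non-LOCK'd update of CPU0's freelist issued from CPU1: it can race
         * with CPU0's own concurrent cmpxchg_local and corrupt the freelist.
         * A LOCK'd cmpxchg() keeps this update atomic even after a migration;
         * preempt_disable() avoids the migration in the first place.
         */
        if (unlikely(cmpxchg_local(&c->freelist, freelist, object) != freelist))
                goto redo;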
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> SLUB Use cmpxchg() everywhere.
>
> It applies to "SLUB: Single atomic instruction alloc/free using
> cmpxchg".
> +++ slab/mm/slub.c 2007-08-20 18:42:28.000000000 -0400
> @@ -1682,7 +1682,7 @@ redo:
>
> object[c->offset] = freelist;
>
> - if (unlikely(cmpxchg_local(&c->freelist, freelist, object) != freelist))
> + if (unlikely(cmpxchg(&c->freelist, freelist, object) != freelist))
> goto redo;
> return;
> slow:
Ok so regular cmpxchg, no cmpxchg_local. cmpxchg_local does not bring
anything more? My measurements did not show any difference. I measured on
Athlon64. What processor is being used?
Reformatting...
* Mathieu Desnoyers ([email protected]) wrote:
> Hi Christoph,
>
> If you are interested in the raw numbers:
>
> The (very basic) test module follows. Make sure you change get_cycles()
> for get_cycles_sync() if you plan to run this on x86_64.
>
> (tests taken on a 3GHz Pentium 4)
>
(Note: test 1 uses the kfree slow path, as figured out by
instrumentation)
SLUB Performance testing
========================
1. Kmalloc: Repeatedly allocate then free test
* slub HEAD, test 1
kmalloc(8) = 201 cycles kfree = 351 cycles
kmalloc(16) = 198 cycles kfree = 359 cycles
kmalloc(32) = 200 cycles kfree = 381 cycles
kmalloc(64) = 224 cycles kfree = 394 cycles
kmalloc(128) = 285 cycles kfree = 424 cycles
kmalloc(256) = 411 cycles kfree = 546 cycles
kmalloc(512) = 480 cycles kfree = 619 cycles
kmalloc(1024) = 623 cycles kfree = 750 cycles
kmalloc(2048) = 686 cycles kfree = 811 cycles
kmalloc(4096) = 482 cycles kfree = 538 cycles
kmalloc(8192) = 680 cycles kfree = 734 cycles
kmalloc(16384) = 713 cycles kfree = 843 cycles
* Slub HEAD, test 2
kmalloc(8) = 190 cycles kfree = 351 cycles
kmalloc(16) = 195 cycles kfree = 360 cycles
kmalloc(32) = 201 cycles kfree = 370 cycles
kmalloc(64) = 245 cycles kfree = 389 cycles
kmalloc(128) = 283 cycles kfree = 413 cycles
kmalloc(256) = 409 cycles kfree = 547 cycles
kmalloc(512) = 476 cycles kfree = 616 cycles
kmalloc(1024) = 628 cycles kfree = 753 cycles
kmalloc(2048) = 684 cycles kfree = 811 cycles
kmalloc(4096) = 480 cycles kfree = 539 cycles
kmalloc(8192) = 661 cycles kfree = 746 cycles
kmalloc(16384) = 741 cycles kfree = 856 cycles
* cmpxchg_local Slub test
kmalloc(8) = 83 cycles kfree = 363 cycles
kmalloc(16) = 85 cycles kfree = 372 cycles
kmalloc(32) = 92 cycles kfree = 377 cycles
kmalloc(64) = 115 cycles kfree = 397 cycles
kmalloc(128) = 179 cycles kfree = 438 cycles
kmalloc(256) = 314 cycles kfree = 564 cycles
kmalloc(512) = 398 cycles kfree = 615 cycles
kmalloc(1024) = 573 cycles kfree = 745 cycles
kmalloc(2048) = 629 cycles kfree = 816 cycles
kmalloc(4096) = 473 cycles kfree = 548 cycles
kmalloc(8192) = 659 cycles kfree = 745 cycles
kmalloc(16384) = 724 cycles kfree = 843 cycles
2. Kmalloc: alloc/free test
* slub HEAD, test 1
kmalloc(8)/kfree = 322 cycles
kmalloc(16)/kfree = 318 cycles
kmalloc(32)/kfree = 318 cycles
kmalloc(64)/kfree = 325 cycles
kmalloc(128)/kfree = 318 cycles
kmalloc(256)/kfree = 328 cycles
kmalloc(512)/kfree = 328 cycles
kmalloc(1024)/kfree = 328 cycles
kmalloc(2048)/kfree = 328 cycles
kmalloc(4096)/kfree = 678 cycles
kmalloc(8192)/kfree = 1013 cycles
kmalloc(16384)/kfree = 1157 cycles
* Slub HEAD, test 2
kmalloc(8)/kfree = 323 cycles
kmalloc(16)/kfree = 318 cycles
kmalloc(32)/kfree = 318 cycles
kmalloc(64)/kfree = 318 cycles
kmalloc(128)/kfree = 318 cycles
kmalloc(256)/kfree = 328 cycles
kmalloc(512)/kfree = 328 cycles
kmalloc(1024)/kfree = 328 cycles
kmalloc(2048)/kfree = 328 cycles
kmalloc(4096)/kfree = 648 cycles
kmalloc(8192)/kfree = 1009 cycles
kmalloc(16384)/kfree = 1105 cycles
* cmpxchg_local Slub test
kmalloc(8)/kfree = 112 cycles
kmalloc(16)/kfree = 103 cycles
kmalloc(32)/kfree = 103 cycles
kmalloc(64)/kfree = 103 cycles
kmalloc(128)/kfree = 112 cycles
kmalloc(256)/kfree = 111 cycles
kmalloc(512)/kfree = 111 cycles
kmalloc(1024)/kfree = 111 cycles
kmalloc(2048)/kfree = 121 cycles
kmalloc(4096)/kfree = 650 cycles
kmalloc(8192)/kfree = 1042 cycles
kmalloc(16384)/kfree = 1149 cycles
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Christoph Lameter ([email protected]) wrote:
> On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
>
> > - Changed smp_rmb() for barrier(). We are not interested in read order
> > across cpus, what we want is to be ordered wrt local interrupts only.
> > barrier() is much cheaper than a rmb().
>
> But this means a preempt disable is required. RT users do not want that.
> Without disabling preemption the process can be moved to another processor after c has been determined.
> That is why the smp_rmb() is there.
Disabling preemption is required if we want to use cmpxchg_local anyway.
We may have to find a way to use preempt disabling while being able to give an
upper bound on the preempt-disabled execution time. I think I got a way
to do this yesterday.. I'll dig in my patches.
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> * cmpxchg_local Slub test
> kmalloc(8) = 83 cycles kfree = 363 cycles
> kmalloc(16) = 85 cycles kfree = 372 cycles
> kmalloc(32) = 92 cycles kfree = 377 cycles
> kmalloc(64) = 115 cycles kfree = 397 cycles
> kmalloc(128) = 179 cycles kfree = 438 cycles
So for consecutive allocs of small slabs up to 128 bytes this effectively
doubles the speed of kmalloc.
> kmalloc(256) = 314 cycles kfree = 564 cycles
> kmalloc(512) = 398 cycles kfree = 615 cycles
> kmalloc(1024) = 573 cycles kfree = 745 cycles
Less of a benefit.
> kmalloc(2048) = 629 cycles kfree = 816 cycles
Almost as before.
> kmalloc(4096) = 473 cycles kfree = 548 cycles
> kmalloc(8192) = 659 cycles kfree = 745 cycles
> kmalloc(16384) = 724 cycles kfree = 843 cycles
Page allocator pass through measurements.
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> kmalloc(8)/kfree = 112 cycles
> kmalloc(16)/kfree = 103 cycles
> kmalloc(32)/kfree = 103 cycles
> kmalloc(64)/kfree = 103 cycles
> kmalloc(128)/kfree = 112 cycles
> kmalloc(256)/kfree = 111 cycles
> kmalloc(512)/kfree = 111 cycles
> kmalloc(1024)/kfree = 111 cycles
> kmalloc(2048)/kfree = 121 cycles
Looks good. This improves handling for short lived objects about
threefold.
> kmalloc(4096)/kfree = 650 cycles
> kmalloc(8192)/kfree = 1042 cycles
> kmalloc(16384)/kfree = 1149 cycles
Hmmm... The page allocator is really bad here....
Could we use the cmpxchg_local approach for the per cpu queues in the
page_allocator? May have an even greater influence on overall system
performance than the SLUB changes.
* Christoph Lameter ([email protected]) wrote:
> On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
>
> > SLUB Use cmpxchg() everywhere.
> >
> > It applies to "SLUB: Single atomic instruction alloc/free using
> > cmpxchg".
>
> > +++ slab/mm/slub.c 2007-08-20 18:42:28.000000000 -0400
> > @@ -1682,7 +1682,7 @@ redo:
> >
> > object[c->offset] = freelist;
> >
> > - if (unlikely(cmpxchg_local(&c->freelist, freelist, object) != freelist))
> > + if (unlikely(cmpxchg(&c->freelist, freelist, object) != freelist))
> > goto redo;
> > return;
> > slow:
>
> Ok so regular cmpxchg, no cmpxchg_local. cmpxchg_local does not bring
> anything more? My measurements did not show any difference. I measured on
> Athlon64. What processor is being used?
>
This patch only cleans up the tree before proposing my cmpxchg_local
changes. There was an inconsistent use of cmpxchg/cmpxchg_local there.
Using cmpxchg_local vs cmpxchg has a clear impact on the fast paths, as
shown below: it saves about 60 to 70 cycles for kmalloc and 200 cycles
for the kmalloc/kfree pair (test 2).
Pros :
- we can use barrier() instead of rmb()
- cmpxchg_local is faster
Con :
- we must disable preemption
I use a 3GHz Pentium 4 for my tests.
Results (compared to cmpxchg_local numbers) :
SLUB Performance testing
========================
1. Kmalloc: Repeatedly allocate then free test
(kfree here is slow path)
* cmpxchg
kmalloc(8) = 271 cycles kfree = 645 cycles
kmalloc(16) = 158 cycles kfree = 428 cycles
kmalloc(32) = 153 cycles kfree = 446 cycles
kmalloc(64) = 178 cycles kfree = 459 cycles
kmalloc(128) = 247 cycles kfree = 481 cycles
kmalloc(256) = 363 cycles kfree = 605 cycles
kmalloc(512) = 449 cycles kfree = 677 cycles
kmalloc(1024) = 626 cycles kfree = 810 cycles
kmalloc(2048) = 681 cycles kfree = 869 cycles
kmalloc(4096) = 471 cycles kfree = 575 cycles
kmalloc(8192) = 666 cycles kfree = 747 cycles
kmalloc(16384) = 736 cycles kfree = 853 cycles
* cmpxchg_local
kmalloc(8) = 83 cycles kfree = 363 cycles
kmalloc(16) = 85 cycles kfree = 372 cycles
kmalloc(32) = 92 cycles kfree = 377 cycles
kmalloc(64) = 115 cycles kfree = 397 cycles
kmalloc(128) = 179 cycles kfree = 438 cycles
kmalloc(256) = 314 cycles kfree = 564 cycles
kmalloc(512) = 398 cycles kfree = 615 cycles
kmalloc(1024) = 573 cycles kfree = 745 cycles
kmalloc(2048) = 629 cycles kfree = 816 cycles
kmalloc(4096) = 473 cycles kfree = 548 cycles
kmalloc(8192) = 659 cycles kfree = 745 cycles
kmalloc(16384) = 724 cycles kfree = 843 cycles
2. Kmalloc: alloc/free test
* cmpxchg
kmalloc(8)/kfree = 321 cycles
kmalloc(16)/kfree = 308 cycles
kmalloc(32)/kfree = 311 cycles
kmalloc(64)/kfree = 310 cycles
kmalloc(128)/kfree = 306 cycles
kmalloc(256)/kfree = 325 cycles
kmalloc(512)/kfree = 324 cycles
kmalloc(1024)/kfree = 322 cycles
kmalloc(2048)/kfree = 309 cycles
kmalloc(4096)/kfree = 678 cycles
kmalloc(8192)/kfree = 1027 cycles
kmalloc(16384)/kfree = 1204 cycles
* cmpxchg_local
kmalloc(8)/kfree = 112 cycles
kmalloc(16)/kfree = 103 cycles
kmalloc(32)/kfree = 103 cycles
kmalloc(64)/kfree = 103 cycles
kmalloc(128)/kfree = 112 cycles
kmalloc(256)/kfree = 111 cycles
kmalloc(512)/kfree = 111 cycles
kmalloc(1024)/kfree = 111 cycles
kmalloc(2048)/kfree = 121 cycles
kmalloc(4096)/kfree = 650 cycles
kmalloc(8192)/kfree = 1042 cycles
kmalloc(16384)/kfree = 1149 cycles
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> Using cmpxchg_local vs cmpxchg has a clear impact on the fast paths, as
> shown below: it saves about 60 to 70 cycles for kmalloc and 200 cycles
> for the kmalloc/kfree pair (test 2).
Hmmmm.. I wonder if the AMD processors simply do the same in either
version.
* Mathieu Desnoyers ([email protected]) wrote:
> * Christoph Lameter ([email protected]) wrote:
> > On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> >
> > > - Changed smp_rmb() for barrier(). We are not interested in read order
> > > across cpus, what we want is to be ordered wrt local interrupts only.
> > > barrier() is much cheaper than a rmb().
> >
> > But this means a preempt disable is required. RT users do not want that.
> > Without disabling preemption the process can be moved to another processor after c has been determined.
> > That is why the smp_rmb() is there.
>
> Disabling preemption is required if we want to use cmpxchg_local anyway.
>
> We may have to find a way to use preempt disabling while being able to give an
> upper bound on the preempt-disabled execution time. I think I got a way
> to do this yesterday.. I'll dig in my patches.
>
Yeah, I remember having done so : moving the preempt disable nearer to
the cmpxchg, checking if the cpuid has changed between the
raw_smp_processor_id() read and the preempt_disable done later, redo if
it is different. It makes the slow path faster, but makes the fast path
more complex, therefore I finally dropped the patch. And we are talking about
~10 cycles for the slow path here; I doubt it's worth the complexity
added to the fast path.
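For reference, the idea looked roughly like this (a sketch reconstructed from
the description above, not the dropped patch itself; pin_cpu_slab is a
hypothetical helper name, the other names follow the slub patches in this
thread):

/*
 * Sketch of the dropped approach (reconstructed, not the original
 * patch): read the cpu id without pinning, pin only around the
 * cmpxchg_local, and retry if we migrated in between.
 */
static struct kmem_cache_cpu *pin_cpu_slab(struct kmem_cache *s)
{
        struct kmem_cache_cpu *c;
        int cpu;

        for (;;) {
                cpu = raw_smp_processor_id();
                c = get_cpu_slab(s, cpu);
                preempt_disable();
                if (likely(cpu == smp_processor_id()))
                        return c;       /* still on the same cpu, now pinned */
                preempt_enable();       /* migrated, redo on the new cpu */
        }
}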
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> Are you running a UP or SMP kernel ? If you run a UP kernel, the
> cmpxchg_local and cmpxchg are identical.
UP.
> Oh, and if you run your tests at boot time, the alternatives code may
> have removed the lock prefix, therefore making cmpxchg and cmpxchg_local
> exactly the same.
Tests were run at boot time.
That still does not explain kmalloc not showing improvements.
* Christoph Lameter ([email protected]) wrote:
> On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
>
> > Using cmpxchg_local vs cmpxchg has a clear impact on the fast paths, as
> > shown below: it saves about 60 to 70 cycles for kmalloc and 200 cycles
> > for the kmalloc/kfree pair (test 2).
>
> Hmmmm.. I wonder if the AMD processors simply do the same in either
> version.
Not supposed to. I remember having posted numbers that show a
difference.
Are you running a UP or SMP kernel ? If you run a UP kernel, the
cmpxchg_local and cmpxchg are identical.
Oh, and if you run your tests at boot time, the alternatives code may
have removed the lock prefix, therefore making cmpxchg and cmpxchg_local
exactly the same.
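(For reference, a simplified sketch, written from memory and not copied
verbatim from the tree, of how the lock prefix is emitted on i386: each lock
byte is recorded in .smp_locks so the alternatives code can patch it out while
only one cpu is online, which is why a boot-time benchmark can see identical
cmpxchg and cmpxchg_local numbers.)

#ifdef CONFIG_SMP
#define LOCK_PREFIX \
                ".section .smp_locks,\"a\"\n"   \
                "  .align 4\n"                  \
                "  .long 661f\n" /* address of the lock byte */ \
                ".previous\n"                   \
                "661:\n\tlock; "
#else
#define LOCK_PREFIX ""  /* UP build: never locked at all */
#endif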
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Christoph Lameter ([email protected]) wrote:
> On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
>
> > Are you running a UP or SMP kernel ? If you run a UP kernel, the
> > cmpxchg_local and cmpxchg are identical.
>
> UP.
>
> > Oh, and if you run your tests at boot time, the alternatives code may
> > have removed the lock prefix, therefore making cmpxchg and cmpxchg_local
> > exactly the same.
>
> Tests were run at boot time.
>
> That still does not explain kmalloc not showing improvements.
>
Hrm, weird.. because it should. Here are the numbers I posted
previously:
The measurements I get (in cycles):
                enable interrupts (STI)  disable interrupts (CLI)  local CMPXCHG
IA32 (P4)                112                        82                  26
x86_64 AMD64             125                       102                  19
So both AMD64 and IA32 should be improved.
So why are those improvements not shown in your test ? A few possible
causes:
- Do you have any CONFIG_DEBUG_* options activated ? smp_processor_id()
may end up being more expensive in these cases.
- Rounding error.. you seem to round at 0.1ms, but I keep the values in
cycles. The times that you get (1.1ms) seem strangely higher than
mine, which are under 1000 cycles on a 3GHz system (less than 333ns).
I guess there is both a ms - ns error there and/or not enough
precision in your numbers.
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> - Rounding error.. you seem to round at 0.1ms, but I keep the values in
> cycles. The times that you get (1.1ms) seem strangely higher than
> mine, which are under 1000 cycles on a 3GHz system (less than 333ns).
> I guess there is both a ms - ns error there and/or not enough
> precision in your numbers.
Nope, the rounding for output depends on the amount. It rounds to one
digit after whatever unit we figured out is best to display.
And multiplications (cyc2ns) do not result in rounding errors.
* Christoph Lameter ([email protected]) wrote:
> On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
>
> > - Rounding error.. you seem to round at 0.1ms, but I keep the values in
> > cycles. The times that you get (1.1ms) seem strangely higher than
> > mine, which are under 1000 cycles on a 3GHz system (less than 333ns).
> > I guess there is both a ms - ns error there and/or not enough
> > precision in your numbers.
>
> Nope, the rounding for output depends on the amount. It rounds to one
> digit after whatever unit we figured out is best to display.
>
> And multiplications (cyc2ns) do not result in rounding errors.
>
Ok, I see now that the 1.1ms was for the 10000 iterations, which makes
it about 230 ns/iteration for the 10000 times kmalloc(8) = 2.3ms test.
As I am going back through the initial cmpxchg_local implementation, it
seems like it was executing __slab_alloc() with preemption disabled,
which is wrong. new_slab() is not designed for that.
I'll try to run my tests on AMD64.
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Mathieu Desnoyers <[email protected]> writes:
>
> The measurements I get (in cycles):
> enable interrupts (STI) disable interrupts (CLI) local CMPXCHG
> IA32 (P4) 112 82 26
> x86_64 AMD64 125 102 19
What exactly did you benchmark here? On K8 CLI/STI are only supposed
to be a few cycles. pushf/popf might be more expensive, but not that much.
-Andi
On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
> As I am going back through the initial cmpxchg_local implementation, it
> seems like it was executing __slab_alloc() with preemption disabled,
> which is wrong. new_slab() is not designed for that.
The version I send you did not use preemption.
We need to make a decision if we want to go without preemption and cmpxchg
or with preemption and cmpxchg_local.
If we really want to do this then the implementation of all of these
components need to result in competitive performance on all platforms.
* Andi Kleen ([email protected]) wrote:
> Mathieu Desnoyers <[email protected]> writes:
> >
> > The measurements I get (in cycles):
> > enable interrupts (STI) disable interrupts (CLI) local CMPXCHG
> > IA32 (P4) 112 82 26
> > x86_64 AMD64 125 102 19
>
> What exactly did you benchmark here? On K8 CLI/STI are only supposed
> to be a few cycles. pushf/popf might be more expensive, but not that much.
>
Hi Andi,
I benchmarked cmpxchg_local vs local_irq_save/local_irq_restore.
Details, and code, follow.
* cpuinfo:
processor : 0
vendor_id : AuthenticAMD
cpu family : 15
model : 35
model name : AMD Athlon(tm)64 X2 Dual Core Processor 3800+
stepping : 2
cpu MHz : 2009.204
cache size : 512 KB
physical id : 0
siblings : 2
core id : 0
cpu cores : 2
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt lm 3dnowext 3dnow pni lahf_lm cmp_legacy
bogomips : 4023.38
TLB size : 1024 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 40 bits physical, 48 bits virtual
power management: ts fid vid ttp
processor : 1
vendor_id : AuthenticAMD
cpu family : 15
model : 35
model name : AMD Athlon(tm)64 X2 Dual Core Processor 3800+
stepping : 2
cpu MHz : 2009.204
cache size : 512 KB
physical id : 0
siblings : 2
core id : 1
cpu cores : 2
fpu : yes
fpu_exception : yes
cpuid level : 1
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt lm 3dnowext 3dnow pni lahf_lm cmp_legacy
bogomips : 4018.49
TLB size : 1024 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 40 bits physical, 48 bits virtual
power management: ts fid vid ttp
* Test ran:
/* test-cmpxchg-nolock.c
 *
 * Compare local cmpxchg with irq disable / enable.
 */

#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/calc64.h>
#include <asm/timex.h>
#include <asm/system.h>

#define NR_LOOPS 20000

int test_val;

static void do_test_cmpxchg(void)
{
        int ret;
        unsigned long flags;
        unsigned int i;
        cycles_t time1, time2, time;
        long rem;

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                ret = cmpxchg_local(&test_val, 0, 0);
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for non locked cmpxchg\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_long_long_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> non locked cmpxchg takes %llu cycles\n", time);
        printk(KERN_ALERT "test end\n");
}

/*
 * This test will have a higher standard deviation due to incoming interrupts.
 */
static void do_test_enable_int(void)
{
        unsigned long flags;
        unsigned int i;
        cycles_t time1, time2, time;
        long rem;

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                local_irq_restore(flags);
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for enabling interrupts (STI)\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_long_long_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> enabling interrupts (STI) takes %llu cycles\n",
                time);
        printk(KERN_ALERT "test end\n");
}

static void do_test_disable_int(void)
{
        unsigned long flags, flags2;
        unsigned int i;
        cycles_t time1, time2, time;
        long rem;

        local_irq_save(flags);
        preempt_disable();
        time1 = get_cycles();
        for (i = 0; i < NR_LOOPS; i++) {
                local_irq_save(flags2);
        }
        time2 = get_cycles();
        local_irq_restore(flags);
        preempt_enable();
        time = time2 - time1;

        printk(KERN_ALERT "test results: time for disabling interrupts (CLI)\n");
        printk(KERN_ALERT "number of loops: %d\n", NR_LOOPS);
        printk(KERN_ALERT "total time: %llu\n", time);
        time = div_long_long_rem(time, NR_LOOPS, &rem);
        printk(KERN_ALERT "-> disabling interrupts (CLI) takes %llu cycles\n",
                time);
        printk(KERN_ALERT "test end\n");
}

static int ltt_test_init(void)
{
        printk(KERN_ALERT "test init\n");
        do_test_cmpxchg();
        do_test_enable_int();
        do_test_disable_int();
        return -EAGAIN; /* Fail will directly unload the module */
}

static void ltt_test_exit(void)
{
        printk(KERN_ALERT "test exit\n");
}

module_init(ltt_test_init)
module_exit(ltt_test_exit)

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mathieu Desnoyers");
MODULE_DESCRIPTION("Cmpxchg vs int Test");
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Ok. Measurements vs. simple cmpxchg on an Intel(R) Pentium(R) 4 CPU 3.20GHz
(hyperthreading enabled). Test runs with your module show only minor
performance improvements and lots of regressions. So we must have
cmpxchg_local to see any improvements? Some kind of a recent optimization
of cmpxchg performance that we do not see on older cpus?
Code of kmem_cache_alloc (to show you that there are no debug options on):
Dump of assembler code for function kmem_cache_alloc:
0x4015cfa9 <kmem_cache_alloc+0>: push %ebp
0x4015cfaa <kmem_cache_alloc+1>: mov %esp,%ebp
0x4015cfac <kmem_cache_alloc+3>: push %edi
0x4015cfad <kmem_cache_alloc+4>: push %esi
0x4015cfae <kmem_cache_alloc+5>: push %ebx
0x4015cfaf <kmem_cache_alloc+6>: sub $0x10,%esp
0x4015cfb2 <kmem_cache_alloc+9>: mov %eax,%esi
0x4015cfb4 <kmem_cache_alloc+11>: mov %edx,0xffffffe8(%ebp)
0x4015cfb7 <kmem_cache_alloc+14>: mov 0x4(%ebp),%eax
0x4015cfba <kmem_cache_alloc+17>: mov %eax,0xfffffff0(%ebp)
0x4015cfbd <kmem_cache_alloc+20>: mov %fs:0x404af008,%eax
0x4015cfc3 <kmem_cache_alloc+26>: mov 0x90(%esi,%eax,4),%edi
0x4015cfca <kmem_cache_alloc+33>: mov (%edi),%ecx
0x4015cfcc <kmem_cache_alloc+35>: test %ecx,%ecx
0x4015cfce <kmem_cache_alloc+37>: je 0x4015d00a <kmem_cache_alloc+97>
0x4015cfd0 <kmem_cache_alloc+39>: mov 0xc(%edi),%eax
0x4015cfd3 <kmem_cache_alloc+42>: mov (%ecx,%eax,4),%eax
0x4015cfd6 <kmem_cache_alloc+45>: mov %eax,%edx
0x4015cfd8 <kmem_cache_alloc+47>: mov %ecx,%eax
0x4015cfda <kmem_cache_alloc+49>: lock cmpxchg %edx,(%edi)
0x4015cfde <kmem_cache_alloc+53>: mov %eax,%ebx
0x4015cfe0 <kmem_cache_alloc+55>: cmp %ecx,%eax
0x4015cfe2 <kmem_cache_alloc+57>: jne 0x4015cfbd <kmem_cache_alloc+20>
0x4015cfe4 <kmem_cache_alloc+59>: cmpw $0x0,0xffffffe8(%ebp)
0x4015cfe9 <kmem_cache_alloc+64>: jns 0x4015d006 <kmem_cache_alloc+93>
0x4015cfeb <kmem_cache_alloc+66>: mov 0x10(%edi),%edx
0x4015cfee <kmem_cache_alloc+69>: xor %eax,%eax
0x4015cff0 <kmem_cache_alloc+71>: mov %edx,%ecx
0x4015cff2 <kmem_cache_alloc+73>: shr $0x2,%ecx
0x4015cff5 <kmem_cache_alloc+76>: mov %ebx,%edi
Base
1. Kmalloc: Repeatedly allocate then free test
10000 times kmalloc(8) -> 332 cycles kfree -> 422 cycles
10000 times kmalloc(16) -> 218 cycles kfree -> 360 cycles
10000 times kmalloc(32) -> 214 cycles kfree -> 368 cycles
10000 times kmalloc(64) -> 244 cycles kfree -> 390 cycles
10000 times kmalloc(128) -> 320 cycles kfree -> 417 cycles
10000 times kmalloc(256) -> 438 cycles kfree -> 550 cycles
10000 times kmalloc(512) -> 527 cycles kfree -> 626 cycles
10000 times kmalloc(1024) -> 678 cycles kfree -> 775 cycles
10000 times kmalloc(2048) -> 748 cycles kfree -> 822 cycles
10000 times kmalloc(4096) -> 641 cycles kfree -> 650 cycles
10000 times kmalloc(8192) -> 741 cycles kfree -> 817 cycles
10000 times kmalloc(16384) -> 872 cycles kfree -> 927 cycles
2. Kmalloc: alloc/free test
10000 times kmalloc(8)/kfree -> 332 cycles
10000 times kmalloc(16)/kfree -> 327 cycles
10000 times kmalloc(32)/kfree -> 323 cycles
10000 times kmalloc(64)/kfree -> 320 cycles
10000 times kmalloc(128)/kfree -> 320 cycles
10000 times kmalloc(256)/kfree -> 333 cycles
10000 times kmalloc(512)/kfree -> 332 cycles
10000 times kmalloc(1024)/kfree -> 330 cycles
10000 times kmalloc(2048)/kfree -> 334 cycles
10000 times kmalloc(4096)/kfree -> 674 cycles
10000 times kmalloc(8192)/kfree -> 1155 cycles
10000 times kmalloc(16384)/kfree -> 1226 cycles
Slub cmpxchg.
1. Kmalloc: Repeatedly allocate then free test
10000 times kmalloc(8) -> 296 cycles kfree -> 515 cycles
10000 times kmalloc(16) -> 193 cycles kfree -> 412 cycles
10000 times kmalloc(32) -> 188 cycles kfree -> 422 cycles
10000 times kmalloc(64) -> 222 cycles kfree -> 441 cycles
10000 times kmalloc(128) -> 292 cycles kfree -> 476 cycles
10000 times kmalloc(256) -> 414 cycles kfree -> 589 cycles
10000 times kmalloc(512) -> 513 cycles kfree -> 673 cycles
10000 times kmalloc(1024) -> 694 cycles kfree -> 825 cycles
10000 times kmalloc(2048) -> 739 cycles kfree -> 878 cycles
10000 times kmalloc(4096) -> 636 cycles kfree -> 653 cycles
10000 times kmalloc(8192) -> 715 cycles kfree -> 799 cycles
10000 times kmalloc(16384) -> 855 cycles kfree -> 927 cycles
2. Kmalloc: alloc/free test
10000 times kmalloc(8)/kfree -> 354 cycles
10000 times kmalloc(16)/kfree -> 336 cycles
10000 times kmalloc(32)/kfree -> 335 cycles
10000 times kmalloc(64)/kfree -> 337 cycles
10000 times kmalloc(128)/kfree -> 337 cycles
10000 times kmalloc(256)/kfree -> 355 cycles
10000 times kmalloc(512)/kfree -> 354 cycles
10000 times kmalloc(1024)/kfree -> 337 cycles
10000 times kmalloc(2048)/kfree -> 339 cycles
10000 times kmalloc(4096)/kfree -> 674 cycles
10000 times kmalloc(8192)/kfree -> 1128 cycles
10000 times kmalloc(16384)/kfree -> 1240 cycles
* Christoph Lameter ([email protected]) wrote:
> Ok. Measurements vs. simple cmpxchg on an Intel(R) Pentium(R) 4 CPU 3.20GHz
> (hyperthreading enabled). Test runs with your module show only minor
> performance improvements and lots of regressions. So we must have
> cmpxchg_local to see any improvements? Some kind of a recent optimization
> of cmpxchg performance that we do not see on older cpus?
>
I did not expect the cmpxchg with LOCK prefix to be faster than irq
save/restore. You will need to run these tests using cmpxchg_local to
see an improvement.
Mathieu
>
> Code of kmem_cache_alloc (to show you that there are no debug options on):
>
> Dump of assembler code for function kmem_cache_alloc:
> 0x4015cfa9 <kmem_cache_alloc+0>: push %ebp
> 0x4015cfaa <kmem_cache_alloc+1>: mov %esp,%ebp
> 0x4015cfac <kmem_cache_alloc+3>: push %edi
> 0x4015cfad <kmem_cache_alloc+4>: push %esi
> 0x4015cfae <kmem_cache_alloc+5>: push %ebx
> 0x4015cfaf <kmem_cache_alloc+6>: sub $0x10,%esp
> 0x4015cfb2 <kmem_cache_alloc+9>: mov %eax,%esi
> 0x4015cfb4 <kmem_cache_alloc+11>: mov %edx,0xffffffe8(%ebp)
> 0x4015cfb7 <kmem_cache_alloc+14>: mov 0x4(%ebp),%eax
> 0x4015cfba <kmem_cache_alloc+17>: mov %eax,0xfffffff0(%ebp)
> 0x4015cfbd <kmem_cache_alloc+20>: mov %fs:0x404af008,%eax
> 0x4015cfc3 <kmem_cache_alloc+26>: mov 0x90(%esi,%eax,4),%edi
> 0x4015cfca <kmem_cache_alloc+33>: mov (%edi),%ecx
> 0x4015cfcc <kmem_cache_alloc+35>: test %ecx,%ecx
> 0x4015cfce <kmem_cache_alloc+37>: je 0x4015d00a <kmem_cache_alloc+97>
> 0x4015cfd0 <kmem_cache_alloc+39>: mov 0xc(%edi),%eax
> 0x4015cfd3 <kmem_cache_alloc+42>: mov (%ecx,%eax,4),%eax
> 0x4015cfd6 <kmem_cache_alloc+45>: mov %eax,%edx
> 0x4015cfd8 <kmem_cache_alloc+47>: mov %ecx,%eax
> 0x4015cfda <kmem_cache_alloc+49>: lock cmpxchg %edx,(%edi)
> 0x4015cfde <kmem_cache_alloc+53>: mov %eax,%ebx
> 0x4015cfe0 <kmem_cache_alloc+55>: cmp %ecx,%eax
> 0x4015cfe2 <kmem_cache_alloc+57>: jne 0x4015cfbd <kmem_cache_alloc+20>
> 0x4015cfe4 <kmem_cache_alloc+59>: cmpw $0x0,0xffffffe8(%ebp)
> 0x4015cfe9 <kmem_cache_alloc+64>: jns 0x4015d006 <kmem_cache_alloc+93>
> 0x4015cfeb <kmem_cache_alloc+66>: mov 0x10(%edi),%edx
> 0x4015cfee <kmem_cache_alloc+69>: xor %eax,%eax
> 0x4015cff0 <kmem_cache_alloc+71>: mov %edx,%ecx
> 0x4015cff2 <kmem_cache_alloc+73>: shr $0x2,%ecx
> 0x4015cff5 <kmem_cache_alloc+76>: mov %ebx,%edi
>
> Base
>
> 1. Kmalloc: Repeatedly allocate then free test
> 10000 times kmalloc(8) -> 332 cycles kfree -> 422 cycles
> 10000 times kmalloc(16) -> 218 cycles kfree -> 360 cycles
> 10000 times kmalloc(32) -> 214 cycles kfree -> 368 cycles
> 10000 times kmalloc(64) -> 244 cycles kfree -> 390 cycles
> 10000 times kmalloc(128) -> 320 cycles kfree -> 417 cycles
> 10000 times kmalloc(256) -> 438 cycles kfree -> 550 cycles
> 10000 times kmalloc(512) -> 527 cycles kfree -> 626 cycles
> 10000 times kmalloc(1024) -> 678 cycles kfree -> 775 cycles
> 10000 times kmalloc(2048) -> 748 cycles kfree -> 822 cycles
> 10000 times kmalloc(4096) -> 641 cycles kfree -> 650 cycles
> 10000 times kmalloc(8192) -> 741 cycles kfree -> 817 cycles
> 10000 times kmalloc(16384) -> 872 cycles kfree -> 927 cycles
> 2. Kmalloc: alloc/free test
> 10000 times kmalloc(8)/kfree -> 332 cycles
> 10000 times kmalloc(16)/kfree -> 327 cycles
> 10000 times kmalloc(32)/kfree -> 323 cycles
> 10000 times kmalloc(64)/kfree -> 320 cycles
> 10000 times kmalloc(128)/kfree -> 320 cycles
> 10000 times kmalloc(256)/kfree -> 333 cycles
> 10000 times kmalloc(512)/kfree -> 332 cycles
> 10000 times kmalloc(1024)/kfree -> 330 cycles
> 10000 times kmalloc(2048)/kfree -> 334 cycles
> 10000 times kmalloc(4096)/kfree -> 674 cycles
> 10000 times kmalloc(8192)/kfree -> 1155 cycles
> 10000 times kmalloc(16384)/kfree -> 1226 cycles
>
> Slub cmpxchg.
>
> 1. Kmalloc: Repeatedly allocate then free test
> 10000 times kmalloc(8) -> 296 cycles kfree -> 515 cycles
> 10000 times kmalloc(16) -> 193 cycles kfree -> 412 cycles
> 10000 times kmalloc(32) -> 188 cycles kfree -> 422 cycles
> 10000 times kmalloc(64) -> 222 cycles kfree -> 441 cycles
> 10000 times kmalloc(128) -> 292 cycles kfree -> 476 cycles
> 10000 times kmalloc(256) -> 414 cycles kfree -> 589 cycles
> 10000 times kmalloc(512) -> 513 cycles kfree -> 673 cycles
> 10000 times kmalloc(1024) -> 694 cycles kfree -> 825 cycles
> 10000 times kmalloc(2048) -> 739 cycles kfree -> 878 cycles
> 10000 times kmalloc(4096) -> 636 cycles kfree -> 653 cycles
> 10000 times kmalloc(8192) -> 715 cycles kfree -> 799 cycles
> 10000 times kmalloc(16384) -> 855 cycles kfree -> 927 cycles
> 2. Kmalloc: alloc/free test
> 10000 times kmalloc(8)/kfree -> 354 cycles
> 10000 times kmalloc(16)/kfree -> 336 cycles
> 10000 times kmalloc(32)/kfree -> 335 cycles
> 10000 times kmalloc(64)/kfree -> 337 cycles
> 10000 times kmalloc(128)/kfree -> 337 cycles
> 10000 times kmalloc(256)/kfree -> 355 cycles
> 10000 times kmalloc(512)/kfree -> 354 cycles
> 10000 times kmalloc(1024)/kfree -> 337 cycles
> 10000 times kmalloc(2048)/kfree -> 339 cycles
> 10000 times kmalloc(4096)/kfree -> 674 cycles
> 10000 times kmalloc(8192)/kfree -> 1128 cycles
> 10000 times kmalloc(16384)/kfree -> 1240 cycles
>
>
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Christoph Lameter ([email protected]) wrote:
> On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
>
> > As I am going back through the initial cmpxchg_local implementation, it
> > seems like it was executing __slab_alloc() with preemption disabled,
> > which is wrong. new_slab() is not designed for that.
>
> The version I send you did not use preemption.
>
> We need to make a decision if we want to go without preemption and cmpxchg
> or with preemption and cmpxchg_local.
>
I don't expect any performance improvements with cmpxchg() over irq
disable/restore. I think we'll have to use cmpxchg_local.
Also, we may argue that locked cmpxchg will have more scalability impact
than cmpxchg_local. Actually, I expect the LOCK prefix to have a bigger
scalability impact than the irq save/restore pair.
> If we really want to do this then the implementation of all of these
> components need to result in competitive performance on all platforms.
>
The minor issue I see here is on architectures where we have to simulate
cmpxchg_local with irq save/restore. Depending on how we implement the
code, it may result in two irq save/restore pairs instead of one, which
could make the code slower. However, if we are clever enough in our
low-level primitive usage, I think we could make the code use
cmpxchg_local when available and fall back on only _one_ irq disabled
section surrounding the whole code for other architectures.
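To make the "two pairs" concern concrete, the emulation on such an
architecture would look roughly like this (hypothetical sketch, not an
existing header), so calling it from a region that already did
local_irq_save() nests a second, redundant save/restore pair:

/*
 * Hypothetical emulation of cmpxchg_local() for an architecture without
 * a native one (sketch, 32-bit word case only). It is only atomic wrt
 * the local cpu.
 */
static inline unsigned long emulated_cmpxchg_local(volatile unsigned long *ptr,
                unsigned long old, unsigned long new)
{
        unsigned long flags, prev;

        local_irq_save(flags);
        prev = *ptr;
        if (prev == old)
                *ptr = new;
        local_irq_restore(flags);
        return prev;
}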
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, Aug 21, 2007 at 06:06:19PM -0700, Christoph Lameter wrote:
> Ok. Measurements vs. simple cmpxchg on an Intel(R) Pentium(R) 4 CPU 3.20GHz
Note the P4 is an extreme case in that "unusual" instructions are
quite slow (basically anything that falls out of the trace cache). Core2
tends to be much more benign and generally acts more like a K8 in latencies.
There are millions and millions of P4s around of course and we
shouldn't disregard them, but they're not the future and not
highest priority.
-Andi
On Wed, Aug 22, 2007 at 09:45:33AM -0400, Mathieu Desnoyers wrote:
> Measurements on an AMD64 2.0 GHz dual-core
>
> In this test, we seem to remove 10 cycles from the kmalloc fast path.
> On small allocations, it gives a 14% performance increase. kfree fast
> path also seems to have a 10 cycles improvement.
Looks good. Anything that makes kmalloc faster is good
-Andi
Measurements on an AMD64 2.0 GHz dual-core
In this test, we seem to remove 10 cycles from the kmalloc fast path.
On small allocations, it gives a 14% performance increase. kfree fast
path also seems to have a 10 cycles improvement.
1. Kmalloc: Repeatedly allocate then free test
* cmpxchg_local slub
kmalloc(8) = 63 cycles kfree = 126 cycles
kmalloc(16) = 66 cycles kfree = 129 cycles
kmalloc(32) = 76 cycles kfree = 138 cycles
kmalloc(64) = 100 cycles kfree = 288 cycles
kmalloc(128) = 128 cycles kfree = 309 cycles
kmalloc(256) = 170 cycles kfree = 315 cycles
kmalloc(512) = 221 cycles kfree = 357 cycles
kmalloc(1024) = 324 cycles kfree = 393 cycles
kmalloc(2048) = 354 cycles kfree = 440 cycles
kmalloc(4096) = 394 cycles kfree = 330 cycles
kmalloc(8192) = 523 cycles kfree = 481 cycles
kmalloc(16384) = 643 cycles kfree = 649 cycles
* Base
kmalloc(8) = 74 cycles kfree = 113 cycles
kmalloc(16) = 76 cycles kfree = 116 cycles
kmalloc(32) = 85 cycles kfree = 133 cycles
kmalloc(64) = 111 cycles kfree = 279 cycles
kmalloc(128) = 138 cycles kfree = 294 cycles
kmalloc(256) = 181 cycles kfree = 304 cycles
kmalloc(512) = 237 cycles kfree = 327 cycles
kmalloc(1024) = 340 cycles kfree = 379 cycles
kmalloc(2048) = 378 cycles kfree = 433 cycles
kmalloc(4096) = 399 cycles kfree = 329 cycles
kmalloc(8192) = 528 cycles kfree = 624 cycles
kmalloc(16384) = 651 cycles kfree = 737 cycles
2. Kmalloc: alloc/free test
* cmpxchg_local slub
kmalloc(8)/kfree = 96 cycles
kmalloc(16)/kfree = 97 cycles
kmalloc(32)/kfree = 97 cycles
kmalloc(64)/kfree = 97 cycles
kmalloc(128)/kfree = 97 cycles
kmalloc(256)/kfree = 105 cycles
kmalloc(512)/kfree = 108 cycles
kmalloc(1024)/kfree = 105 cycles
kmalloc(2048)/kfree = 107 cycles
kmalloc(4096)/kfree = 390 cycles
kmalloc(8192)/kfree = 626 cycles
kmalloc(16384)/kfree = 662 cycles
* Base
kmalloc(8)/kfree = 116 cycles
kmalloc(16)/kfree = 116 cycles
kmalloc(32)/kfree = 116 cycles
kmalloc(64)/kfree = 116 cycles
kmalloc(128)/kfree = 116 cycles
kmalloc(256)/kfree = 126 cycles
kmalloc(512)/kfree = 126 cycles
kmalloc(1024)/kfree = 126 cycles
kmalloc(2048)/kfree = 126 cycles
kmalloc(4096)/kfree = 384 cycles
kmalloc(8192)/kfree = 749 cycles
kmalloc(16384)/kfree = 786 cycles
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
define have_arch_cmpxchg()
* Christoph Lameter ([email protected]) wrote:
> If we really want to do this then the implementation of all of these
> components need to result in competitive performance on all platforms.
>
Here is the first of 2 proposed patches to keep the same performance on
architectures that do not implement cmpxchg_local.
Add have_arch_cmpxchg(), which can be used within the body of functions to
select between irq disable and local_cmpxchg.
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
include/linux/hardirq.h | 6 ++++++
1 file changed, 6 insertions(+)
Index: slab/include/linux/hardirq.h
===================================================================
--- slab.orig/include/linux/hardirq.h 2007-08-22 10:27:07.000000000 -0400
+++ slab/include/linux/hardirq.h 2007-08-22 10:27:46.000000000 -0400
@@ -149,4 +149,10 @@ extern void irq_exit(void);
#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0)
#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0)
+#ifdef __HAVE_ARCH_CMPXCHG
+#define have_arch_cmpxchg() 1
+#else
+#define have_arch_cmpxchg() 0
+#endif
+
#endif /* LINUX_HARDIRQ_H */
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
SLUB: use have_arch_cmpxchg()
* Christoph Lameter ([email protected]) wrote:
> If we really want to do this then the implementation of all of these
> components need to result in competitive performance on all platforms.
>
Patch 2 of 2
Use have_arch_cmpxchg() to select between irq disable and cmpxchg_local.
Pro:
- Should provide the same performance on architectures that do not provide
fast local_cmpxchg.
- Improved performance on architectures that provide fast local_cmpxchg.
Cons:
- Does not help code readability, i.e.:
if (have_arch_cmpxchg())
preempt_disable();
else
local_irq_save(flags);
It applies on top of the cmpxchg_local() slub patch.
It depends on the Add have_arch_cmpxchg() patch.
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
mm/slub.c | 65 +++++++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 46 insertions(+), 19 deletions(-)
Index: slab/mm/slub.c
===================================================================
--- slab.orig/mm/slub.c 2007-08-22 10:28:22.000000000 -0400
+++ slab/mm/slub.c 2007-08-22 10:51:53.000000000 -0400
@@ -1450,7 +1450,8 @@ static void *__slab_alloc(struct kmem_ca
void **freelist = NULL;
unsigned long flags;
- local_irq_save(flags);
+ if (have_arch_cmpxchg())
+ local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
if (!c->page)
/* Slab was flushed */
@@ -1482,7 +1483,8 @@ out_object:
c->freelist = object[c->offset];
out:
slab_unlock(c->page);
- local_irq_restore(flags);
+ if (have_arch_cmpxchg())
+ local_irq_restore(flags);
if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
return object;
@@ -1525,7 +1527,8 @@ new_slab:
c->page = new;
goto load_freelist;
}
- local_irq_restore(flags);
+ if (have_arch_cmpxchg())
+ local_irq_restore(flags);
return NULL;
debug:
object = c->page->freelist;
@@ -1553,8 +1556,12 @@ static void __always_inline *slab_alloc(
{
void **object;
struct kmem_cache_cpu *c;
+ unsigned long flags;
- preempt_disable();
+ if (have_arch_cmpxchg())
+ preempt_disable();
+ else
+ local_irq_save(flags);
c = get_cpu_slab(s, raw_smp_processor_id());
if (unlikely(!node_match(c, node)))
goto slow;
@@ -1564,15 +1571,21 @@ redo:
if (unlikely(!object))
goto slow;
- if (unlikely(cmpxchg_local(&c->freelist, object,
- object[c->offset]) != object))
- goto redo;
- preempt_enable();
+ if (have_arch_cmpxchg()) {
+ if (unlikely(cmpxchg_local(&c->freelist, object,
+ object[c->offset]) != object))
+ goto redo;
+ preempt_enable();
+ } else {
+ c->freelist = object[c->offset];
+ local_irq_restore(flags);
+ }
if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
return object;
slow:
- preempt_enable();
+ if (have_arch_cmpxchg())
+ preempt_enable();
return __slab_alloc(s, gfpflags, node, addr);
}
@@ -1605,7 +1618,8 @@ static void __slab_free(struct kmem_cach
void **object = (void *)x;
unsigned long flags;
- local_irq_save(flags);
+ if (have_arch_cmpxchg())
+ local_irq_save(flags);
slab_lock(page);
if (unlikely(SlabDebug(page)))
@@ -1631,7 +1645,8 @@ checks_ok:
out_unlock:
slab_unlock(page);
- local_irq_restore(flags);
+ if (have_arch_cmpxchg())
+ local_irq_restore(flags);
return;
slab_empty:
@@ -1642,7 +1657,8 @@ slab_empty:
remove_partial(s, page);
slab_unlock(page);
- local_irq_restore(flags);
+ if (have_arch_cmpxchg())
+ local_irq_restore(flags);
discard_slab(s, page);
return;
@@ -1669,27 +1685,38 @@ static void __always_inline slab_free(st
void **object = (void *)x;
void **freelist;
struct kmem_cache_cpu *c;
+ unsigned long flags;
- preempt_disable();
+ if (have_arch_cmpxchg())
+ preempt_disable();
+ else
+ local_irq_save(flags);
c = get_cpu_slab(s, raw_smp_processor_id());
if (unlikely(c->node < 0))
goto slow;
redo:
freelist = c->freelist;
- barrier(); /* Read freelist before page, wrt local interrupts */
+ if (have_arch_cmpxchg())
+ barrier(); /* Read freelist before page, wrt local interrupts */
if (unlikely(page != c->page))
goto slow;
object[c->offset] = freelist;
- if (unlikely(cmpxchg_local(&c->freelist,
- freelist, object) != freelist))
- goto redo;
- preempt_enable();
+ if (have_arch_cmpxchg()) {
+ if (unlikely(cmpxchg_local(&c->freelist,
+ freelist, object) != freelist))
+ goto redo;
+ preempt_enable();
+ } else {
+ c->freelist = object;
+ local_irq_restore(flags);
+ }
return;
slow:
- preempt_enable();
+ if (have_arch_cmpxchg())
+ preempt_enable();
__slab_free(s, page, x, addr, c->offset);
}
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Hi Mathieu,
On 8/22/07, Mathieu Desnoyers <[email protected]> wrote:
> Cons:
> - Does not help code readability, i.e.:
>
> if (have_arch_cmpxchg())
> preempt_disable();
> else
> local_irq_save(flags);
Heh, that's an understatement, as now slub is starting to look a bit
like slab... ;-) We need to hide that if-else maze into helper
functions for sure.
On 8/22/07, Mathieu Desnoyers <[email protected]> wrote:
> --- slab.orig/mm/slub.c 2007-08-22 10:28:22.000000000 -0400
> +++ slab/mm/slub.c 2007-08-22 10:51:53.000000000 -0400
> @@ -1450,7 +1450,8 @@ static void *__slab_alloc(struct kmem_ca
> void **freelist = NULL;
> unsigned long flags;
>
> - local_irq_save(flags);
> + if (have_arch_cmpxchg())
> + local_irq_save(flags);
I haven't been following on the cmpxchg_local() discussion too much so
the obvious question is: why do we do local_irq_save() for the _has_
cmpxchg() case here...
On 8/22/07, Mathieu Desnoyers <[email protected]> wrote:
> @@ -1553,8 +1556,12 @@ static void __always_inline *slab_alloc(
> {
> void **object;
> struct kmem_cache_cpu *c;
> + unsigned long flags;
>
> - preempt_disable();
> + if (have_arch_cmpxchg())
> + preempt_disable();
> + else
> + local_irq_save(flags);
...and preempt_disable() here?
On 8/22/07, Mathieu Desnoyers <[email protected]> wrote:
> + if (have_arch_cmpxchg()) {
> + if (unlikely(cmpxchg_local(&c->freelist, object,
> + object[c->offset]) != object))
> + goto redo;
> + preempt_enable();
> + } else {
> + c->freelist = object[c->offset];
> + local_irq_restore(flags);
> + }
If you move the preempt_enable/local_irq_restore pair outside of the
if-else block, you can make a static inline function slob_get_object()
that does:
static inline bool slob_get_object(struct kmem_cache_cpu *c, void **object)
{
if (have_arch_cmpxchg()) {
if (unlikely(cmpxchg_local(&c->freelist, object,
object[c->offset]) != object))
return false;
} else {
c->freelist = object[c->offset];
}
return true;
}
And let the compiler optimize out the branch for the non-cmpxchg case.
Same for the reverse case too (i.e. slob_put_object).
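For the free path, the mirror helper could look something like this
(hypothetical sketch along the same lines; the caller would still do the
preempt_enable()/local_irq_restore() and the redo loop):

static inline bool slob_put_object(struct kmem_cache_cpu *c, void **freelist,
                                   void **object)
{
        if (have_arch_cmpxchg()) {
                if (unlikely(cmpxchg_local(&c->freelist, freelist,
                                object) != freelist))
                        return false;   /* lost a race, caller redoes */
        } else {
                c->freelist = object;   /* irqs already off in the caller */
        }
        return true;
}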
Pekka
On Wed, 22 Aug 2007, Mathieu Desnoyers wrote:
> define have_arch_cmpxchg()
Should this not be
have_arch_cmpxchg_local() ?
cmpxchg does not bring us anything.
I can confirm Mathieu's measurements now:
Athlon64:
regular NUMA/discontig
1. Kmalloc: Repeatedly allocate then free test
10000 times kmalloc(8) -> 79 cycles kfree -> 92 cycles
10000 times kmalloc(16) -> 79 cycles kfree -> 93 cycles
10000 times kmalloc(32) -> 88 cycles kfree -> 95 cycles
10000 times kmalloc(64) -> 124 cycles kfree -> 132 cycles
10000 times kmalloc(128) -> 157 cycles kfree -> 247 cycles
10000 times kmalloc(256) -> 200 cycles kfree -> 257 cycles
10000 times kmalloc(512) -> 250 cycles kfree -> 277 cycles
10000 times kmalloc(1024) -> 337 cycles kfree -> 314 cycles
10000 times kmalloc(2048) -> 365 cycles kfree -> 330 cycles
10000 times kmalloc(4096) -> 352 cycles kfree -> 240 cycles
10000 times kmalloc(8192) -> 456 cycles kfree -> 340 cycles
10000 times kmalloc(16384) -> 646 cycles kfree -> 471 cycles
2. Kmalloc: alloc/free test
10000 times kmalloc(8)/kfree -> 124 cycles
10000 times kmalloc(16)/kfree -> 124 cycles
10000 times kmalloc(32)/kfree -> 124 cycles
10000 times kmalloc(64)/kfree -> 124 cycles
10000 times kmalloc(128)/kfree -> 124 cycles
10000 times kmalloc(256)/kfree -> 132 cycles
10000 times kmalloc(512)/kfree -> 132 cycles
10000 times kmalloc(1024)/kfree -> 132 cycles
10000 times kmalloc(2048)/kfree -> 132 cycles
10000 times kmalloc(4096)/kfree -> 319 cycles
10000 times kmalloc(8192)/kfree -> 486 cycles
10000 times kmalloc(16384)/kfree -> 539 cycles
cmpxchg_local NUMA/discontig
1. Kmalloc: Repeatedly allocate then free test
10000 times kmalloc(8) -> 55 cycles kfree -> 90 cycles
10000 times kmalloc(16) -> 55 cycles kfree -> 92 cycles
10000 times kmalloc(32) -> 70 cycles kfree -> 91 cycles
10000 times kmalloc(64) -> 100 cycles kfree -> 141 cycles
10000 times kmalloc(128) -> 128 cycles kfree -> 233 cycles
10000 times kmalloc(256) -> 172 cycles kfree -> 251 cycles
10000 times kmalloc(512) -> 225 cycles kfree -> 275 cycles
10000 times kmalloc(1024) -> 325 cycles kfree -> 311 cycles
10000 times kmalloc(2048) -> 346 cycles kfree -> 330 cycles
10000 times kmalloc(4096) -> 351 cycles kfree -> 238 cycles
10000 times kmalloc(8192) -> 450 cycles kfree -> 342 cycles
10000 times kmalloc(16384) -> 630 cycles kfree -> 546 cycles
2. Kmalloc: alloc/free test
10000 times kmalloc(8)/kfree -> 81 cycles
10000 times kmalloc(16)/kfree -> 81 cycles
10000 times kmalloc(32)/kfree -> 81 cycles
10000 times kmalloc(64)/kfree -> 81 cycles
10000 times kmalloc(128)/kfree -> 81 cycles
10000 times kmalloc(256)/kfree -> 91 cycles
10000 times kmalloc(512)/kfree -> 90 cycles
10000 times kmalloc(1024)/kfree -> 91 cycles
10000 times kmalloc(2048)/kfree -> 90 cycles
10000 times kmalloc(4096)/kfree -> 318 cycles
10000 times kmalloc(8192)/kfree -> 483 cycles
10000 times kmalloc(16384)/kfree -> 536 cycles
Here is the current cmpxchg_local version that I used for testing.
Signed-off-by: Christoph Lameter <[email protected]>
---
include/linux/slub_def.h | 10 +++---
mm/slub.c | 74 ++++++++++++++++++++++++++++++++---------------
2 files changed, 56 insertions(+), 28 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2007-08-21 22:34:30.000000000 -0700
+++ linux-2.6/mm/slub.c 2007-08-22 02:07:26.000000000 -0700
@@ -1442,13 +1442,18 @@ static void *__slab_alloc(struct kmem_ca
{
void **object;
struct page *new;
+ unsigned long flags;
+ local_irq_save(flags);
+ put_cpu_no_resched();
if (!c->page)
+ /* Slab was flushed */
goto new_slab;
slab_lock(c->page);
if (unlikely(!node_match(c, node)))
goto another_slab;
+
load_freelist:
object = c->page->freelist;
if (unlikely(!object))
@@ -1457,11 +1462,15 @@ load_freelist:
goto debug;
object = c->page->freelist;
- c->freelist = object[c->offset];
c->page->inuse = s->objects;
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
+ c->freelist = object[c->offset];
+out:
slab_unlock(c->page);
+ local_irq_restore(flags);
+ if (unlikely((gfpflags & __GFP_ZERO)))
+ memset(object, 0, c->objsize);
return object;
another_slab:
@@ -1502,6 +1511,7 @@ new_slab:
c->page = new;
goto load_freelist;
}
+ local_irq_restore(flags);
return NULL;
debug:
object = c->page->freelist;
@@ -1511,8 +1521,7 @@ debug:
c->page->inuse++;
c->page->freelist = object[c->offset];
c->node = -1;
- slab_unlock(c->page);
- return object;
+ goto out;
}
/*
@@ -1529,25 +1538,29 @@ static void __always_inline *slab_alloc(
gfp_t gfpflags, int node, void *addr)
{
void **object;
- unsigned long flags;
struct kmem_cache_cpu *c;
- local_irq_save(flags);
- c = get_cpu_slab(s, smp_processor_id());
- if (unlikely(!c->freelist || !node_match(c, node)))
+ c = get_cpu_slab(s, get_cpu());
+redo:
+ object = c->freelist;
+ if (unlikely(!object))
+ goto slow;
- object = __slab_alloc(s, gfpflags, node, addr, c);
+ if (unlikely(!node_match(c, node)))
+ goto slow;
- else {
- object = c->freelist;
- c->freelist = object[c->offset];
- }
- local_irq_restore(flags);
+ if (unlikely(cmpxchg_local(&c->freelist, object,
+ object[c->offset]) != object))
+ goto redo;
- if (unlikely((gfpflags & __GFP_ZERO) && object))
+ put_cpu();
+ if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
return object;
+slow:
+ return __slab_alloc(s, gfpflags, node, addr, c);
+
}
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1577,7 +1590,10 @@ static void __slab_free(struct kmem_cach
{
void *prior;
void **object = (void *)x;
+ unsigned long flags;
+ local_irq_save(flags);
+ put_cpu_no_resched();
slab_lock(page);
if (unlikely(SlabDebug(page)))
@@ -1603,6 +1619,7 @@ checks_ok:
out_unlock:
slab_unlock(page);
+ local_irq_restore(flags);
return;
slab_empty:
@@ -1613,6 +1630,7 @@ slab_empty:
remove_partial(s, page);
slab_unlock(page);
+ local_irq_restore(flags);
discard_slab(s, page);
return;
@@ -1637,19 +1655,29 @@ static void __always_inline slab_free(st
struct page *page, void *x, void *addr)
{
void **object = (void *)x;
- unsigned long flags;
+ void **freelist;
struct kmem_cache_cpu *c;
- local_irq_save(flags);
debug_check_no_locks_freed(object, s->objsize);
- c = get_cpu_slab(s, smp_processor_id());
- if (likely(page == c->page && c->node >= 0)) {
- object[c->offset] = c->freelist;
- c->freelist = object;
- } else
- __slab_free(s, page, x, addr, c->offset);
- local_irq_restore(flags);
+ c = get_cpu_slab(s, get_cpu());
+ if (unlikely(c->node < 0))
+ goto slow;
+redo:
+ freelist = c->freelist;
+ barrier(); /* If interrupt changes c->page -> cmpxchg failure */
+ if (unlikely(page != c->page))
+ goto slow;
+
+ object[c->offset] = freelist;
+ if (unlikely(cmpxchg_local(&c->freelist, freelist, object)
+ != freelist))
+ goto redo;
+
+ put_cpu();
+ return;
+slow:
+ __slab_free(s, page, x, addr, c->offset);
}
void kmem_cache_free(struct kmem_cache *s, void *x)
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h 2007-08-21 22:34:30.000000000 -0700
+++ linux-2.6/include/linux/slub_def.h 2007-08-22 01:56:05.000000000 -0700
@@ -12,11 +12,11 @@
#include <linux/kobject.h>
struct kmem_cache_cpu {
- void **freelist;
- struct page *page;
- int node;
- unsigned int offset;
- unsigned int objsize;
+ void **freelist; /* Updated through local atomic ops */
+ struct page *page; /* Updated with interrupts disabled */
+ int node; /* Updated with interrupts disabled */
+ unsigned int offset; /* Set up on kmem_cache_create() */
+ unsigned int objsize; /* Set up on kmem_cache_create() */
};
struct kmem_cache_node {
* Christoph Lameter ([email protected]) wrote:
> void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
> @@ -1577,7 +1590,10 @@ static void __slab_free(struct kmem_cach
> {
> void *prior;
> void **object = (void *)x;
> + unsigned long flags;
>
> + local_irq_save(flags);
> + put_cpu_no_resched();
Those two lines may skip a preempt_check.
Could we change them to this instead ?
put_cpu();
local_irq_save(flags);
Otherwise, it would be good to call
preempt_check_resched();
After each local_irq_restore() in this function.
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Wed, 22 Aug 2007, Mathieu Desnoyers wrote:
> * Christoph Lameter ([email protected]) wrote:
> > void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
> > @@ -1577,7 +1590,10 @@ static void __slab_free(struct kmem_cach
> > {
> > void *prior;
> > void **object = (void *)x;
> > + unsigned long flags;
> >
> > + local_irq_save(flags);
> > + put_cpu_no_resched();
>
> Those two lines may skip a preempt_check.
Yes we cannot execute something else here.
> Could we change them to this instead ?
>
> put_cpu();
> local_irq_save(flags);
Then the thread could be preempted and rescheduled on a different cpu
between put_cpu and local_irq_save(), which means that we lose the
state information of the kmem_cache_cpu structure.
> Otherwise, it would be good to call
>
> preempt_check_resched();
>
> After each local_irq_restore() in this function.
We could do that but maybe the frequency of these checks would be too
high? When should the resched checks be used?
On Wed, 22 Aug 2007, Mathieu Desnoyers wrote:
> > Then the thread could be preempted and rescheduled on a different cpu
> > between put_cpu and local_irq_save(), which means that we lose the
> > state information of the kmem_cache_cpu structure.
> >
>
> Maybe I am misunderstanding something, but kmem_cache_cpu does not seem
> to be passed to __slab_free() at all, nor any data referenced by it. So
> why do we care about being preempted there ?
Right it is only useful for __slab_alloc. I just changed them both to look
the same. We could do it that way in __slab_free() to avoid the later
preempt_check_resched().
> > We could do that but maybe the frequency of these checks would be too
> > high? When should the resched checks be used?
>
> Since we are only doing this on the slow path, it does not hurt.
> preempt_check_resched() is embedded in preempt_enable() and has a very
> low impact (simple thread flag check in the standard case).
Ok then let's add it.
* Christoph Lameter ([email protected]) wrote:
> On Wed, 22 Aug 2007, Mathieu Desnoyers wrote:
>
> > * Christoph Lameter ([email protected]) wrote:
> > > void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
> > > @@ -1577,7 +1590,10 @@ static void __slab_free(struct kmem_cach
> > > {
> > > void *prior;
> > > void **object = (void *)x;
> > > + unsigned long flags;
> > >
> > > + local_irq_save(flags);
> > > + put_cpu_no_resched();
> >
> > Those two lines may skip a preempt_check.
>
> Yes we cannot execute something else here.
>
> > Could we change them to this instead ?
> >
> > put_cpu();
> > local_irq_save(flags);
>
> Then the thread could be preempted and rescheduled on a different cpu
> between put_cpu and local_irq_save() which means that we loose the
> state information of the kmem_cache_cpu structure.
>
Maybe I am misunderstanding something, but kmem_cache_cpu does not seem
to be passed to __slab_free() at all, nor any data referenced by it. So
why do we care about being preempted there ?
> > Otherwise, it would be good to call
> >
> > preempt_check_resched();
> >
> > After each local_irq_restore() in this function.
>
> We could do that but maybe the frequency of these checks would be too
> high? When should the resched checks be used?
Since we are only doing this on the slow path, it does not hurt.
preempt_check_resched() is embedded in preempt_enable() and has a very
low impact (simple thread flag check in the standard case).
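(For reference, with CONFIG_PREEMPT the definitions in include/linux/preempt.h
are roughly the following, quoted from memory; without CONFIG_PREEMPT both
collapse to near no-ops:)

#define preempt_check_resched() \
do { \
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
                preempt_schedule(); \
} while (0)

#define preempt_enable() \
do { \
        preempt_enable_no_resched(); \
        barrier(); \
        preempt_check_resched(); \
} while (0)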
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Ok so we need this.
Fix up preempt checks.
Signed-off-by: Christoph Lameter <[email protected]>
---
mm/slub.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2007-08-22 13:33:40.000000000 -0700
+++ linux-2.6/mm/slub.c 2007-08-22 13:35:31.000000000 -0700
@@ -1469,6 +1469,7 @@ load_freelist:
out:
slab_unlock(c->page);
local_irq_restore(flags);
+ preempt_check_resched();
if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
return object;
@@ -1512,6 +1513,7 @@ new_slab:
goto load_freelist;
}
local_irq_restore(flags);
+ preempt_check_resched();
return NULL;
debug:
object = c->page->freelist;
@@ -1592,8 +1594,8 @@ static void __slab_free(struct kmem_cach
void **object = (void *)x;
unsigned long flags;
+ put_cpu();
local_irq_save(flags);
- put_cpu_no_resched();
slab_lock(page);
if (unlikely(SlabDebug(page)))
On Tue, 2007-08-21 at 16:14 -0700, Christoph Lameter wrote:
> On Tue, 21 Aug 2007, Mathieu Desnoyers wrote:
>
> > - Changed smp_rmb() for barrier(). We are not interested in read order
> > across cpus, what we want is to be ordered wrt local interrupts only.
> > barrier() is much cheaper than a rmb().
>
> But this means a preempt disable is required. RT users do not want that.
> Without disabling preemption the process can be moved to another processor after c has been determined.
> That is why the smp_rmb() is there.
Likewise for disabling interrupts, we don't like that either. So
anything that requires cpu-pinning is preferably not done.
That said, we can suffer a preempt-off section if it's O(1) and only a
few hundred cycles.
The trouble with all this percpu data in slub is that it also requires
pinning to the cpu in much of the slow path, either that or what we've
been doing so far with slab, a lock per cpu, and just grab one of those
locks and stick to the data belonging to that lock, regardless of
whether we get migrated.
slab-rt has these locks for all allocations and they are a massive
bottleneck for quite a few workloads; getting a fast path allocation
without using these would be most welcome.
So, if the fast path can be done with a preempt off, it might be doable
to suffer the slow path with a per cpu lock like that.
* Pekka Enberg ([email protected]) wrote:
> Hi Mathieu,
>
> On 8/22/07, Mathieu Desnoyers <[email protected]> wrote:
> > Cons:
> > - Does not help code readability, i.e.:
> >
> > if (have_arch_cmpxchg())
> > preempt_disable();
> > else
> > local_irq_save(flags);
>
> Heh, that's an understatement, as now slub is starting to look a bit
> like slab... ;-) We need to hide that if-else maze into helper
> functions for sure.
>
Hrm, actually, I don't think such a have_arch_cmpxchg() macro will be
required at all because of the small performance hit disabling
preemption will have on the slow and fast paths. Let's compare, for each
of the slow path and fast path, what locking looks like on architectures
with and without local cmpxchg:
What we actually do here is:
fast path:
with local_cmpxchg:
preempt_disable()
preempt_enable()
without local_cmpxchg:
preempt_disable()
local_irq_save
local_irq_restore
preempt_enable()
(we therefore disable preemption _and_ disable interrupts for
nothing)
slow path:
both with and without local_cmpxchg():
preempt_disable()
preempt_enable()
local_irq_save()
local_irq_restore()
Therefore, we would add preempt disable/enable to the fast path of
architectures where local_cmpxchg is emulated with irqs off. But since
preempt disable/enable is just a counter increment/decrement with a
barrier() and a thread flag check, I doubt it would hurt performance
enough to justify the added complexity of disabling interrupts for the
whole fast path in this case.
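To make the comparison concrete, here is a minimal sketch of the fast path
under discussion (slub_fast_alloc() is a hypothetical helper and the
kmem_cache_cpu fields follow the code quoted in this thread; it is an
illustration, not the actual slub.c code):

/*
 * The generic fast path only brackets the freelist update with
 * preempt_disable()/preempt_enable().  On archs with a native cmpxchg_local
 * that is the whole cost; on archs that emulate cmpxchg_local with irq
 * disabling, the emulation adds the local_irq_save()/local_irq_restore()
 * pair discussed above.
 */
static void *slub_fast_alloc(struct kmem_cache_cpu *c)
{
	void **object;

	preempt_disable();
redo:
	object = c->freelist;
	if (unlikely(!object)) {
		preempt_enable();
		return NULL;		/* caller falls back to the slow path */
	}
	/* one instruction, or an irq-off emulation, depending on the arch */
	if (cmpxchg_local(&c->freelist, object, object[c->offset]) != object)
		goto redo;
	preempt_enable();
	return object;
}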
Mathieu
> On 8/22/07, Mathieu Desnoyers <[email protected]> wrote:
> > --- slab.orig/mm/slub.c 2007-08-22 10:28:22.000000000 -0400
> > +++ slab/mm/slub.c 2007-08-22 10:51:53.000000000 -0400
> > @@ -1450,7 +1450,8 @@ static void *__slab_alloc(struct kmem_ca
> > void **freelist = NULL;
> > unsigned long flags;
> >
> > - local_irq_save(flags);
> > + if (have_arch_cmpxchg())
> > + local_irq_save(flags);
>
> I haven't been following on the cmpxchg_local() discussion too much so
> the obvious question is: why do we do local_irq_save() for the _has_
> cmpxchg() case here...
>
> On 8/22/07, Mathieu Desnoyers <[email protected]> wrote:
> > @@ -1553,8 +1556,12 @@ static void __always_inline *slab_alloc(
> > {
> > void **object;
> > struct kmem_cache_cpu *c;
> > + unsigned long flags;
> >
> > - preempt_disable();
> > + if (have_arch_cmpxchg())
> > + preempt_disable();
> > + else
> > + local_irq_save(flags);
>
> ...and preempt_disable() here?
>
> On 8/22/07, Mathieu Desnoyers <[email protected]> wrote:
> > + if (have_arch_cmpxchg()) {
> > + if (unlikely(cmpxchg_local(&c->freelist, object,
> > + object[c->offset]) != object))
> > + goto redo;
> > + preempt_enable();
> > + } else {
> > + c->freelist = object[c->offset];
> > + local_irq_restore(flags);
> > + }
>
> If you move the preempt_enable/local_irq_restore pair outside of the
> if-else block, you can make a static inline function slob_get_object()
> that does:
>
> static inline bool slob_get_object(struct kmem_cache_cpu *c, void **object)
> {
> if (have_arch_cmpxchg()) {
> if (unlikely(cmpxchg_local(&c->freelist, object,
> object[c->offset]) != object))
> return false;
> } else {
> c->freelist = object[c->offset];
> }
> return true;
> }
>
> And let the compiler optimize out the branch for the non-cmpxchg case.
> Same for the reverse case too (i.e. slob_put_object).
>
> Pekka
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Mon, 27 Aug 2007, Peter Zijlstra wrote:
> So, if the fast path can be done with a preempt off, it might be doable
> to suffer the slow path with a per cpu lock like that.
Sadly the cmpxchg_local requires local per cpu data access. Isn't there
some way to make this less expensive on RT? Accessing cpu local memory is
really good for performance on NUMA since the data is optimally placed and
one can avoid/reduce locking if the process stays tied to the processor.
On Mon, 27 Aug 2007, Mathieu Desnoyers wrote:
> Hrm, actually, I don't think such a have_arch_cmpxchg() macro will be
> required at all because of the small performance hit disabling
> preemption will have on the slow and fast paths. Let's compare, for each
> of the slow path and fast path, what locking looks like on architectures
> with and without local cmpxchg:
>
> What we actually do here is:
>
> fast path:
> with local_cmpxchg:
> preempt_disable()
> preempt_enable()
> without local_cmpxchg:
> preempt_disable()
> local_irq_save
> local_irq_restore
> preempt_enable()
> (we therefore disable preemption _and_ disable interrupts for
> nothing)
Hmmm..... This is a performance issue for preempt kernels.
> slow path:
> both with and without local_cmpxchg():
> preempt_disable()
> preempt_enable()
Here we potentially lose our per cpu structure since the process may be
rescheduled.
> local_irq_save()
> local_irq_restore()
>
>
> Therefore, we would add preempt disable/enable to the fast path of
> architectures where local_cmpxchg is emulated with irqs off. But since
> preempt disable/enable is just a counter increment/decrement with a
> barrier() and a thread flag check, I doubt it would hurt performance
> enough to justify the added complexity of disabling interrupts for the
> whole fast path in this case.
One additional cacheline referenced in the hot path. Ok the counter may be
hot as well if this is a preemptible kernel. Nevertheless a cause for
concern.
* Christoph Lameter ([email protected]) wrote:
> On Mon, 27 Aug 2007, Peter Zijlstra wrote:
>
> > So, if the fast path can be done with a preempt off, it might be doable
> > to suffer the slow path with a per cpu lock like that.
>
> Sadly the cmpxchg_local requires local per cpu data access. Isn't there
> some way to make this less expensive on RT? Accessing cpu local memory is
> really good for performance on NUMA since the data is optimally placed and
> one can avoid/reduce locking if the process stays tied to the processor.
>
On the slow path, in new_slab, we already have to reenable interrupts
because we can sleep. If we make sure that whenever we return to an
irq-disabled code path we take the current per-cpu data structure again, can
we make the preempt-disable/irq-disabled code paths O(1)?
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Christoph Lameter ([email protected]) wrote:
> On Mon, 27 Aug 2007, Mathieu Desnoyers wrote:
>
> > Hrm, actually, I don't think such a have_arch_cmpxchg() macro will be
> > required at all because of the small performance hit disabling
> > preemption will have on the slow and fast paths. Let's compare, for each
> > of the slow path and fast path, what locking looks like on architectures
> > with and without local cmpxchg:
> >
> > What we actually do here is:
> >
> > fast path:
> > with local_cmpxchg:
> > preempt_disable()
> > preempt_enable()
> > without local_cmpxchg:
> > preempt_disable()
> > local_irq_save
> > local_irq_restore
> > preempt_enable()
> > (we therefore disable preemption _and_ disable interrupts for
> > nothing)
>
> Hmmm..... This is a performance issue for preempt kernels.
>
> > slow path:
> > both with and without local_cmpxchg():
> > preempt_disable()
> > preempt_enable()
>
> Here we potentially lose our per cpu structure since the process may be
> rescheduled.
>
Yes, what you do here is more precisely:
preempt_disable()
local_irq_save()
preempt_enable_no_resched()
...
local_irq_restore()
preempt_check_resched()
> > local_irq_save()
> > local_irq_restore()
> >
> >
> > Therefore, we would add preempt disable/enable to the fast path of
> > architectures where local_cmpxchg is emulated with irqs off. But since
> > preempt disable/enable is just a counter increment/decrement with a
> > barrier() and a thread flag check, I doubt it would hurt performance
> > enough to justify the added complexity of disabling interrupts for the
> > whole fast path in this case.
>
> One additional cacheline referenced in the hot path. Ok the counter may be
> hot as well if this is a preemptible kernel. Nevertheless a cause for
> concern.
>
If the kernel is not preemptible, the preempt_*() macros are defined to
nothing. And as you say, on preemptible kernels, these variables (thread
flags and preempt count) are very likely to be in cache, since they live
in the thread_info struct, but it should still be kept in mind.
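For reference, this is roughly what those macros expand to in a 2.6-era
include/linux/preempt.h (paraphrased; the exact code may differ between
trees):

#ifdef CONFIG_PREEMPT
#define preempt_disable()					\
do {								\
	inc_preempt_count();	/* counter in thread_info */	\
	barrier();						\
} while (0)

#define preempt_enable()					\
do {								\
	preempt_enable_no_resched();	/* barrier + decrement */	\
	barrier();						\
	preempt_check_resched();	/* TIF_NEED_RESCHED test */	\
} while (0)
#else
/* non-preemptible kernels: the macros compile to nothing */
#define preempt_disable()	do { } while (0)
#define preempt_enable()	do { } while (0)
#endif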
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Mon, 27 Aug 2007, Mathieu Desnoyers wrote:
> * Christoph Lameter ([email protected]) wrote:
> > On Mon, 27 Aug 2007, Peter Zijlstra wrote:
> >
> > > So, if the fast path can be done with a preempt off, it might be doable
> > > to suffer the slow path with a per cpu lock like that.
> >
> > Sadly the cmpxchg_local requires local per cpu data access. Isn't there
> > some way to make this less expensive on RT? Accessing cpu local memory is
> > really good for performance on NUMA since the data is optimally placed and
> > one can avoid/reduce locking if the process stays tied to the processor.
> >
>
> On the slow path, in new_slab, we already have to reenable interrupts
> because we can sleep. If we make sure that whenever we return to an
> irq-disabled code path we take the current per-cpu data structure again, can
> we make the preempt-disable/irq-disabled code paths O(1)?
Not sure exactly what you are getting at?
This would mean running __alloc_pages tied to one processor even though
waiting is possible?
* Christoph Lameter ([email protected]) wrote:
> On Mon, 27 Aug 2007, Mathieu Desnoyers wrote:
>
> > * Christoph Lameter ([email protected]) wrote:
> > > On Mon, 27 Aug 2007, Peter Zijlstra wrote:
> > >
> > > > So, if the fast path can be done with a preempt off, it might be doable
> > > > to suffer the slow path with a per cpu lock like that.
> > >
> > > Sadly the cmpxchg_local requires local per cpu data access. Isn't there
> > > some way to make this less expensive on RT? Accessing cpu local memory is
> > > really good for performance on NUMA since the data is optimally placed and
> > > one can avoid/reduce locking if the process stays tied to the processor.
> > >
> >
> > On the slow path, in new_slab, we already have to reenable interrupts
> > because we can sleep. If we make sure that whenever we return to an
> > irq-disabled code path we take the current per-cpu data structure again, can
> > we make the preempt-disable/irq-disabled code paths O(1)?
>
> Not sure exactly what you are getting at?
> This would mean running __alloc_pages tied to one processor even though
> waiting is possible?
>
Not exactly. What I propose is:
- Running slab_alloc and slab_free fast paths in preempt_disable
context, using cmpxchg_local.
- Running slab_alloc and slab_free slow paths with irqs disabled.
- Running __alloc_pages in preemptible context, not tied to any CPU.
In this scheme, calling __alloc_pages from slab_alloc would reenable
interrupts and potentially migrate us to a different CPU. We would
therefore have to fetch our per-cpu data structure again once we get
back into irq-disabled code, because we may be running on a different
CPU. This is actually what the __slab_alloc slow path does:
new_slab:
new = get_partial(s, gfpflags, node);
if (new) {
c->page = new;
goto load_freelist;
}
new = new_slab(s, gfpflags, node);
----> within new_slab, we can reenable interrupts around the
__alloc_pages call.
if (new) {
c = get_cpu_slab(s, smp_processor_id());
if (c->page) {
/*
* Someone else populated the cpu_slab while we
* enabled interrupts, or we have gotten scheduled
* on another cpu. The page may not be on the
* requested node even if __GFP_THISNODE was
* specified. So we need to recheck.
*/
if (node_match(c, node)) {
/*
* Current cpuslab is acceptable and we
* want the current one since its cache hot
*/
discard_slab(s, new);
slab_lock(c->page);
goto load_freelist;
}
/* New slab does not fit our expectations */
flush_slab(s, c);
}
slab_lock(new);
SetSlabFrozen(new);
c->page = new;
goto load_freelist;
So the idea would be to split the code into O(1)
preempt-disabled/irq-disabled sections, to enable interrupts in between, and
to re-read the current per-cpu data structure when re-entering irq-disabled
code.
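A rough outline of that structure (slow_path_sketch() is hypothetical and
heavily elided; new_slab() and get_cpu_slab() are the helpers quoted above):

/*
 * Each irq-off region is O(1).  Interrupts are re-enabled around new_slab()
 * (which may call __alloc_pages and sleep), and the per-cpu structure is
 * re-read afterwards because the task may have migrated to another cpu.
 */
static void *slow_path_sketch(struct kmem_cache *s, gfp_t gfpflags, int node)
{
	struct kmem_cache_cpu *c;
	struct page *new;
	unsigned long flags;

	local_irq_save(flags);		/* O(1): cpu slab and partial lists */
	/* ... try c->page and the partial lists, return on success ... */
	local_irq_restore(flags);

	new = new_slab(s, gfpflags, node);	/* preemptible, may migrate */
	if (!new)
		return NULL;

	local_irq_save(flags);		/* O(1): install the new slab */
	c = get_cpu_slab(s, smp_processor_id());	/* cpu may have changed */
	/* ... attach new to c->page, or discard it, as in the code above ... */
	local_irq_restore(flags);
	return NULL;
}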
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
I think the simplest solution may be to leave slub as done in the patch
that we developed last week. The arch must provide a cmpxchg_local that is
performance-wise the fastest possible. On x86 this is going to be the
cmpxchg_local; on others, where cmpxchg is slower than interrupt
disable/enable, this is going to be the emulation that does
interrupt disable
cmpxchg simulation
interrupt enable
If we can establish that this is not a performance regression then we have
a clean solution source code wise. It also minimizes the interrupt holdoff
for the non-cmpxchg_local arches. However, it means that we will have to
disable interrupts twice for the slow path. If that is too expensive then
we need a different solution.
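For the archs without a native cmpxchg_local, the emulation described above
boils down to something like this sketch (cmpxchg_local_emulated() is a
made-up name used only for illustration):

/*
 * Irq-off emulation of cmpxchg_local: atomic with respect to the local cpu
 * (interrupts), but not with respect to other cpus or NMIs.
 */
static inline unsigned long cmpxchg_local_emulated(volatile unsigned long *ptr,
						   unsigned long old,
						   unsigned long new)
{
	unsigned long flags, prev;

	local_irq_save(flags);		/* interrupt disable */
	prev = *ptr;			/* cmpxchg simulation */
	if (prev == old)
		*ptr = new;
	local_irq_restore(flags);	/* interrupt enable */
	return prev;
}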
* Christoph Lameter ([email protected]) wrote:
> I think the simplest solution may be to leave slub as done in the patch
> that we developed last week. The arch must provide a cmpxchg_local that is
> performance-wise the fastest possible. On x86 this is going to be the
> cmpxchg_local; on others, where cmpxchg is slower than interrupt
> disable/enable, this is going to be the emulation that does
>
> interrupt disable
>
> cmpxchg simulation
>
> interrupt enable
>
>
> If we can establish that this is not a performance regression then we have
> a clean solution source code wise. It also minimizes the interrupt holdoff
> for the non-cmpxchg_local arches. However, it means that we will have to
> disable interrupts twice for the slow path. If that is too expensive then
> we need a different solution.
>
cmpxchg_local is not used on the slow path...?
Did you mean:
it means that we will have to disable preemption _and_ interrupts on the
fast path for non-cmpxchg_local arches?
Or am I maybe thinking about the wrong code snippet there?
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Mon, 27 Aug 2007, Mathieu Desnoyers wrote:
> > a clean solution source code wise. It also minimizes the interrupt holdoff
> > for the non-cmpxchg_local arches. However, it means that we will have to
> > disable interrupts twice for the slow path. If that is too expensive then
> > we need a different solution.
> >
>
> cmpxchg_local is not used on the slow path...?
Right.
> Did you mean:
>
> it means that we will have to disable preemption _and_ interrupts on the
> fast path for non-cmpxchg_local arches?
We would have to disable preemption and interrupts once on the fast path.
The interrupt holdoff would just be a couple of instructions.
The slow path would require disabling preemption and two interrupt disables.
The question is whether this makes sense performance-wise. If not then we may
have to look at more complicated schemes.
* Christoph Lameter ([email protected]) wrote:
> On Mon, 27 Aug 2007, Mathieu Desnoyers wrote:
>
> > > a clean solution source code wise. It also minimizes the interrupt holdoff
> > > for the non-cmpxchg_local arches. However, it means that we will have to
> > > disable interrupts twice for the slow path. If that is too expensive then
> > > we need a different solution.
> > >
> >
> > cmpxchg_local is not used on the slow path... ?
>
> Right.
>
> > Did you mean:
> >
> > it means that we will have to disable preemption _and_ interrupts on the
> > fast path for non-cmpxchg_local arches ?
>
> We would have to disable preemption and interrupts once on the fast path.
> The interrupt holdoff would just be a couple of instructions.
>
Right.
> The slow path would require disabling preemption and two interrupt disables.
>
If the slow path has to call new_slab, then yes. But it seems that not
every slow path must call it, so for the other slow paths, only one
interrupt disable would be required.
> Question is if this makes sense performance wise. If not then we may have
> to look at more complicated schemes.
>
Yep, such as the arch_have_cmpxchg() macro that I proposed, but it
really hurts my eyes... :(
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Mon, 27 Aug 2007, Mathieu Desnoyers wrote:
> > The slow path would require disabling preemption and two interrupt disables.
> If the slow path has to call new_slab, then yes. But it seems that not
> every slow path must call it, so for the other slow paths, only one
> interrupt disable would be required.
If we include new_slab then we get to 3 interrupt disables:
1. In the cmpxchg_local emulation that fails
2. For the slow path
3. When calling the page allocator.
Hmmmm. One wild idea would be to use a priority futex for the slab lock?
That would make the slow paths interrupt safe without requiring interrupt
disable? Does a futex fit into the page struct?
* Christoph Lameter ([email protected]) wrote:
> On Mon, 27 Aug 2007, Mathieu Desnoyers wrote:
>
> > > The slow path would require disabling preemption and two interrupt disables.
> > If the slow path has to call new_slab, then yes. But it seems that not
> > every slow path must call it, so for the other slow paths, only one
> > interrupt disable would be required.
>
> If we include new_slab then we get to 3 interrupt disables:
>
> 1. In the cmpxchg_local emulation that fails
>
> 2. For the slow path
>
> 3. When calling the page allocator.
>
Hrm, I just want to verify one thing: a lot of code paths seem to go
to the slow path without requiring cmpxchg_local to execute at all. So
is the slow path more likely to be triggered by the (!object) and
(!node_match) tests, or by these same tests done in the redo after the
initial cmpxchg_local?
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Mon, 27 Aug 2007, Mathieu Desnoyers wrote:
> Hrm, I just want to verify one thing: a lot of code paths seem to go
> to the slow path without requiring cmpxchg_local to execute at all. So
> is the slow path more likely to be triggered by the (!object) and
> (!node_match) tests, or by these same tests done in the redo after the
> initial cmpxchg_local?
The slow path is more likely to be triggered by settings in the per cpu
structure. The cmpxchg failure is comparatively rare. So the worst case
is getting worse but the average use of interrupt enable/disable may not
change much. Need to have some measurements to confirm that. I can try to
run the emulation on IA64 and see what the result will be.
Measurements on IA64 slub w/per cpu vs slub w/per cpu/cmpxchg_local
emulation. Results are not good:
slub/per cpu
10000 times kmalloc(8)/kfree -> 105 cycles
10000 times kmalloc(16)/kfree -> 104 cycles
10000 times kmalloc(32)/kfree -> 105 cycles
10000 times kmalloc(64)/kfree -> 104 cycles
10000 times kmalloc(128)/kfree -> 104 cycles
10000 times kmalloc(256)/kfree -> 115 cycles
10000 times kmalloc(512)/kfree -> 116 cycles
10000 times kmalloc(1024)/kfree -> 115 cycles
10000 times kmalloc(2048)/kfree -> 115 cycles
10000 times kmalloc(4096)/kfree -> 115 cycles
10000 times kmalloc(8192)/kfree -> 117 cycles
10000 times kmalloc(16384)/kfree -> 439 cycles
10000 times kmalloc(32768)/kfree -> 800 cycles
slub/per cpu + cmpxchg_local emulation
10000 times kmalloc(8)/kfree -> 143 cycles
10000 times kmalloc(16)/kfree -> 143 cycles
10000 times kmalloc(32)/kfree -> 143 cycles
10000 times kmalloc(64)/kfree -> 143 cycles
10000 times kmalloc(128)/kfree -> 143 cycles
10000 times kmalloc(256)/kfree -> 154 cycles
10000 times kmalloc(512)/kfree -> 154 cycles
10000 times kmalloc(1024)/kfree -> 154 cycles
10000 times kmalloc(2048)/kfree -> 154 cycles
10000 times kmalloc(4096)/kfree -> 155 cycles
10000 times kmalloc(8192)/kfree -> 155 cycles
10000 times kmalloc(16384)/kfree -> 440 cycles
10000 times kmalloc(32768)/kfree -> 819 cycles
10000 times kmalloc(65536)/kfree -> 902 cycles
Parallel allocs:
Kmalloc N*alloc N*free(16): 0=102/136 1=97/136 2=99/140 3=98/140 4=100/138
5=99/139 6=100/139 7=101/141 Average=99/139
cmpxchg_local emulation
Kmalloc N*alloc N*free(16): 0=116/147 1=116/145 2=115/151 3=115/147
4=115/149 5=117/147 6=116/148 7=116/146 Average=116/147
Patch used:
Index: linux-2.6/include/asm-ia64/atomic.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/atomic.h 2007-08-27 16:42:02.000000000 -0700
+++ linux-2.6/include/asm-ia64/atomic.h 2007-08-27 17:50:24.000000000 -0700
@@ -223,4 +223,17 @@ atomic64_add_negative (__s64 i, atomic64
#define smp_mb__after_atomic_inc() barrier()
#include <asm-generic/atomic.h>
+
+static inline void *cmpxchg_local(void **p, void *old, void *new)
+{
+ unsigned long flags;
+ void *before;
+
+ local_irq_save(flags);
+ before = *p;
+ if (likely(before == old))
+ *p = new;
+ local_irq_restore(flags);
+ return before;
+}
#endif /* _ASM_IA64_ATOMIC_H */
kmem_cache_alloc before
0000000000008900 <kmem_cache_alloc>:
8900: 01 28 31 0e 80 05 [MII] alloc r37=ar.pfs,12,7,0
8906: 40 02 00 62 00 00 mov r36=b0
890c: 00 00 04 00 nop.i 0x0;;
8910: 0b 18 01 00 25 04 [MMI] mov r35=psr;;
8916: 00 00 04 0e 00 00 rsm 0x4000
891c: 00 00 04 00 nop.i 0x0;;
8920: 08 50 90 1b 19 21 [MMI] adds r10=3300,r13
8926: 70 02 80 00 42 40 mov r39=r32
892c: 05 00 c4 00 mov r42=b0
8930: 09 40 01 42 00 21 [MMI] mov r40=r33
8936: 00 00 00 02 00 20 nop.m 0x0
893c: f5 e7 ff 9f mov r41=-1;;
8940: 0b 48 00 14 10 10 [MMI] ld4 r9=[r10];;
8946: 00 00 00 02 00 00 nop.m 0x0
894c: 01 48 58 00 sxt4 r8=r9;;
8950: 0b 18 20 40 12 20 [MMI] shladd r3=r8,3,r32;;
8956: 20 80 0f 82 48 00 addl r2=8432,r3
895c: 00 00 04 00 nop.i 0x0;;
8960: 0a 00 01 04 18 10 [MMI] ld8 r32=[r2];;
8966: e0 a0 80 00 42 60 adds r14=20,r32
896c: 05 00 01 84 mov r43=r32
8970: 0b 10 01 40 18 10 [MMI] ld8 r34=[r32];;
8976: 70 00 88 0c 72 00 cmp.eq p7,p6=0,r34
897c: 00 00 04 00 nop.i 0x0;;
8980: cb 70 00 1c 10 90 [MMI] (p06) ld4 r14=[r14];;
8986: e1 70 88 24 40 00 (p06) shladd r14=r14,3,r34
898c: 00 00 04 00 nop.i 0x0;;
8990: c2 70 00 1c 18 10 [MII] (p06) ld8 r14=[r14]
8996: 00 00 00 02 00 00 nop.i 0x0;;
899c: 00 00 04 00 nop.i 0x0
89a0: d8 00 38 40 98 11 [MMB] (p06) st8 [r32]=r14
89a6: 00 00 00 02 00 03 nop.m 0x0
89ac: 30 00 00 40 (p06) br.cond.sptk.few 89d0 <kmem_cache_alloc+0xd0>
89b0: 11 00 00 00 01 00 [MIB] nop.m 0x0
89b6: 00 00 00 02 00 00 nop.i 0x0
89bc: 18 d8 ff 58 br.call.sptk.many b0=61c0 <__slab_alloc>;;
89c0: 08 10 01 10 00 21 [MMI] mov r34=r8
89c6: 00 00 00 02 00 00 nop.m 0x0
89cc: 00 00 04 00 nop.i 0x0
89d0: 03 00 00 00 01 00 [MII] nop.m 0x0
89d6: 20 01 00 00 49 20 mov r18=16384;;
89dc: 22 19 31 80 and r17=r18,r35;;
89e0: 0a 38 44 00 06 b8 [MMI] cmp.eq p7,p6=r17,r0;;
89e6: 01 00 04 0c 00 00 (p06) ssm 0x4000
89ec: 00 00 04 00 nop.i 0x0
89f0: eb 00 00 02 07 80 [MMI] (p07) rsm 0x4000;;
89f6: 01 00 00 60 00 00 (p06) srlz.d
89fc: 00 00 04 00 nop.i 0x0;;
8a00: 08 00 00 00 01 00 [MMI] nop.m 0x0
8a06: b0 00 88 14 72 e0 cmp.eq p11,p10=0,r34
8a0c: e1 09 01 52 extr.u r15=r33,15,1
8a10: 09 58 60 40 00 21 [MMI] adds r11=24,r32
8a16: 70 02 88 00 42 00 mov r39=r34
8a1c: 05 00 00 84 mov r40=r0;;
8a20: 42 71 04 00 00 e4 [MII] (p10) mov r14=1
8a26: e2 00 00 00 42 00 (p11) mov r14=r0;;
8a2c: 00 00 04 00 nop.i 0x0
8a30: 0b 80 3c 1c 0c 20 [MMI] and r16=r15,r14;;
8a36: 80 00 40 12 73 00 cmp4.eq p8,p9=0,r16
8a3c: 00 00 04 00 nop.i 0x0;;
8a40: 31 49 01 16 10 10 [MIB] (p09) ld4 r41=[r11]
8a46: 00 00 00 02 80 04 nop.i 0x0
8a4c: 08 00 00 51 (p09) br.call.spnt.many b0=8a40 <kmem_cache_alloc+0x140>;;
8a50: 08 00 00 00 01 00 [MMI] nop.m 0x0
8a56: 80 00 88 00 42 00 mov r8=r34
8a5c: 40 0a 00 07 mov b0=r36
8a60: 11 00 00 00 01 00 [MIB] nop.m 0x0
8a66: 00 28 01 55 00 80 mov.i ar.pfs=r37
8a6c: 08 00 84 00 br.ret.sptk.many b0;;
8a70: 08 00 00 00 01 00 [MMI] nop.m 0x0
8a76: 00 00 00 02 00 00 nop.m 0x0
8a7c: 00 00 04 00 nop.i 0x0
kmem_cache_alloc with cmpxchg emulation:
0000000000008da0 <kmem_cache_alloc>:
8da0: 09 28 31 0e 80 05 [MMI] alloc r37=ar.pfs,12,7,0
8da6: a0 80 36 32 42 80 adds r10=3280,r13
8dac: 04 00 c4 00 mov r36=b0;;
8db0: 02 00 00 00 01 00 [MII] nop.m 0x0
8db6: 00 41 29 00 42 00 adds r16=40,r10;;
8dbc: 00 00 04 00 nop.i 0x0
8dc0: 0a 58 00 20 10 10 [MMI] ld4 r11=[r16];;
8dc6: e0 08 2c 00 42 00 adds r14=1,r11
8dcc: 00 00 04 00 nop.i 0x0
8dd0: 0b 00 00 00 01 00 [MMI] nop.m 0x0;;
8dd6: 00 70 40 20 23 00 st4 [r16]=r14
8ddc: 00 00 04 00 nop.i 0x0;;
8de0: 09 78 50 14 00 21 [MMI] adds r15=20,r10
8de6: 00 00 00 02 00 40 nop.m 0x0
8dec: 02 00 00 92 mov r18=16384;;
8df0: 0b 48 00 1e 10 10 [MMI] ld4 r9=[r15];;
8df6: 00 00 00 02 00 00 nop.m 0x0
8dfc: 01 48 58 00 sxt4 r8=r9;;
8e00: 0b 18 20 40 12 20 [MMI] shladd r3=r8,3,r32;;
8e06: 20 80 0f 82 48 00 addl r2=8432,r3
8e0c: 00 00 04 00 nop.i 0x0;;
8e10: 02 10 01 04 18 10 [MII] ld8 r34=[r2]
8e16: 00 00 00 02 00 20 nop.i 0x0;;
8e1c: 42 11 01 84 adds r17=20,r34
8e20: 09 00 00 00 01 00 [MMI] nop.m 0x0
8e26: f0 00 88 30 20 00 ld8 r15=[r34]
8e2c: 00 00 04 00 nop.i 0x0;;
8e30: 10 00 00 00 01 00 [MIB] nop.m 0x0
8e36: 60 00 3c 0e 72 03 cmp.eq p6,p7=0,r15
8e3c: 20 01 00 41 (p06) br.cond.spnt.few 8f50 <kmem_cache_alloc+0x1b0>
8e40: 0a b8 00 22 10 10 [MMI] ld4 r23=[r17];;
8e46: 60 b9 3c 24 40 00 shladd r22=r23,3,r15
8e4c: 00 00 04 00 nop.i 0x0
8e50: 0b 00 00 00 01 00 [MMI] nop.m 0x0;;
8e56: 40 01 58 30 20 00 ld8 r20=[r22]
8e5c: 00 00 04 00 nop.i 0x0;;
8e60: 0b a8 00 00 25 04 [MMI] mov r21=psr;;
8e66: 00 00 04 0e 00 00 rsm 0x4000
8e6c: 00 00 04 00 nop.i 0x0;;
8e70: 02 18 01 44 18 10 [MII] ld8 r35=[r34]
8e76: 30 91 54 18 40 00 and r19=r18,r21;;
8e7c: 00 00 04 00 nop.i 0x0
8e80: 0b 48 3c 46 08 78 [MMI] cmp.eq p9,p8=r15,r35;;
8e86: 02 a0 88 30 23 00 (p09) st8 [r34]=r20
8e8c: 00 00 04 00 nop.i 0x0;;
8e90: 0a 38 4c 00 06 b8 [MMI] cmp.eq p7,p6=r19,r0;;
8e96: 01 00 04 0c 00 00 (p06) ssm 0x4000
8e9c: 00 00 04 00 nop.i 0x0
8ea0: eb 00 00 02 07 80 [MMI] (p07) rsm 0x4000;;
8ea6: 01 00 00 60 00 00 (p06) srlz.d
8eac: 00 00 04 00 nop.i 0x0;;
8eb0: 11 00 00 00 01 00 [MIB] nop.m 0x0
8eb6: 00 00 00 02 00 04 nop.i 0x0
8ebc: 70 ff ff 49 (p08) br.cond.spnt.few 8e20 <kmem_cache_alloc+0x80>;;
8ec0: 03 00 00 00 01 00 [MII] nop.m 0x0
8ec6: 80 81 36 32 42 20 adds r24=3280,r13;;
8ecc: 83 c2 00 84 adds r25=40,r24;;
8ed0: 0a d8 00 32 10 10 [MMI] ld4 r27=[r25];;
8ed6: a0 f9 6f 7e 46 00 adds r26=-1,r27
8edc: 00 00 04 00 nop.i 0x0
8ee0: 0b 00 00 00 01 00 [MMI] nop.m 0x0;;
8ee6: 00 d0 64 20 23 00 st4 [r25]=r26
8eec: 00 00 04 00 nop.i 0x0;;
8ef0: 0b 90 40 30 00 21 [MMI] adds r18=16,r24;;
8ef6: 10 01 48 60 21 00 ld4.acq r17=[r18]
8efc: 00 00 04 00 nop.i 0x0;;
8f00: 11 00 00 00 01 00 [MIB] nop.m 0x0
8f06: c0 10 44 1a a8 06 tbit.z p12,p13=r17,1
8f0c: 08 00 00 51 (p13) br.call.spnt.many b0=8f00 <kmem_cache_alloc+0x160>;;
8f10: 02 00 00 00 01 00 [MII] nop.m 0x0
8f16: a0 f0 84 16 a8 c5 tbit.z p10,p11=r33,15;;
8f1c: 81 11 01 84 (p11) adds r14=24,r34
8f20: 62 39 01 46 00 e1 [MII] (p11) mov r39=r35
8f26: 82 02 00 00 42 00 (p11) mov r40=r0;;
8f2c: 00 00 04 00 nop.i 0x0
8f30: 79 49 01 1c 10 10 [MMB] (p11) ld4 r41=[r14]
8f36: 00 00 00 02 80 05 nop.m 0x0
8f3c: 08 00 00 51 (p11) br.call.spnt.many b0=8f30 <kmem_cache_alloc+0x190>;;
8f40: 10 00 00 00 01 00 [MIB] nop.m 0x0
8f46: 80 00 8c 00 42 00 mov r8=r35
8f4c: 30 00 00 40 br.few 8f70 <kmem_cache_alloc+0x1d0>
8f50: 08 38 01 40 00 21 [MMI] mov r39=r32
8f56: 80 02 84 00 42 40 mov r40=r33
8f5c: 05 20 01 84 mov r42=r36
8f60: 19 58 01 44 00 21 [MMB] mov r43=r34
8f66: 90 fa f3 ff 4f 00 mov r41=-1
8f6c: 28 d3 ff 58 br.call.sptk.many b0=6280 <__slab_alloc>;;
8f70: 00 00 00 00 01 00 [MII] nop.m 0x0
8f76: 00 20 05 80 03 00 mov b0=r36
8f7c: 00 00 04 00 nop.i 0x0
8f80: 11 00 00 00 01 00 [MIB] nop.m 0x0
8f86: 00 28 01 55 00 80 mov.i ar.pfs=r37
8f8c: 08 00 84 00 br.ret.sptk.many b0;;
8f90: 08 00 00 00 01 00 [MMI] nop.m 0x0
8f96: 00 00 00 02 00 00 nop.m 0x0
8f9c: 00 00 04 00 nop.i 0x0
On Mon, 2007-08-27 at 15:15 -0700, Christoph Lameter wrote:
> Hmmmm. One wild idea would be to use a priority futex for the slab lock?
> That would make the slow paths interrupt safe without requiring interrupt
> disable? Does a futex fit into the page struct?
Very much puzzled at what you propose. In-kernel we use rt_mutex (has
PI) or mutex, futexes are user-space. (on -rt spinlock_t == mutex ==
rt_mutex)
Neither disables interrupts since they are sleeping locks.
That said, on -rt we do not need to disable interrupts in the allocators
because it's a bug to call an allocator from raw irq context.
Ok, I just had a look at the ia64 instruction set, and I fear that cmpxchg
must always come with acquire or release semantics. Is there any
cmpxchg equivalent on ia64 that would be free of acquire and release
semantics? This implicit memory ordering in the instruction seems to be
responsible for the slowdown.
If such a primitive does not exist, then we should think about an irq
disable fallback for this local atomic operation. However, I would
prefer to let the cmpxchg_local primitive be bound to the "slow"
cmpxchg_acq and create something like _cmpxchg_local that would be
interrupt-safe, but not reentrant wrt NMIs.
This way, cmpxchg_local users could choose either the fast flavor
(_cmpxchg_local: not necessarily atomic wrt NMIs) or the most atomic
flavor (cmpxchg_local) available on the architecture. If you think of a
better name, please tell me... it could also be: fast version (mostly
used): cmpxchg_local(); slow, fully reentrant version:
cmpxchg_local_nmi().
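On ia64 that split could look roughly like the following sketch (the names
follow the second naming suggested above and are hypothetical, not an
existing kernel API; cmpxchg_acq is the ia64 primitive already used for
cmpxchg):

/* Slow, fully reentrant flavor: keeps the architecture's atomic cmpxchg.acq. */
#define cmpxchg_local_nmi(ptr, o, n)	cmpxchg_acq((ptr), (o), (n))

/*
 * Fast flavor: only atomic with respect to local interrupts, avoiding the
 * implicit acquire ordering, but not reentrant wrt NMIs.
 */
static inline unsigned long cmpxchg_local(volatile unsigned long *ptr,
					  unsigned long old,
					  unsigned long new)
{
	unsigned long flags, prev;

	local_irq_save(flags);
	prev = *ptr;
	if (likely(prev == old))
		*ptr = new;
	local_irq_restore(flags);
	return prev;
}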
Mathieu
* Christoph Lameter ([email protected]) wrote:
> Measurements on IA64 slub w/per cpu vs slub w/per cpu/cmpxchg_local
> emulation. Results are not good:
>
> slub/per cpu
> 10000 times kmalloc(8)/kfree -> 105 cycles
> 10000 times kmalloc(16)/kfree -> 104 cycles
> 10000 times kmalloc(32)/kfree -> 105 cycles
> 10000 times kmalloc(64)/kfree -> 104 cycles
> 10000 times kmalloc(128)/kfree -> 104 cycles
> 10000 times kmalloc(256)/kfree -> 115 cycles
> 10000 times kmalloc(512)/kfree -> 116 cycles
> 10000 times kmalloc(1024)/kfree -> 115 cycles
> 10000 times kmalloc(2048)/kfree -> 115 cycles
> 10000 times kmalloc(4096)/kfree -> 115 cycles
> 10000 times kmalloc(8192)/kfree -> 117 cycles
> 10000 times kmalloc(16384)/kfree -> 439 cycles
> 10000 times kmalloc(32768)/kfree -> 800 cycles
>
>
> slub/per cpu + cmpxchg_local emulation
> 10000 times kmalloc(8)/kfree -> 143 cycles
> 10000 times kmalloc(16)/kfree -> 143 cycles
> 10000 times kmalloc(32)/kfree -> 143 cycles
> 10000 times kmalloc(64)/kfree -> 143 cycles
> 10000 times kmalloc(128)/kfree -> 143 cycles
> 10000 times kmalloc(256)/kfree -> 154 cycles
> 10000 times kmalloc(512)/kfree -> 154 cycles
> 10000 times kmalloc(1024)/kfree -> 154 cycles
> 10000 times kmalloc(2048)/kfree -> 154 cycles
> 10000 times kmalloc(4096)/kfree -> 155 cycles
> 10000 times kmalloc(8192)/kfree -> 155 cycles
> 10000 times kmalloc(16384)/kfree -> 440 cycles
> 10000 times kmalloc(32768)/kfree -> 819 cycles
> 10000 times kmalloc(65536)/kfree -> 902 cycles
>
>
> Parallel allocs:
>
> Kmalloc N*alloc N*free(16): 0=102/136 1=97/136 2=99/140 3=98/140 4=100/138
> 5=99/139 6=100/139 7=101/141 Average=99/139
>
> cmpxchg_local emulation
> Kmalloc N*alloc N*free(16): 0=116/147 1=116/145 2=115/151 3=115/147
> 4=115/149 5=117/147 6=116/148 7=116/146 Average=116/147
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, 28 Aug 2007, Peter Zijlstra wrote:
> On Mon, 2007-08-27 at 15:15 -0700, Christoph Lameter wrote:
> > Hmmmm. One wild idea would be to use a priority futex for the slab lock?
> > That would make the slow paths interrupt safe without requiring interrupt
> > disable? Does a futex fit into the page struct?
>
> Very much puzzled at what you propose. In-kernel we use rt_mutex (has
> PI) or mutex, futexes are user-space. (on -rt spinlock_t == mutex ==
> rt_mutex)
>
> Neither disables interrupts since they are sleeping locks.
>
> That said, on -rt we do not need to disable interrupts in the allocators
> because it's a bug to call an allocator from raw irq context.
Right, so if a priority futex would have been taken from a process
context and then an interrupt thread (or so, no idea about RT) is scheduled,
then the interrupt thread could switch to the process context and complete
the work there before doing the "interrupt" work. So disabling interrupts
is no longer necessary.
On Tue, 28 Aug 2007, Mathieu Desnoyers wrote:
> Ok, I just had a look at the ia64 instruction set, and I fear that cmpxchg
> must always come with acquire or release semantics. Is there any
> cmpxchg equivalent on ia64 that would be free of acquire and release
> semantics? This implicit memory ordering in the instruction seems to be
> responsible for the slowdown.
No. There is no cmpxchg used in the patches that I tested. The slowdown
seems to come from the need to serialize at barriers. Adding an interrupt
enable/disable in the middle of the hot path creates another serialization
point.
> If such a primitive does not exist, then we should think about an irq
> disable fallback for this local atomic operation. However, I would
> prefer to let the cmpxchg_local primitive be bound to the "slow"
> cmpxchg_acq and create something like _cmpxchg_local that would be
> interrupt-safe, but not reentrant wrt NMIs.
Ummm... That is what I did. See the included patch that you quoted. The
measurements show that such a fallback does not preserve the performance
on IA64.
On Tue, 2007-08-28 at 12:36 -0700, Christoph Lameter wrote:
> On Tue, 28 Aug 2007, Peter Zijlstra wrote:
>
> > On Mon, 2007-08-27 at 15:15 -0700, Christoph Lameter wrote:
> > > Hmmmm. One wild idea would be to use a priority futex for the slab lock?
> > > That would make the slow paths interrupt safe without requiring interrupt
> > > disable? Does a futex fit into the page struct?
> >
> > Very much puzzled at what you propose. In-kernel we use rt_mutex (has
> > PI) or mutex, futexes are user-space. (on -rt spinlock_t == mutex ==
> > rt_mutex)
> >
> > Neither disables interrupts since they are sleeping locks.
> >
> > That said, on -rt we do not need to disable interrupts in the allocators
> > because it's a bug to call an allocator from raw irq context.
>
> Right, so if a priority futex
futex stands for Fast Userspace muTEX; please let's call it an rt_mutex.
> would have been taken from a process
> context and then an interrupt thread (or so no idea about RT) is scheduled
> then the interrupt thread could switch to the process context and complete
> the work there before doing the "interrupt" work. So disabling interrupts
> is no longer necessary.
-rt runs all of the irq handlers in thread (process) context; the hard
irq handler just does something akin to a wakeup.
These irq threads typically run at fifo/50 or something like that.
[ note that this allows a form of irq prioritisation even if the hardware
doesn't. ]
* Christoph Lameter ([email protected]) wrote:
> Measurements on IA64 slub w/per cpu vs slub w/per cpu/cmpxchg_local
> emulation. Results are not good:
>
Hi Christoph,
I tried to come up with a patch set implementing the basics of a new
critical section: local_enter(flags) and local_exit(flags).
Can you try those on ia64 and tell me if the results are better?
See the next two posts...
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
slub - Use local_t protection
Use local_enter/local_exit for protection in the fast path.
Depends on the cmpxchg_local slub patch.
Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: Christoph Lameter <[email protected]>
---
mm/slub.c | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
Index: linux-2.6-lttng/mm/slub.c
===================================================================
--- linux-2.6-lttng.orig/mm/slub.c 2007-09-04 15:47:20.000000000 -0400
+++ linux-2.6-lttng/mm/slub.c 2007-09-04 15:52:07.000000000 -0400
@@ -1456,7 +1456,6 @@ static void *__slab_alloc(struct kmem_ca
unsigned long flags;
local_irq_save(flags);
- put_cpu_no_resched();
if (!c->page)
/* Slab was flushed */
goto new_slab;
@@ -1480,7 +1479,6 @@ load_freelist:
out:
slab_unlock(c->page);
local_irq_restore(flags);
- preempt_check_resched();
if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
return object;
@@ -1524,7 +1522,6 @@ new_slab:
goto load_freelist;
}
local_irq_restore(flags);
- preempt_check_resched();
return NULL;
debug:
object = c->page->freelist;
@@ -1552,8 +1549,10 @@ static void __always_inline *slab_alloc(
{
void **object;
struct kmem_cache_cpu *c;
+ unsigned long flags;
- c = get_cpu_slab(s, get_cpu());
+ local_enter(flags);
+ c = get_cpu_slab(s, smp_processor_id());
redo:
object = c->freelist;
if (unlikely(!object))
@@ -1566,12 +1565,13 @@ redo:
object[c->offset]) != object))
goto redo;
- put_cpu();
+ local_exit(flags);
if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
return object;
slow:
+ local_exit(flags);
return __slab_alloc(s, gfpflags, node, addr, c);
}
@@ -1605,7 +1605,6 @@ static void __slab_free(struct kmem_cach
void **object = (void *)x;
unsigned long flags;
- put_cpu();
local_irq_save(flags);
slab_lock(page);
@@ -1670,10 +1669,12 @@ static void __always_inline slab_free(st
void **object = (void *)x;
void **freelist;
struct kmem_cache_cpu *c;
+ unsigned long flags;
debug_check_no_locks_freed(object, s->objsize);
- c = get_cpu_slab(s, get_cpu());
+ local_enter(flags);
+ c = get_cpu_slab(s, smp_processor_id());
if (unlikely(c->node < 0))
goto slow;
redo:
@@ -1687,9 +1688,10 @@ redo:
!= freelist))
goto redo;
- put_cpu();
+ local_exit(flags);
return;
slow:
+ local_exit(flags);
__slab_free(s, page, x, addr, c->offset);
}
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
local_t protection (critical section)
Adds local_enter(flags) and local_exit(flags) as primitives to surround critical
sections using local_t types.
On architectures providing fast atomic primitives, this turns into a preempt
disable/enable().
However, on architectures not providing such fast primitives, such as ia64, it
turns into a local irq disable/enable so that we can use *_local primitives that
are non atomic.
This is only the primary work here: made for testing ia64 with cmpxchg_local
(other local_* primitives still use atomic_long_t operations as fallback).
Signed-off-by: Mathieu Desnoyers <[email protected]>
Christoph Lameter <[email protected]>
---
include/asm-generic/local.h | 3 +++
include/asm-i386/local.h | 3 +++
include/asm-ia64/intrinsics.h | 14 ++++++++++++--
3 files changed, 18 insertions(+), 2 deletions(-)
Index: linux-2.6-lttng/include/asm-generic/local.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-generic/local.h 2007-09-04 15:32:02.000000000 -0400
+++ linux-2.6-lttng/include/asm-generic/local.h 2007-09-04 15:36:41.000000000 -0400
@@ -46,6 +46,9 @@ typedef struct
#define local_add_unless(l, a, u) atomic_long_add_unless((&(l)->a), (a), (u))
#define local_inc_not_zero(l) atomic_long_inc_not_zero(&(l)->a)
+#define local_enter(flags) local_irq_save(flags)
+#define local_exit(flags) local_irq_restore(flags)
+
/* Non-atomic variants, ie. preemption disabled and won't be touched
* in interrupt, etc. Some archs can optimize this case well. */
#define __local_inc(l) local_set((l), local_read(l) + 1)
Index: linux-2.6-lttng/include/asm-i386/local.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-i386/local.h 2007-09-04 15:28:52.000000000 -0400
+++ linux-2.6-lttng/include/asm-i386/local.h 2007-09-04 15:31:54.000000000 -0400
@@ -194,6 +194,9 @@ static __inline__ long local_sub_return(
})
#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
+#define local_enter(flags) preempt_disable()
+#define local_exit(flags) preempt_enable()
+
/* On x86, these are no better than the atomic variants. */
#define __local_inc(l) local_inc(l)
#define __local_dec(l) local_dec(l)
Index: linux-2.6-lttng/include/asm-ia64/intrinsics.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-ia64/intrinsics.h 2007-09-04 15:47:24.000000000 -0400
+++ linux-2.6-lttng/include/asm-ia64/intrinsics.h 2007-09-04 15:49:41.000000000 -0400
@@ -160,8 +160,18 @@ extern long ia64_cmpxchg_called_with_bad
#define cmpxchg(ptr,o,n) cmpxchg_acq(ptr,o,n)
#define cmpxchg64(ptr,o,n) cmpxchg_acq(ptr,o,n)
-#define cmpxchg_local cmpxchg
-#define cmpxchg64_local cmpxchg64
+/* Must be executed between local_enter/local_exit. */
+static inline void *cmpxchg_local(void **p, void *old, void *new)
+{
+ unsigned long flags;
+ void *before;
+
+ before = *p;
+ if (likely(before == old))
+ *p = new;
+ return before;
+}
+#define cmpxchg64_local cmpxchg_local
#ifdef CONFIG_IA64_DEBUG_CMPXCHG
# define CMPXCHG_BUGCHECK_DECL int _cmpxchg_bugcheck_count = 128;
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Tue, 4 Sep 2007, Mathieu Desnoyers wrote:
> @@ -1566,12 +1565,13 @@ redo:
> object[c->offset]) != object))
> goto redo;
>
> - put_cpu();
> + local_exit(flags);
> if (unlikely((gfpflags & __GFP_ZERO)))
> memset(object, 0, c->objsize);
>
> return object;
> slow:
> + local_exit(flags);
Here we can be rescheduled to another processor.
> return __slab_alloc(s, gfpflags, node, addr, c)
c may point to the wrong processor.
local_t protection (critical section)
Adds local_enter_save(flags) and local_exit_restore(flags) as primitives to
surround critical sections using local_t types.
On architectures providing fast atomic primitives, this turns into a preempt
disable/enable().
However, on architectures not providing such fast primitives, such as ia64, it
turns into a local irq disable/enable so that we can use *_local primitives that
are non-atomic.
This is only preliminary work here: made for testing ia64 with cmpxchg_local
(other local_* primitives still use atomic_long_t operations as fallback).
Signed-off-by: Mathieu Desnoyers <[email protected]>
Christoph Lameter <[email protected]>
---
include/asm-generic/local.h | 9 +++++++++
include/asm-i386/local.h | 17 +++++++++++++++++
include/asm-ia64/intrinsics.h | 14 ++++++++++++--
3 files changed, 38 insertions(+), 2 deletions(-)
Index: linux-2.6-lttng/include/asm-generic/local.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-generic/local.h 2007-09-04 15:32:02.000000000 -0400
+++ linux-2.6-lttng/include/asm-generic/local.h 2007-09-05 08:50:47.000000000 -0400
@@ -46,6 +46,15 @@ typedef struct
#define local_add_unless(l, a, u) atomic_long_add_unless((&(l)->a), (a), (u))
#define local_inc_not_zero(l) atomic_long_inc_not_zero(&(l)->a)
+#define local_enter_save(flags) local_irq_save(flags)
+#define local_exit_restore(flags) local_irq_restore(flags)
+#define local_enter() local_irq_disable()
+#define local_exit() local_irq_enable()
+#define local_nest_irq_save(flags) (flags)
+#define local_nest_irq_restore(flags) (flags)
+#define local_nest_irq_disable()
+#define local_nest_irq_enable()
+
/* Non-atomic variants, ie. preemption disabled and won't be touched
* in interrupt, etc. Some archs can optimize this case well. */
#define __local_inc(l) local_set((l), local_read(l) + 1)
Index: linux-2.6-lttng/include/asm-i386/local.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-i386/local.h 2007-09-04 15:28:52.000000000 -0400
+++ linux-2.6-lttng/include/asm-i386/local.h 2007-09-05 08:49:19.000000000 -0400
@@ -194,6 +194,23 @@ static __inline__ long local_sub_return(
})
#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
+#define local_enter_save(flags) \
+ do { \
+ (flags); \
+ preempt_disable(); \
+ } while (0)
+#define local_exit_restore(flags) \
+ do { \
+ (flags); \
+ preempt_enable(); \
+ } while (0)
+#define local_enter() preempt_disable()
+#define local_exit() preempt_enable()
+#define local_nest_irq_save(flags) local_irq_save(flags)
+#define local_nest_irq_restore(flags) local_irq_restore(flags)
+#define local_nest_irq_disable() local_irq_disable()
+#define local_nest_irq_enable() local_irq_enable()
+
/* On x86, these are no better than the atomic variants. */
#define __local_inc(l) local_inc(l)
#define __local_dec(l) local_dec(l)
Index: linux-2.6-lttng/include/asm-ia64/intrinsics.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-ia64/intrinsics.h 2007-09-04 15:47:24.000000000 -0400
+++ linux-2.6-lttng/include/asm-ia64/intrinsics.h 2007-09-04 15:49:41.000000000 -0400
@@ -160,8 +160,18 @@ extern long ia64_cmpxchg_called_with_bad
#define cmpxchg(ptr,o,n) cmpxchg_acq(ptr,o,n)
#define cmpxchg64(ptr,o,n) cmpxchg_acq(ptr,o,n)
-#define cmpxchg_local cmpxchg
-#define cmpxchg64_local cmpxchg64
+/* Must be executed between local_enter/local_exit. */
+static inline void *cmpxchg_local(void **p, void *old, void *new)
+{
+ unsigned long flags;
+ void *before;
+
+ before = *p;
+ if (likely(before == old))
+ *p = new;
+ return before;
+}
+#define cmpxchg64_local cmpxchg_local
#ifdef CONFIG_IA64_DEBUG_CMPXCHG
# define CMPXCHG_BUGCHECK_DECL int _cmpxchg_bugcheck_count = 128;
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
slub - Use local_t protection
Use local_enter/local_exit for protection in the fast path.
Depends on the cmpxchg_local slub patch.
Changelog:
Add new primitives to switch from local critical section to interrupt disabled
section.
Signed-off-by: Mathieu Desnoyers <[email protected]>
CC: Christoph Lameter <[email protected]>
---
mm/slub.c | 55 ++++++++++++++++++++++++++++++++-----------------------
1 file changed, 32 insertions(+), 23 deletions(-)
Index: linux-2.6-lttng/mm/slub.c
===================================================================
--- linux-2.6-lttng.orig/mm/slub.c 2007-09-05 09:01:00.000000000 -0400
+++ linux-2.6-lttng/mm/slub.c 2007-09-05 09:05:34.000000000 -0400
@@ -1065,9 +1065,6 @@ static struct page *new_slab(struct kmem
BUG_ON(flags & GFP_SLAB_BUG_MASK);
- if (flags & __GFP_WAIT)
- local_irq_enable();
-
page = allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
if (!page)
@@ -1100,8 +1097,6 @@ static struct page *new_slab(struct kmem
page->freelist = start;
page->inuse = 0;
out:
- if (flags & __GFP_WAIT)
- local_irq_disable();
return page;
}
@@ -1455,8 +1450,7 @@ static void *__slab_alloc(struct kmem_ca
struct page *new;
unsigned long flags;
- local_irq_save(flags);
- put_cpu_no_resched();
+ local_nest_irq_save(flags);
if (!c->page)
/* Slab was flushed */
goto new_slab;
@@ -1479,8 +1473,7 @@ load_freelist:
c->freelist = object[c->offset];
out:
slab_unlock(c->page);
- local_irq_restore(flags);
- preempt_check_resched();
+ local_nest_irq_restore(flags);
if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
return object;
@@ -1494,8 +1487,16 @@ new_slab:
c->page = new;
goto load_freelist;
}
-
+ if (gfpflags & __GFP_WAIT) {
+ local_nest_irq_enable();
+ local_exit();
+ }
new = new_slab(s, gfpflags, node);
+ if (gfpflags & __GFP_WAIT) {
+ local_enter();
+ local_nest_irq_disable();
+ }
+
if (new) {
c = get_cpu_slab(s, smp_processor_id());
if (c->page) {
@@ -1523,8 +1524,7 @@ new_slab:
c->page = new;
goto load_freelist;
}
- local_irq_restore(flags);
- preempt_check_resched();
+ local_nest_irq_restore(flags);
return NULL;
debug:
object = c->page->freelist;
@@ -1552,8 +1552,11 @@ static void __always_inline *slab_alloc(
{
void **object;
struct kmem_cache_cpu *c;
+ unsigned long flags;
+ void *ret;
- c = get_cpu_slab(s, get_cpu());
+ local_enter_save(flags);
+ c = get_cpu_slab(s, smp_processor_id());
redo:
object = c->freelist;
if (unlikely(!object))
@@ -1566,14 +1569,15 @@ redo:
object[c->offset]) != object))
goto redo;
- put_cpu();
+ local_exit_restore(flags);
if (unlikely((gfpflags & __GFP_ZERO)))
memset(object, 0, c->objsize);
return object;
slow:
- return __slab_alloc(s, gfpflags, node, addr, c);
-
+ ret = __slab_alloc(s, gfpflags, node, addr, c);
+ local_exit_restore(flags);
+ return ret;
}
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1605,8 +1609,7 @@ static void __slab_free(struct kmem_cach
void **object = (void *)x;
unsigned long flags;
- put_cpu();
- local_irq_save(flags);
+ local_nest_irq_save(flags);
slab_lock(page);
if (unlikely(SlabDebug(page)))
@@ -1632,7 +1635,7 @@ checks_ok:
out_unlock:
slab_unlock(page);
- local_irq_restore(flags);
+ local_nest_irq_restore(flags);
return;
slab_empty:
@@ -1643,7 +1646,7 @@ slab_empty:
remove_partial(s, page);
slab_unlock(page);
- local_irq_restore(flags);
+ local_nest_irq_restore(flags);
discard_slab(s, page);
return;
@@ -1670,10 +1673,12 @@ static void __always_inline slab_free(st
void **object = (void *)x;
void **freelist;
struct kmem_cache_cpu *c;
+ unsigned long flags;
debug_check_no_locks_freed(object, s->objsize);
- c = get_cpu_slab(s, get_cpu());
+ local_enter_save(flags);
+ c = get_cpu_slab(s, smp_processor_id());
if (unlikely(c->node < 0))
goto slow;
redo:
@@ -1687,10 +1692,11 @@ redo:
!= freelist))
goto redo;
- put_cpu();
+ local_exit_restore(flags);
return;
slow:
__slab_free(s, page, x, addr, c->offset);
+ local_exit_restore(flags);
}
void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2026,8 +2032,11 @@ static struct kmem_cache_node *early_kme
BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
+ if (gfpflags & __GFP_WAIT)
+ local_irq_enable();
page = new_slab(kmalloc_caches, gfpflags, node);
-
+ if (gfpflags & __GFP_WAIT)
+ local_irq_disable();
BUG_ON(!page);
if (page_to_nid(page) != node) {
printk(KERN_ERR "SLUB: Unable to allocate memory from "
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
* Christoph Lameter ([email protected]) wrote:
> On Tue, 4 Sep 2007, Mathieu Desnoyers wrote:
>
> > @@ -1566,12 +1565,13 @@ redo:
> > object[c->offset]) != object))
> > goto redo;
> >
> > - put_cpu();
> > + local_exit(flags);
> > if (unlikely((gfpflags & __GFP_ZERO)))
> > memset(object, 0, c->objsize);
> >
> > return object;
> > slow:
> > + local_exit(flags);
>
> Here we can be rescheduled to another processor.
>
> > return __slab_alloc(s, gfpflags, node, addr, c)
>
> c may point to the wrong processor.
Good point. The current CPU is not updated at the beginning of the
slow path, so c can indeed end up pointing to the wrong processor.
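In other words, any per-cpu pointer taken before the local section is
released has to be re-fetched once the section is re-entered, since the task
may have migrated in the meantime. A minimal sketch of the corrected pattern
(not a literal hunk; the names follow the SLUB code above):

static void slow_path_sketch(struct kmem_cache *s)
{
	struct kmem_cache_cpu *c;
	unsigned long flags;

	local_enter_save(flags);
	c = get_cpu_slab(s, smp_processor_id());
	/* ... fast path work using c ... */

	local_exit_restore(flags);	/* migration is possible from here on */
	/* ... sleeping allocation, e.g. new_slab() with __GFP_WAIT ... */
	local_enter_save(flags);

	/* Re-read the per-cpu structure: the old c may belong to
	 * another processor by now. */
	c = get_cpu_slab(s, smp_processor_id());
	/* ... continue with the refreshed pointer ... */
	local_exit_restore(flags);
}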
I'll post the updated patchset. Comments are welcome, especially about
the naming scheme, which is currently awkward.
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
On Wed, 5 Sep 2007, Mathieu Desnoyers wrote:
> Use local_enter/local_exit for protection in the fast path.
Sorry that it took some time to get back to this issue. KS (the Kernel Summit) interfered.
> @@ -1494,8 +1487,16 @@ new_slab:
> c->page = new;
> goto load_freelist;
> }
> -
> + if (gfpflags & __GFP_WAIT) {
> + local_nest_irq_enable();
> + local_exit();
> + }
> new = new_slab(s, gfpflags, node);
> + if (gfpflags & __GFP_WAIT) {
> + local_enter();
> + local_nest_irq_disable();
> + }
> +
> if (new) {
> c = get_cpu_slab(s, smp_processor_id());
> if (c->page) {
Hmmmm... Definitely an interesting change to move the interrupt
enable/disable to __slab_alloc. But it looks like it is getting a bit
messy. All my attempts ended up like this too. Sigh.
> @@ -2026,8 +2032,11 @@ static struct kmem_cache_node *early_kme
>
> BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
>
> + if (gfpflags & __GFP_WAIT)
> + local_irq_enable();
> page = new_slab(kmalloc_caches, gfpflags, node);
> -
> + if (gfpflags & __GFP_WAIT)
> + local_irq_disable();
> BUG_ON(!page);
> if (page_to_nid(page) != node) {
> printk(KERN_ERR "SLUB: Unable to allocate memory from "
Hmmmm... Actually we could drop the irq disable/enable here since this
is boot code. That would also allow the removal of the later
local_irq_enable.
Good idea. I think I will do the interrupt enable/disable move as a
separate, independent change.
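For reference, if that suggestion were applied, the early_kmem_cache_node_alloc
hunk above would reduce to something like this (a sketch only, not a submitted
patch):

	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));

	/* Boot code: no __GFP_WAIT irq toggling is needed around the
	 * allocation here. */
	page = new_slab(kmalloc_caches, gfpflags, node);
	BUG_ON(!page);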
On Wed, 5 Sep 2007, Mathieu Desnoyers wrote:
> Index: linux-2.6-lttng/include/asm-generic/local.h
> ===================================================================
> --- linux-2.6-lttng.orig/include/asm-generic/local.h 2007-09-04 15:32:02.000000000 -0400
> +++ linux-2.6-lttng/include/asm-generic/local.h 2007-09-05 08:50:47.000000000 -0400
> @@ -46,6 +46,15 @@ typedef struct
> #define local_add_unless(l, a, u) atomic_long_add_unless((&(l)->a), (a), (u))
> #define local_inc_not_zero(l) atomic_long_inc_not_zero(&(l)->a)
>
> +#define local_enter_save(flags) local_irq_save(flags)
> +#define local_exit_restore(flags) local_irq_restore(flags)
> +#define local_enter() local_irq_disable()
> +#define local_exit() local_irq_enable()
> +#define local_nest_irq_save(flags) (flags)
> +#define local_nest_irq_restore(flags) (flags)
> +#define local_nest_irq_disable()
> +#define local_nest_irq_enable()
> +
This list is going to increase with RT support in SLUB? Argh.
> Index: linux-2.6-lttng/include/asm-i386/local.h
> ===================================================================
> --- linux-2.6-lttng.orig/include/asm-i386/local.h 2007-09-04 15:28:52.000000000 -0400
> +++ linux-2.6-lttng/include/asm-i386/local.h 2007-09-05 08:49:19.000000000 -0400
> @@ -194,6 +194,23 @@ static __inline__ long local_sub_return(
> })
> #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
>
> +#define local_enter_save(flags) \
> + do { \
> + (flags); \
> + preempt_disable(); \
> + } while (0)
> +#define local_exit_restore(flags) \
> + do { \
> + (flags); \
> + preempt_enable(); \
> + } while (0)
Doesn't this result in warnings, either because a variable is not used or
because it is used uninitialized?
* Christoph Lameter ([email protected]) wrote:
> On Wed, 5 Sep 2007, Mathieu Desnoyers wrote:
>
> > Index: linux-2.6-lttng/include/asm-generic/local.h
> > ===================================================================
> > --- linux-2.6-lttng.orig/include/asm-generic/local.h 2007-09-04 15:32:02.000000000 -0400
> > +++ linux-2.6-lttng/include/asm-generic/local.h 2007-09-05 08:50:47.000000000 -0400
> > @@ -46,6 +46,15 @@ typedef struct
> > #define local_add_unless(l, a, u) atomic_long_add_unless((&(l)->a), (a), (u))
> > #define local_inc_not_zero(l) atomic_long_inc_not_zero(&(l)->a)
> >
> > +#define local_enter_save(flags) local_irq_save(flags)
> > +#define local_exit_restore(flags) local_irq_restore(flags)
> > +#define local_enter() local_irq_disable()
> > +#define local_exit() local_irq_enable()
> > +#define local_nest_irq_save(flags) (flags)
> > +#define local_nest_irq_restore(flags) (flags)
> > +#define local_nest_irq_disable()
> > +#define local_nest_irq_enable()
> > +
>
> This list is going to increase with RT support in SLUB? Argh.
>
AFAIK, there is no difference between local irq save/restore in mainline
vs. -RT. The same applies to preempt disable/enable.
The only thing we have to make sure of is that the irq-disabled and
preempt-disabled code paths are short and O(1).
>
> > Index: linux-2.6-lttng/include/asm-i386/local.h
> > ===================================================================
> > --- linux-2.6-lttng.orig/include/asm-i386/local.h 2007-09-04 15:28:52.000000000 -0400
> > +++ linux-2.6-lttng/include/asm-i386/local.h 2007-09-05 08:49:19.000000000 -0400
> > @@ -194,6 +194,23 @@ static __inline__ long local_sub_return(
> > })
> > #define local_inc_not_zero(l) local_add_unless((l), 1, 0)
> >
> > +#define local_enter_save(flags) \
> > + do { \
> > + (flags); \
> > + preempt_disable(); \
> > + } while (0)
>
>
> > +#define local_exit_restore(flags) \
> > + do { \
> > + (flags); \
> > + preempt_enable(); \
> > + } while (0)
>
>
> Doesn't this result in warnings, either because a variable is not used or
> because it is used uninitialized?
Because gcc warns that the variable is not used at all if I don't put
the "(flags)" reference in.
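
As a stand-alone illustration (a user-space sketch, not the kernel macros
themselves; the cast to void is just one conventional way of spelling the
same "touch the variable" reference):

/* Without any reference to "flags" in the macro body, gcc -Wall reports
 * "unused variable 'flags'" at every call site.  Merely reading the
 * variable counts as a use and silences the warning, and in practice gcc
 * does not flag the discarded read as an uninitialized use. */
#define local_enter_save(flags) \
	do { \
		(void)(flags); \
		/* preempt_disable() goes here in the real macro */ \
	} while (0)

#define local_exit_restore(flags) \
	do { \
		(void)(flags); \
		/* preempt_enable() goes here in the real macro */ \
	} while (0)

int main(void)
{
	unsigned long flags;	/* only ever "touched" by the macros */

	local_enter_save(flags);
	local_exit_restore(flags);
	return 0;
}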
I'm glad that some of the proposed changes may help. I'll let the
cmpxchg_local patches sleep for a while so I can concentrate my efforts
on the text edit lock, immediate values and markers. I think what we'll
really need for cmpxchg_local is two flavors: one that is as atomic as
possible (for things such as tracing), and another that is as fast as
possible (potentially using irq disable). A lot of per-architecture
testing and fine-tuning will be required though, and I don't have the
hardware to do that.
Mathieu
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68