Use the this_cpu_cmpxchg_double functionality to implement a lockless
allocation algorithm.
Each of the per cpu pointers is paired with a transaction id that ensures
that updates of the per cpu information can only occur in sequence on
a certain cpu.
A transaction id is a "long" integer that is comprised of an event number
and the cpu number. The event number is incremented for every change to the
per cpu state. This means that the cmpxchg instruction can verify for an
update that nothing interfered and that we are updating the percpu structure
for the processor where we picked up the information and that we are also
currently on that processor when we update the information.
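As an illustration (my example, not part of the patch, assuming CONFIG_NR_CPUS == 4):
cpu 2 starts with tid 2 and steps through 6, 10, 14, ..., so tid % 4 always recovers
the cpu number and tid / 4 the event number. Any intervening operation on that cpu,
or a migration to another cpu, therefore changes the tid that the cmpxchg expects
and makes it fail, forcing a retry.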
This results in a significant decrease of the overhead in the fastpaths. It
also makes it easy to adopt the fast path for realtime kernels since this
is lockless and does not require the use of the current per cpu area
over the critical section. It is only important that the per cpu area is
current at the beginning of the critical section and at the end.
So there is not even a need to disable preemption, which will make the allocations
scale well in an RT environment.
[Beware: There have been previous attempts at lockless fastpaths that
did not succeed. We hope to have learned from these experiences but
review certainly is necessary.]
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>
---
include/linux/slub_def.h | 3 -
mm/slub.c | 128 ++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 113 insertions(+), 18 deletions(-)
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h 2010-11-23 16:31:06.000000000 -0600
+++ linux-2.6/include/linux/slub_def.h 2010-11-23 16:53:34.000000000 -0600
@@ -36,7 +36,8 @@ enum stat_item {
NR_SLUB_STAT_ITEMS };
struct kmem_cache_cpu {
- void **freelist; /* Pointer to first free per cpu object */
+ void **freelist; /* Pointer to next available object */
+ unsigned long tid; /* Globally unique transaction id */
struct page *page; /* The slab from which we are allocating */
int node; /* The node of the page (or -1 for debug) */
#ifdef CONFIG_SLUB_STATS
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2010-11-23 16:31:06.000000000 -0600
+++ linux-2.6/mm/slub.c 2010-11-23 16:55:49.000000000 -0600
@@ -1486,6 +1486,31 @@ static void unfreeze_slab(struct kmem_ca
}
/*
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+static inline unsigned long next_tid(unsigned long tid)
+{
+ return tid + CONFIG_NR_CPUS;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+ return tid % CONFIG_NR_CPUS;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+ return tid / CONFIG_NR_CPUS;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+ return cpu;
+}
+
+/*
* Remove the cpu slab
*/
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -1509,6 +1534,7 @@ static void deactivate_slab(struct kmem_
/* Retrieve object from cpu_freelist */
object = c->freelist;
c->freelist = get_freepointer(s, c->freelist);
+ c->tid = next_tid(c->tid);
/* And put onto the regular freelist */
set_freepointer(s, object, page->freelist);
@@ -1646,10 +1672,15 @@ slab_out_of_memory(struct kmem_cache *s,
* a call to the page allocator and the setup of a new slab.
*/
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr)
{
void **object;
struct page *new;
+ unsigned long flags;
+ struct kmem_cache_cpu *c;
+
+ local_irq_save(flags);
+ c = this_cpu_ptr(s->cpu_slab);
/* We handle __GFP_ZERO in the caller */
gfpflags &= ~__GFP_ZERO;
@@ -1675,7 +1706,9 @@ load_freelist:
c->page->freelist = NULL;
c->node = page_to_nid(c->page);
unlock_out:
+ c->tid = next_tid(c->tid);
slab_unlock(c->page);
+ local_irq_restore(flags);
stat(s, ALLOC_SLOWPATH);
return object;
@@ -1737,23 +1770,53 @@ static __always_inline void *slab_alloc(
{
void **object;
struct kmem_cache_cpu *c;
- unsigned long flags;
+ unsigned long tid;
if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
- local_irq_save(flags);
+redo:
+ /*
+ * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+ * enabled. We may switch back and forth between cpus while
+ * reading from one cpu area. That does not matter as long
+ * as we end up on the original cpu again when doing the cmpxchg.
+ */
c = __this_cpu_ptr(s->cpu_slab);
+
+ /*
+ * The transaction ids are globally unique per cpu and per operation on
+ * a per cpu queue. Thus they guarantee that the cmpxchg_double
+ * occurs on the right processor and that there was no operation on the
+ * linked list in between.
+ */
+ tid = c->tid;
+ barrier();
+
object = c->freelist;
- if (unlikely(!object || !node_match(c, node)))
+ if (unlikely(!object || !node_match(c, c->node)))
- object = __slab_alloc(s, gfpflags, node, addr, c);
+ object = __slab_alloc(s, gfpflags, c->node, addr);
else {
- c->freelist = get_freepointer(s, object);
+ /*
+ * The cmpxchg will only match if there was no additional
+ * operation and if we are on the right processor.
+ */
+ if (unlikely(!irqsafe_cmpxchg_double(&s->cpu_slab->freelist, object, tid,
+ get_freepointer(s, object), next_tid(tid)))) {
+#ifdef CONFIG_DEBUG_VM
+ unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+ printk(KERN_INFO "slab_alloc %s: redo cpu %u->[%u] event=[%ld] %ld->%ld\n",
+ s->name, tid_to_cpu(tid), tid_to_cpu(actual_tid),
+ tid_to_event(actual_tid), tid_to_event(tid),
+ tid_to_event(next_tid(tid)));
+#endif
+ goto redo;
+ }
stat(s, ALLOC_FASTPATH);
}
- local_irq_restore(flags);
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->objsize);
@@ -1817,8 +1880,10 @@ static void __slab_free(struct kmem_cach
{
void *prior;
void **object = (void *)x;
+ unsigned long flags;
stat(s, FREE_SLOWPATH);
+ local_irq_save(flags);
slab_lock(page);
if (kmem_cache_debug(s))
@@ -1849,6 +1914,7 @@ checks_ok:
out_unlock:
slab_unlock(page);
+ local_irq_restore(flags);
return;
slab_empty:
@@ -1860,6 +1926,7 @@ slab_empty:
stat(s, FREE_REMOVE_PARTIAL);
}
slab_unlock(page);
+ local_irq_restore(flags);
stat(s, FREE_SLAB);
discard_slab(s, page);
return;
@@ -1886,23 +1953,38 @@ static __always_inline void slab_free(st
{
void **object = (void *)x;
struct kmem_cache_cpu *c;
- unsigned long flags;
+ unsigned long tid;
slab_free_hook(s, x);
- local_irq_save(flags);
- c = __this_cpu_ptr(s->cpu_slab);
-
slab_free_hook_irq(s, x);
- if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
+redo:
+ c = this_cpu_ptr(s->cpu_slab);
+ tid = c->tid;
+ barrier();
+
+ if (likely(page == c->page) &&
+ c->node != NUMA_NO_NODE) {
+
set_freepointer(s, object, c->freelist);
- c->freelist = object;
+
+ if (unlikely(!irqsafe_cmpxchg_double(&s->cpu_slab->freelist,
+ c->freelist, tid,
+ object, next_tid(tid)))) {
+#ifdef CONFIG_DEBUG_VM
+ unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+ printk(KERN_INFO "slab_free %s: redo cpu %d->[%d] event=[%ld] %ld->%ld\n", s->name,
+ tid_to_cpu(tid), tid_to_cpu(actual_tid),
+ tid_to_event(actual_tid), tid_to_event(tid),
+ tid_to_event(next_tid(tid)));
+#endif
+ goto redo;
+ }
stat(s, FREE_FASTPATH);
} else
__slab_free(s, page, x, addr);
-
- local_irq_restore(flags);
}
void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2102,12 +2184,24 @@ init_kmem_cache_node(struct kmem_cache_n
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
+ int cpu;
+
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
- s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+ /*
+ * Must align to double word boundary for the long cmpxchg instructions
+ * to work.
+ */
+ s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+
+ if (!s->cpu_slab)
+ return 0;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
- return s->cpu_slab != NULL;
+ return 1;
}
static struct kmem_cache *kmem_cache_node;
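For readers skimming the hunks above: the reason for the explicit alignment passed
to __alloc_percpu() and for keeping freelist and tid as the first two fields can be
sketched as follows (an illustration of the constraint only, not additional code
from the patch):

	/*
	 * cmpxchg_double updates two adjacent machine words, so the pair it
	 * operates on (freelist, tid) must occupy the first two slots and the
	 * per cpu structure must start on a 2 * sizeof(void *) boundary --
	 * which is exactly what the __alloc_percpu() call above requests.
	 */
	struct kmem_cache_cpu {
		void **freelist;	/* word 0: next available object */
		unsigned long tid;	/* word 1: globally unique transaction id */
		struct page *page;	/* the slab we are allocating from */
		int node;		/* node of the page (or -1 for debug) */
		/* ... */
	};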
On Tuesday, 23 November 2010 at 17:51 -0600, Christoph Lameter wrote:
> plain text document attachment (slub_generation)
> Use the this_cpu_cmpxchg_double functionality to implement a lockless
> allocation algorithm.
>
> Each of the per cpu pointers is paired with a transaction id that ensures
> that updates of the per cpu information can only occur in sequence on
> a certain cpu.
>
> A transaction id is a "long" integer that is comprised of an event number
> and the cpu number. The event number is incremented for every change to the
> per cpu state. This means that the cmpxchg instruction can verify for an
> update that nothing interfered and that we are updating the percpu structure
> for the processor where we picked up the information and that we are also
> currently on that processor when we update the information.
>
> This results in a significant decrease of the overhead in the fastpaths. It
> also makes it easy to adopt the fast path for realtime kernels since this
> is lockless and does not require the use of the current per cpu area
> over the critical section. It is only important that the per cpu area is
> current at the beginning of the critical section and at the end.
>
> So there is not even a need to disable preemption, which will make the allocations
> scale well in an RT environment.
>
> [Beware: There have been previous attempts at lockless fastpaths that
> did not succeed. We hope to have learned from these experiences but
> review certainly is necessary.]
>
> Cc: Ingo Molnar <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Signed-off-by: Christoph Lameter <[email protected]>
>
> ---
>
> /*
> + * Calculate the next globally unique transaction for disambiguation
> + * during cmpxchg. The transactions start with the cpu number and are then
> + * incremented by CONFIG_NR_CPUS.
> + */
> +static inline unsigned long next_tid(unsigned long tid)
> +{
> + return tid + CONFIG_NR_CPUS;
> +}
Hmm, this only works for power of two NR_CPUS, or else one cpu 'tid'
could wrap on another cpu tid.
I suggest using 4096 (or roundup_pow_of_two(CONFIG_NR_CPUS))
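To make the wrap concern concrete, here is a worked example (numbers are mine,
assuming a 64-bit unsigned long and CONFIG_NR_CPUS == 3, i.e. a step of 3):

	tid           = 0xfffffffffffffffd	/* tid % 3 == 1 -> looks like cpu 1 */
	next_tid(tid) = tid + 3 = 0		/* wraps around; 0 % 3 == 0 -> looks like cpu 0 */

After the wrap, cpu 1's tid stream collides with cpu 0's, so a stale tid could match
on the wrong cpu. With a power-of-two step such as 4, tid % 4 is preserved across the
2^64 wrap because 2^64 is a multiple of the step.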
* Christoph Lameter ([email protected]) wrote:
[...]
> @@ -1737,23 +1770,53 @@ static __always_inline void *slab_alloc(
> {
> void **object;
> struct kmem_cache_cpu *c;
> - unsigned long flags;
> + unsigned long tid;
>
> if (slab_pre_alloc_hook(s, gfpflags))
> return NULL;
>
> - local_irq_save(flags);
> +redo:
> + /*
> + * Must read kmem_cache cpu data via this cpu ptr. Preemption is
> + * enabled. We may switch back and forth between cpus while
> + * reading from one cpu area. That does not matter as long
> + * as we end up on the original cpu again when doing the cmpxchg.
> + */
> c = __this_cpu_ptr(s->cpu_slab);
> +
> + /*
> + * The transaction ids are globally unique per cpu and per operation on
> + * a per cpu queue. Thus they guarantee that the cmpxchg_double
> + * occurs on the right processor and that there was no operation on the
> + * linked list in between.
> + */
There seems to be some voodoo magic I don't understand here. I'm curious to see
what happens if we have:
CPU A CPU B
slab_alloc()
c = __this_cpu_ptr(s->cpu_slab);
tid = c->tid
thread migrated to CPU B
slab_alloc()
c = __this_cpu_ptr(s->cpu_slab);
tid = c->tid
... ...
irqsafe_cmpxchg_double
- expect tid, on CPU A, success
migrate back to CPU A
irqsafe_cmpxchg_double
- expect (same) tid, on CPU A, success
So either there is a crucially important point I am missing, or the transaction
ID does not seem to be truly unique due to migration.
Thanks,
Mathieu
> + tid = c->tid;
> + barrier();
> +
> object = c->freelist;
> - if (unlikely(!object || !node_match(c, node)))
> + if (unlikely(!object || !node_match(c, c->node)))
>
> - object = __slab_alloc(s, gfpflags, node, addr, c);
> + object = __slab_alloc(s, gfpflags, c->node, addr);
>
> else {
> - c->freelist = get_freepointer(s, object);
> + /*
> + * The cmpxchg will only match if there was no additional
> + * operation and if we are on the right processor.
> + */
> + if (unlikely(!irqsafe_cmpxchg_double(&s->cpu_slab->freelist, object, tid,
> + get_freepointer(s, object), next_tid(tid)))) {
--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com
* Mathieu Desnoyers ([email protected]) wrote:
> * Christoph Lameter ([email protected]) wrote:
>
> [...]
>
> > @@ -1737,23 +1770,53 @@ static __always_inline void *slab_alloc(
> > {
> > void **object;
> > struct kmem_cache_cpu *c;
> > - unsigned long flags;
> > + unsigned long tid;
> >
> > if (slab_pre_alloc_hook(s, gfpflags))
> > return NULL;
> >
> > - local_irq_save(flags);
> > +redo:
> > + /*
> > + * Must read kmem_cache cpu data via this cpu ptr. Preemption is
> > + * enabled. We may switch back and forth between cpus while
> > + * reading from one cpu area. That does not matter as long
> > + * as we end up on the original cpu again when doing the cmpxchg.
> > + */
> > c = __this_cpu_ptr(s->cpu_slab);
> > +
> > + /*
> > + * The transaction ids are globally unique per cpu and per operation on
> > + * a per cpu queue. Thus they guarantee that the cmpxchg_double
> > + * occurs on the right processor and that there was no operation on the
> > + * linked list in between.
> > + */
>
> There seems to be some voodoo magic I don't understand here. I'm curious to see
> what happens if we have:
>
> CPU A CPU B
> slab_alloc()
> c = __this_cpu_ptr(s->cpu_slab);
> tid = c->tid
> thread migrated to CPU B
>
> slab_alloc()
> c = __this_cpu_ptr(s->cpu_slab);
> tid = c->tid
> ... ...
> irqsafe_cmpxchg_double
> - expect tid, on CPU A, success
> migrate back to CPU A
> irqsafe_cmpxchg_double
> - expect (same) tid, on CPU A, success
Ah! I knew I was missing something: the second cmpxchg will fail because it
expects "tid", but the value is now the "next_tid". So effectively, many
instances of the same transaction can run concurrently, but only one will
succeed.
Sorry for the noise.
Thanks,
Mathieu
>
> So either there is a crucially important point I am missing, or the transaction
> ID does not seem to be truly unique due to migration.
>
> Thanks,
>
> Mathieu
>
>
> > + tid = c->tid;
> > + barrier();
> > +
> > object = c->freelist;
> > - if (unlikely(!object || !node_match(c, node)))
> > + if (unlikely(!object || !node_match(c, c->node)))
> >
> > - object = __slab_alloc(s, gfpflags, node, addr, c);
> > + object = __slab_alloc(s, gfpflags, c->node, addr);
> >
> > else {
> > - c->freelist = get_freepointer(s, object);
> > + /*
> > + * The cmpxchg will only match if there was no additional
> > + * operation and if we are on the right processor.
> > + */
> > + if (unlikely(!irqsafe_cmpxchg_double(&s->cpu_slab->freelist, object, tid,
> > + get_freepointer(s, object), next_tid(tid)))) {
>
>
> --
> Mathieu Desnoyers
> Operating System Efficiency R&D Consultant
> EfficiOS Inc.
> http://www.efficios.com
--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com
On Tue, 23 Nov 2010, Mathieu Desnoyers wrote:
> Ah! I knew I was missing something: the second cmpxchg will fail because it
> expects "tid", but the value is now the "next_tid". So effectively, many
> instances of the same transaction can run concurrently, but only one will
> succeed.
Right.
> Sorry for the noise.
No, it's good to hear that you were not able to find a hole at first glance.
On Wed, 24 Nov 2010, Eric Dumazet wrote:
> Hmm, this only works for power of two NR_CPUS, or else one cpu 'tid'
> could wrap on another cpu tid.
Hmm... That is essentially like using a bitfield then.
> I suggest using 4096 (or roundup_pow_of_two(CONFIG_NR_CPUS))
There could be machines with 64k cpus soon. So I guess roundup_pow_of_two() it will
have to be.
Use a power-of-two increment in all cases.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2010-11-23 22:22:03.596998064 -0600
+++ linux-2.6/mm/slub.c 2010-11-23 21:53:09.422591631 -0600
@@ -1490,19 +1490,21 @@
* during cmpxchg. The transactions start with the cpu number and are then
* incremented by CONFIG_NR_CPUS.
*/
+#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
+
static inline unsigned long next_tid(unsigned long tid)
{
- return tid + CONFIG_NR_CPUS;
+ return tid + TID_STEP;
}
static inline unsigned int tid_to_cpu(unsigned long tid)
{
- return tid % CONFIG_NR_CPUS;
+ return tid % TID_STEP;
}
static inline unsigned long tid_to_event(unsigned long tid)
{
- return tid / CONFIG_NR_CPUS;
+ return tid / TID_STEP;
}
static inline unsigned int init_tid(int cpu)
On Wed, Nov 24, 2010 at 1:51 AM, Christoph Lameter <[email protected]> wrote:
> @@ -1737,23 +1770,53 @@ static __always_inline void *slab_alloc(
> {
> void **object;
> struct kmem_cache_cpu *c;
> - unsigned long flags;
> + unsigned long tid;
>
> if (slab_pre_alloc_hook(s, gfpflags))
> return NULL;
>
> - local_irq_save(flags);
> +redo:
> + /*
> + * Must read kmem_cache cpu data via this cpu ptr. Preemption is
> + * enabled. We may switch back and forth between cpus while
> + * reading from one cpu area. That does not matter as long
> + * as we end up on the original cpu again when doing the cmpxchg.
> + */
> c = __this_cpu_ptr(s->cpu_slab);
> +
> + /*
> + * The transaction ids are globally unique per cpu and per operation on
> + * a per cpu queue. Thus they guarantee that the cmpxchg_double
> + * occurs on the right processor and that there was no operation on the
> + * linked list in between.
> + */
> + tid = c->tid;
> + barrier();
You're using a compiler barrier after every load from c->tid. Why?
On Tue, 2010-11-23 at 17:51 -0600, Christoph Lameter wrote:
> So there is not even a need to disable preemption, which will make the allocations
> scale well in an RT environment.
The RT thing isn't particularly about scaling per se; it's mostly about
working at all.
This thing still relies on disabling IRQs in the slow path, which means
its still going to be a lot of work to make it work on -rt.
It also heavily relies on bit-spinlocks, which again is going to need
changes for -rt.
But yes, the lockless fast path is nice, I'll try and get around to
reading it.
On Wed, 24 Nov 2010, Peter Zijlstra wrote:
> This thing still relies on disabling IRQs in the slow path, which means
> its still going to be a lot of work to make it work on -rt.
The disabling of irqs is because slab operations are used from interrupt
context. If we can avoid slab operations from interrupt contexts then we
can drop the interrupt disable in the slab allocators.
On Wed, 24 Nov 2010, Pekka Enberg wrote:
> > + /*
> > + * The transaction ids are globally unique per cpu and per operation on
> > + * a per cpu queue. Thus they guarantee that the cmpxchg_double
> > + * occurs on the right processor and that there was no operation on the
> > + * linked list in between.
> > + */
> > + tid = c->tid;
> > + barrier();
>
> You're using a compiler barrier after every load from c->tid. Why?
To make sure that the compiler does not do something like loading the tid
later. The tid must be obtained before the rest of the information from
the per cpu slab data is retrieved in order to ensure that we have a
consistent set of data to operate on.
The critical section begins with the retrieval of the tid and it ends with
the replacement of the tid with the newly generated one. This means that
all state data for the alloc and free operation needs to be retrieved in
that critical section. The change must be saved with the final
cmpxchg_double of the critical section.
On Wed, Nov 24, 2010 at 6:17 PM, Christoph Lameter <[email protected]> wrote:
> On Wed, 24 Nov 2010, Pekka Enberg wrote:
>
>> > + /*
>> > + * The transaction ids are globally unique per cpu and per operation on
>> > + * a per cpu queue. Thus they guarantee that the cmpxchg_double
>> > + * occurs on the right processor and that there was no operation on the
>> > + * linked list in between.
>> > + */
>> > + tid = c->tid;
>> > + barrier();
>>
>> You're using a compiler barrier after every load from c->tid. Why?
>
> To make sure that the compiler does not do something like loading the tid
> later. The tid must be obtained before the rest of the information from
> the per cpu slab data is retrieved in order to ensure that we have a
> consistent set of data to operate on.
>
> The critical section begins with the retrieval of the tid and it ends with
> the replacement of the tid with the newly generated one. This means that
> all state data for the alloc and free operation needs to be retrieved in
> that critical section. The change must be saved with the final
> cmpxchg_double of the critical section.
Right and we don't need a *memory barrier* here because we're
accessing a per-CPU variable which means operations appear in-order.
Pekka
On Wed, 24 Nov 2010, Pekka Enberg wrote:
> > The critical section begins with the retrieval of the tid and it ends with
> > the replacement of the tid with the newly generated one. This means that
> > all state data for the alloc and free operation needs to be retrieved in
> > that critical section. The change must be saved with the final
> > cmpxchg_double of the critical section.
>
> Right and we don't need a *memory barrier* here because we're
> accessing a per-CPU variable which means operations appear in-order.
The compiler is still free to rearrange the tid fetch. A possible
optimization that the compiler may do is to move the tid fetch into the
next if statement since that is the only block in which the tid variable
is actually used.
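To spell out the hazard in code (an illustrative sketch condensed from the fastpath
in the patch, not a new proposal):

	/*
	 * Intended order: the tid read opens the speculative section, and every
	 * per cpu field read after it is validated by the final cmpxchg_double.
	 */
	tid = c->tid;
	barrier();		/* compiler must not sink the tid load below this */
	object = c->freelist;
	...
	irqsafe_cmpxchg_double(&s->cpu_slab->freelist, object, tid,
			get_freepointer(s, object), next_tid(tid));

	/*
	 * Without barrier() the compiler could defer the tid load, e.g. into
	 * the else branch where tid is first used:
	 */
	object = c->freelist;
	...
	tid = c->tid;		/* tid no longer covers the freelist read above */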
On Wed, 24 Nov 2010, Pekka Enberg wrote:
>> > The critical section begins with the retrieval of the tid and it ends with
>> > the replacement of the tid with the newly generated one. This means that
>> > all state data for the alloc and free operation needs to be retrieved in
>> > that critical section. The change must be saved with the final
>> > cmpxchg_double of the critical section.
>>
>> Right and we don't need a *memory barrier* here because we're
>> accessing a per-CPU variable which means operations appear in-order.
On Wed, Nov 24, 2010 at 6:45 PM, Christoph Lameter <[email protected]> wrote:
> The compiler is still free to rearrange the tid fetch. A possible
> optimization that the compiler may do is to move the tid fetch into the
> next if statement since that is the only block in which the tid variable
> is actually used.
Yes, which is why we need a *compiler barrier* but not a *memory barrier*.
On Wed, 24 Nov 2010, Pekka Enberg wrote:
> On Wed, 24 Nov 2010, Pekka Enberg wrote:
> >> > The critical section begins with the retrieval of the tid and it ends with
> >> > the replacement of the tid with the newly generated one. This means that
> >> > all state data for the alloc and free operation needs to be retrieved in
> >> > that critical section. The change must be saved with the final
> >> > cmpxchg_double of the critical section.
> >>
> >> Right and we don't need a *memory barrier* here because we're
> >> accessing a per-CPU variable which means operations appear in-order.
>
> On Wed, Nov 24, 2010 at 6:45 PM, Christoph Lameter <[email protected]> wrote:
> > The compiler is still free to rearrange the tid fetch. A possible
> > optimization that the compiler may do is to move the tid fetch into the
> > next if statement since that is the only block in which the tid variable
> > is actually used.
>
> Yes, which is why we need a *compiler barrier* but not a *memory barrier*.
Exactly. That is the reason there is a compiler barrier there. A memory
barrier would be smp_mb() or so.
On Wed, 2010-11-24 at 10:14 -0600, Christoph Lameter wrote:
> On Wed, 24 Nov 2010, Peter Zijlstra wrote:
>
> > This thing still relies on disabling IRQs in the slow path, which means
> > its still going to be a lot of work to make it work on -rt.
>
> The disabling of irqs is because slab operations are used from interrupt
> context. If we can avoid slab operations from interrupt contexts then we
> can drop the interrupt disable in the slab allocators.
That's not so much the point; there are per-cpu assumptions due to that.
Not everything is under a proper lock, see for example this bit:
new = new_slab(s, gfpflags, node);
if (gfpflags & __GFP_WAIT)
local_irq_disable();
if (new) {
c = __this_cpu_ptr(s->cpu_slab);
stat(s, ALLOC_SLAB);
if (c->page)
flush_slab(s, c);
slab_lock(new);
__SetPageSlubFrozen(new);
c->page = new;
goto load_freelist;
}
There we have the __this_cpu_ptr, c->page deref and flush_slab()->stat()
call all before we take a lock.
On Wed, 24 Nov 2010, Peter Zijlstra wrote:
> On Wed, 2010-11-24 at 10:14 -0600, Christoph Lameter wrote:
> > On Wed, 24 Nov 2010, Peter Zijlstra wrote:
> >
> > > This thing still relies on disabling IRQs in the slow path, which means
> > > its still going to be a lot of work to make it work on -rt.
> >
> > The disabling of irqs is because slab operations are used from interrupt
> > context. If we can avoid slab operations from interrupt contexts then we
> > can drop the interrupt disable in the slab allocators.
>
> That's not so much the point, there's per-cpu assumptions due to that.
Sure, the exclusive access to the per cpu area is exploited during
irq-off sections. That would have to change.
> Not everything is under a proper lock, see for example this bit:
>
> new = new_slab(s, gfpflags, node);
>
> if (gfpflags & __GFP_WAIT)
> local_irq_disable();
>
> if (new) {
> c = __this_cpu_ptr(s->cpu_slab);
> stat(s, ALLOC_SLAB);
> if (c->page)
> flush_slab(s, c);
> slab_lock(new);
> __SetPageSlubFrozen(new);
> c->page = new;
> goto load_freelist;
> }
>
> There we have the __this_cpu_ptr, c->page deref and flush_slab()->stat()
> call all before we take a lock.
That is all per cpu data and therefore would have to get different treatment
if we wanted to drop the processing in interrupt-off mode.
Disabling preemption may initially be sufficient there.
On 11/24/2010 08:17 AM, Christoph Lameter wrote:
> On Wed, 24 Nov 2010, Pekka Enberg wrote:
>
>>> + /*
>>> + * The transaction ids are globally unique per cpu and per operation on
>>> + * a per cpu queue. Thus they guarantee that the cmpxchg_double
>>> + * occurs on the right processor and that there was no operation on the
>>> + * linked list in between.
>>> + */
>>> + tid = c->tid;
>>> + barrier();
>> You're using a compiler barrier after every load from c->tid. Why?
> To make sure that the compiler does not do something like loading the tid
> later. The tid must be obtained before the rest of the information from
> the per cpu slab data is retrieved in order to ensure that we have a
> consistent set of data to operate on.
Isn't that best expressed with ACCESS_ONCE()?
J
On Wed, 24 Nov 2010, Jeremy Fitzhardinge wrote:
> On 11/24/2010 08:17 AM, Christoph Lameter wrote:
> > On Wed, 24 Nov 2010, Pekka Enberg wrote:
> >
> >>> + /*
> >>> + * The transaction ids are globally unique per cpu and per operation on
> >>> + * a per cpu queue. Thus they guarantee that the cmpxchg_double
> >>> + * occurs on the right processor and that there was no operation on the
> >>> + * linked list in between.
> >>> + */
> >>> + tid = c->tid;
> >>> + barrier();
> >> You're using a compiler barrier after every load from c->tid. Why?
> > To make sure that the compiler does not do something like loading the tid
> > later. The tid must be obtained before the rest of the information from
> > the per cpu slab data is retrieved in order to ensure that we have a
> > consistent set of data to operate on.
>
> Isn't that best expressed with ACCESS_ONCE()?
Reading the comments, it seems ACCESS_ONCE() does not prevent reordering
if it is only used on a single access. ACCESS_ONCE() uses volatile? Uggh.
* Jeremy Fitzhardinge ([email protected]) wrote:
> On 11/24/2010 08:17 AM, Christoph Lameter wrote:
> > On Wed, 24 Nov 2010, Pekka Enberg wrote:
> >
> >>> + /*
> >>> + * The transaction ids are globally unique per cpu and per operation on
> >>> + * a per cpu queue. Thus they guarantee that the cmpxchg_double
> >>> + * occurs on the right processor and that there was no operation on the
> >>> + * linked list in between.
> >>> + */
> >>> + tid = c->tid;
> >>> + barrier();
> >> You're using a compiler barrier after every load from c->tid. Why?
> > To make sure that the compiler does not do something like loading the tid
> > later. The tid must be obtained before the rest of the information from
> > the per cpu slab data is retrieved in order to ensure that we have a
> > consistent set of data to operate on.
>
> Isn't that best expressed with ACCESS_ONCE()?
ACCESS_ONCE()'s use of volatile only ensures that volatile accesses are not
reordered with respect to each other. It does not ensure anything about other
unrelated memory accesses (which we want to make sure the compiler won't move before
the c->tid read). The compiler barrier() is really what seems to be needed here.
Thanks,
Mathieu
--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com
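A small side-by-side sketch of the difference (my illustration, not from the patch):

	/*
	 * ACCESS_ONCE(): forces a single volatile read of c->tid, but the
	 * compiler is still allowed to move the unrelated, non-volatile
	 * c->freelist load above it.
	 */
	tid = ACCESS_ONCE(c->tid);
	object = c->freelist;

	/*
	 * barrier(): a full compiler barrier; no load or store, volatile or
	 * not, may be moved across it in either direction.
	 */
	tid = c->tid;
	barrier();
	object = c->freelist;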
On 11/24/2010 11:53 AM, Christoph Lameter wrote:
> Reading the comments, it seems ACCESS_ONCE() does not prevent reordering
> if it is only used on a single access. ACCESS_ONCE() uses volatile? Uggh.
Hm, good point. I'll need to review my uses of it.
J