The major portions are there but there is no support yet for
directly allocating per cpu objects. There could also be more
sophisticated code to exploit the batch freeing.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux/include/linux/slub_def.h
===================================================================
--- linux.orig/include/linux/slub_def.h
+++ linux/include/linux/slub_def.h
@@ -110,4 +110,5 @@ static inline void sysfs_slab_remove(str
}
#endif
+#define _HAVE_SLAB_ALLOCATOR_ARRAY_OPERATIONS
#endif /* _LINUX_SLUB_DEF_H */
Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -1379,13 +1379,9 @@ static void setup_object(struct kmem_cac
s->ctor(object);
}
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct page *__new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
- void *start;
- void *p;
- int order;
- int idx;
if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
@@ -1394,33 +1390,42 @@ static struct page *new_slab(struct kmem
page = allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
- if (!page)
- goto out;
+ if (page) {
+ inc_slabs_node(s, page_to_nid(page), page->objects);
+ page->slab_cache = s;
+ __SetPageSlab(page);
+ if (page->pfmemalloc)
+ SetPageSlabPfmemalloc(page);
+ }
- order = compound_order(page);
- inc_slabs_node(s, page_to_nid(page), page->objects);
- page->slab_cache = s;
- __SetPageSlab(page);
- if (page->pfmemalloc)
- SetPageSlabPfmemalloc(page);
-
- start = page_address(page);
-
- if (unlikely(s->flags & SLAB_POISON))
- memset(start, POISON_INUSE, PAGE_SIZE << order);
-
- for_each_object_idx(p, idx, s, start, page->objects) {
- setup_object(s, page, p);
- if (likely(idx < page->objects))
- set_freepointer(s, p, p + s->size);
- else
- set_freepointer(s, p, NULL);
- }
-
- page->freelist = start;
- page->inuse = page->objects;
- page->frozen = 1;
-out:
+ return page;
+}
+
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+ struct page *page = __new_slab(s, flags, node);
+
+ if (page) {
+ void *p;
+ int idx;
+ void *start = page_address(page);
+
+ if (unlikely(s->flags & SLAB_POISON))
+ memset(start, POISON_INUSE,
+ PAGE_SIZE << compound_order(page));
+
+ for_each_object_idx(p, idx, s, start, page->objects) {
+ setup_object(s, page, p);
+ if (likely(idx < page->objects))
+ set_freepointer(s, p, p + s->size);
+ else
+ set_freepointer(s, p, NULL);
+ }
+
+ page->freelist = start;
+ page->inuse = page->objects;
+ page->frozen = 1;
+ }
return page;
}
@@ -2516,8 +2521,78 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trac
#endif
#endif
+int slab_array_alloc_from_partial(struct kmem_cache *s,
+ size_t nr, void **p)
+{
+ void **end = p + nr;
+ struct kmem_cache_node *n = get_node(s, numa_mem_id());
+ int allocated = 0;
+ unsigned long flags;
+ struct page *page, *page2;
+
+ if (!n->nr_partial)
+ return 0;
+
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ void *freelist;
+
+ if (page->objects - page->inuse > end - p)
+ /* More objects free in page than we want */
+ break;
+ list_del(&page->lru);
+ slab_lock(page);
+ freelist = page->freelist;
+ page->inuse = page->objects;
+ page->freelist = NULL;
+ slab_unlock(page);
+ /* Grab all available objects */
+ while (freelist) {
+ *p++ = freelist;
+ freelist = get_freepointer(s, freelist);
+ allocated++;
+ }
+ }
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return allocated;
+}
+
+int slab_array_alloc_from_page_allocator(struct kmem_cache *s,
+ gfp_t flags, size_t nr, void **p)
+{
+ void **end = p + nr;
+ int allocated = 0;
+
+ while (end - p >= oo_objects(s->oo)) {
+ struct page *page = __new_slab(s, flags, NUMA_NO_NODE);
+ void *q = page_address(page);
+ int i;
+
+ /* Use all the objects */
+ for (i = 0; i < page->objects; i++) {
+ setup_object(s, page, q);
+ *p++ = q;
+ q += s->size;
+ }
+
+ page->inuse = page->objects;
+ page->freelist = NULL;
+ allocated += page->objects;
+ }
+ return allocated;
+}
+
+int slab_array_alloc_from_local(struct kmem_cache *s,
+ size_t nr, void **p)
+{
+ /* Go for the per cpu partials list first */
+ /* Use the cpu_slab if objects are still needed */
+ return 0;
+}
+
/*
- * Slow patch handling. This may still be called frequently since objects
+ * Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
*
* So we still attempt to reduce cache line usage. Just take the slab
@@ -2637,6 +2712,14 @@ slab_empty:
discard_slab(s, page);
}
+void kmem_cache_free_array(struct kmem_cache *s, size_t nr, void **p)
+{
+ void **end = p + nr;
+
+ for ( ; p < end; p++)
+		__slab_free(s, virt_to_head_page(*p), *p, 0);
+}
+
/*
* Fastpath with forced inlining to produce a kfree and kmem_cache_free that
* can perform fastpath freeing without additional function calls.
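A minimal sketch of how a caller might use the proposed array API (kmem_cache_alloc_array() is assumed to be the generic entry point added elsewhere in this series; only kmem_cache_free_array() and the slab_array_alloc_* helpers appear in the hunks above, and the cache and batch size below are purely illustrative):

	#include <linux/slab.h>

	#define MY_BATCH 32			/* illustrative batch size */

	static struct kmem_cache *my_cache;	/* illustrative cache, created elsewhere */

	static void my_batch_user(void)
	{
		void *objs[MY_BATCH];
		int got;

		/* May return fewer than requested; the caller has to cope. */
		got = kmem_cache_alloc_array(my_cache, GFP_KERNEL, MY_BATCH, objs);

		/* ... initialize and use objs[0..got-1] ... */

		/* Hand the whole batch back in one call. */
		kmem_cache_free_array(my_cache, got, objs);
	}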
On Tue, 10 Feb 2015 13:48:06 -0600 Christoph Lameter <[email protected]> wrote:
> The major portions are there but there is no support yet for
> directly allocating per cpu objects. There could also be more
> sophisticated code to exploit the batch freeing.
>
> Signed-off-by: Christoph Lameter <[email protected]>
>
[...]
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c
> +++ linux/mm/slub.c
[...]
> @@ -2516,8 +2521,78 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trac
> #endif
> #endif
>
> +int slab_array_alloc_from_partial(struct kmem_cache *s,
> + size_t nr, void **p)
> +{
> + void **end = p + nr;
> + struct kmem_cache_node *n = get_node(s, numa_mem_id());
> + int allocated = 0;
> + unsigned long flags;
> + struct page *page, *page2;
> +
> + if (!n->nr_partial)
> + return 0;
> +
> +
> + spin_lock_irqsave(&n->list_lock, flags);
This is quite an expensive lock with irqsave.
> + list_for_each_entry_safe(page, page2, &n->partial, lru) {
> + void *freelist;
> +
> + if (page->objects - page->inuse > end - p)
> + /* More objects free in page than we want */
> + break;
> + list_del(&page->lru);
> + slab_lock(page);
Yet another lock cost.
> + freelist = page->freelist;
> + page->inuse = page->objects;
> + page->freelist = NULL;
> + slab_unlock(page);
> + /* Grab all available objects */
> + while (freelist) {
> + *p++ = freelist;
> + freelist = get_freepointer(s, freelist);
> + allocated++;
> + }
> + }
> + spin_unlock_irqrestore(&n->list_lock, flags);
> + return allocated;
I estimate (on my CPU) the locking cost itself is more than 32ns, plus
the irqsave (which I've also found quite expensive, alone 14ns). Thus,
estimated 46ns. Single elem slub fast path cost is 18-19ns. Thus 3-4
elem bulking should be enough to amortize the cost, guess we are still
good :-)
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Sr. Network Kernel Developer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
On Wed, 11 Feb 2015, Jesper Dangaard Brouer wrote:
> > +
> > +
> > + spin_lock_irqsave(&n->list_lock, flags);
>
> This is quite an expensive lock with irqsave.
Yes but we take it for all partial pages.
> Yet another lock cost.
Yup the page access is shared but there is one per page. Contention is
unlikely.
> > + spin_unlock_irqrestore(&n->list_lock, flags);
> > + return allocated;
>
> I estimate (on my CPU) the locking cost itself is more than 32ns, plus
> the irqsave (which I've also found quite expensive, alone 14ns). Thus,
> estimated 46ns. Single elem slub fast path cost is 18-19ns. Thus 3-4
> elem bulking should be enough to amortize the cost, guess we are still
> good :-)
We can require that interrupts are off when the functions are called. Then
we can avoid the "save" part?
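As a sketch of that variant (the interrupts-already-off calling contract is the assumption here; everything else is unchanged from the patch, and the WARN is only there to document the contract):

	int slab_array_alloc_from_partial(struct kmem_cache *s, size_t nr, void **p)
	{
		void **end = p + nr;
		struct kmem_cache_node *n = get_node(s, numa_mem_id());
		int allocated = 0;
		struct page *page, *page2;

		/* Assumed contract: the caller has already disabled interrupts. */
		WARN_ON_ONCE(!irqs_disabled());

		if (!n->nr_partial)
			return 0;

		spin_lock(&n->list_lock);	/* no irqsave/irqrestore */
		list_for_each_entry_safe(page, page2, &n->partial, lru) {
			void *freelist;

			if (page->objects - page->inuse > end - p)
				break;	/* more objects free in page than we want */
			list_del(&page->lru);
			slab_lock(page);
			freelist = page->freelist;
			page->inuse = page->objects;
			page->freelist = NULL;
			slab_unlock(page);
			/* Grab all available objects */
			while (freelist) {
				*p++ = freelist;
				freelist = get_freepointer(s, freelist);
				allocated++;
			}
		}
		spin_unlock(&n->list_lock);
		return allocated;
	}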
On Wed, 11 Feb 2015 13:07:24 -0600 (CST)
Christoph Lameter <[email protected]> wrote:
> On Wed, 11 Feb 2015, Jesper Dangaard Brouer wrote:
>
> > > +
> > > +
> > > + spin_lock_irqsave(&n->list_lock, flags);
> >
> > This is quite an expensive lock with irqsave.
>
> Yes but we take it for all partial pages.
Sure, that is good, but this might be a contention point. In a micro
benchmark, this contention should be visible, but in real use-cases the
given subsystem also needs to spend time to use these elements before
requesting a new batch (as long as NIC cleanup cycles don't get too
synchronized).
> > Yet another lock cost.
>
> Yup the page access is shared but there is one per page. Contention is
> unlikely.
Yes, contention is unlikely, but every atomic operation is expensive.
On my system the measured cost is 8ns, and a lock/unlock does two, thus
16ns, which we then pay per page freelist.
> > > + spin_unlock_irqrestore(&n->list_lock, flags);
> > > + return allocated;
> >
> > I estimate (on my CPU) the locking cost itself is more than 32ns, plus
> > the irqsave (which I've also found quite expensive, alone 14ns). Thus,
> > estimated 46ns. Single elem slub fast path cost is 18-19ns. Thus 3-4
> > elem bulking should be enough to amortize the cost, guess we are still
> > good :-)
>
> We can require that interrupts are off when the functions are called. Then
> we can avoid the "save" part?
Yes, we could also do so with an "_irqoff" variant of the func call,
but given we are defining the API we can just require this from the
start.
I plan to use this in softirq, where I know interrupts are on, but I
can use the less-expensive "non-save" variant local_irq_{disable,enable}.
Measurements show (x86_64 E5-2695):
* 2.860 ns cost for local_irq_{disable,enable}
* 14.840 ns cost for local_irq_save()+local_irq_restore()
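A sketch of such a softirq-side caller (assuming the interrupts-off contract discussed above, and that slab_array_alloc_from_partial() from the patch is what ends up being called; the wrapper name and batch handling are made up):

	/* Softirq context: interrupts are known to be enabled here. */
	static int refill_batch(struct kmem_cache *cache, void **objs, size_t nr)
	{
		int got;

		local_irq_disable();	/* ~3 ns, versus ~15 ns for save/restore */
		got = slab_array_alloc_from_partial(cache, nr, objs);
		local_irq_enable();

		return got;
	}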
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Sr. Network Kernel Developer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
On Thu, 12 Feb 2015, Jesper Dangaard Brouer wrote:
> > > This is quite an expensive lock with irqsave.
> >
> > Yes but we take it for all partial pages.
>
> Sure, that is good, but this might be a contention point. In a micro
> benchmark, this contention should be visible, but in real use-cases the
> given subsystem also needs to spend time to use these elements before
> requesting a new batch (as long as NIC cleanup cycles don't get too
> synchronized).
Yes definitely it will be a contention point. There is no way around it.
> > Yup the page access is shared but there is one per page. Contention is
> > unlikely.
>
> Yes, contention is unlikely, but every atomic operation is expensive.
> On my system the measured cost is 8ns, and a lock/unlock does two, thus
> 16ns, which we then pay per page freelist.
Not sure what we can do about this.
> > We can require that interrupts are off when the functions are called. Then
> > we can avoid the "save" part?
>
> Yes, we could also do so with an "_irqoff" variant of the func call,
> but given we are defining the API we can just require this from the
> start.
Alright. Let's do that then.
On Wed, 11 Feb 2015 16:06:50 -0600 (CST)
Christoph Lameter <[email protected]> wrote:
> On Thu, 12 Feb 2015, Jesper Dangaard Brouer wrote:
>
> > > > This is quite an expensive lock with irqsave.
[...]
> > > We can require that interrupts are off when the functions are called. Then
> > > we can avoid the "save" part?
> >
> > Yes, we could also do so with an "_irqoff" variant of the func call,
> > but given we are defining the API we can just require this from the
> > start.
>
> Alright. Let's do that then.
Okay. Some measurements to guide this choice.
Measured on my laptop (i7-2620M CPU @ 2.70GHz):
* 12.775 ns - "clean" spin_lock_unlock
* 21.099 ns - irqsave variant spinlock
* 22.808 ns - "manual" irqsave before spin_lock
* 14.618 ns - "manual" local_irq_disable + spin_lock
Reproducible via my github repo:
https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench_sample.c
The clean spin_lock_unlock is 8.324 ns faster than the irqsave variant.
The irqsave variant is actually faster than expected, as the measurement
of an isolated local_irq_save_restore was 13.256 ns.
The difference to the "manual" irqsave is only 1.709 ns, which is approx
the cost of an extra function call.
If one can use the non-flags-save version of local_irq_disable, then one
can save 6.481 ns (on this specific CPU and kernel config 3.17.8-200.fc20.x86_64).
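For reference, a crude sketch of the kind of loop behind numbers like these (not the time_bench harness from the repo above; just N back-to-back lock/unlock pairs timed from module init and averaged):

	#include <linux/spinlock.h>
	#include <linux/ktime.h>
	#include <linux/printk.h>

	static DEFINE_SPINLOCK(bench_lock);

	static void bench_lock_variants(void)
	{
		const int loops = 1000000;
		unsigned long flags;
		u64 t0, t1;
		int i;

		t0 = ktime_get_ns();
		for (i = 0; i < loops; i++) {
			spin_lock(&bench_lock);
			spin_unlock(&bench_lock);
		}
		t1 = ktime_get_ns();
		pr_info("plain lock/unlock:   %llu ns/op\n",
			(unsigned long long)((t1 - t0) / loops));

		t0 = ktime_get_ns();
		for (i = 0; i < loops; i++) {
			spin_lock_irqsave(&bench_lock, flags);
			spin_unlock_irqrestore(&bench_lock, flags);
		}
		t1 = ktime_get_ns();
		pr_info("irqsave lock/unlock: %llu ns/op\n",
			(unsigned long long)((t1 - t0) / loops));
	}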
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Sr. Network Kernel Developer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer
https://github.com/netoptimizer/prototype-kernel/commit/1471ac60
On Thu, 12 Feb 2015, Jesper Dangaard Brouer wrote:
> Measured on my laptop (i7-2620M CPU @ 2.70GHz):
>
> * 12.775 ns - "clean" spin_lock_unlock
> * 21.099 ns - irqsave variant spinlock
> * 22.808 ns - "manual" irqsave before spin_lock
> * 14.618 ns - "manual" local_irq_disable + spin_lock
>
> Reproducible via my github repo:
> https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench_sample.c
>
> The clean spin_lock_unlock is 8.324 ns faster than the irqsave variant.
> The irqsave variant is actually faster than expected, as the measurement
> of an isolated local_irq_save_restore was 13.256 ns.
I am using spin_lock_irq() in the current version on my system. If the
performance of that is a problem then please optimize that function.
On Tue, Feb 10, 2015 at 01:48:06PM -0600, Christoph Lameter wrote:
> The major portions are there but there is no support yet for
> directly allocating per cpu objects. There could also be more
> sophisticated code to exploit the batch freeing.
>
> Signed-off-by: Christoph Lameter <[email protected]>
>
> Index: linux/include/linux/slub_def.h
> ===================================================================
> --- linux.orig/include/linux/slub_def.h
> +++ linux/include/linux/slub_def.h
> @@ -110,4 +110,5 @@ static inline void sysfs_slab_remove(str
> }
> #endif
>
> +#define _HAVE_SLAB_ALLOCATOR_ARRAY_OPERATIONS
> #endif /* _LINUX_SLUB_DEF_H */
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c
> +++ linux/mm/slub.c
> @@ -1379,13 +1379,9 @@ static void setup_object(struct kmem_cac
> s->ctor(object);
> }
>
> -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
> +static struct page *__new_slab(struct kmem_cache *s, gfp_t flags, int node)
> {
> struct page *page;
> - void *start;
> - void *p;
> - int order;
> - int idx;
>
> if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
> pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
> @@ -1394,33 +1390,42 @@ static struct page *new_slab(struct kmem
>
> page = allocate_slab(s,
> flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
> - if (!page)
> - goto out;
> + if (page) {
> + inc_slabs_node(s, page_to_nid(page), page->objects);
> + page->slab_cache = s;
> + __SetPageSlab(page);
> + if (page->pfmemalloc)
> + SetPageSlabPfmemalloc(page);
> + }
>
> - order = compound_order(page);
> - inc_slabs_node(s, page_to_nid(page), page->objects);
> - page->slab_cache = s;
> - __SetPageSlab(page);
> - if (page->pfmemalloc)
> - SetPageSlabPfmemalloc(page);
> -
> - start = page_address(page);
> -
> - if (unlikely(s->flags & SLAB_POISON))
> - memset(start, POISON_INUSE, PAGE_SIZE << order);
> -
> - for_each_object_idx(p, idx, s, start, page->objects) {
> - setup_object(s, page, p);
> - if (likely(idx < page->objects))
> - set_freepointer(s, p, p + s->size);
> - else
> - set_freepointer(s, p, NULL);
> - }
> -
> - page->freelist = start;
> - page->inuse = page->objects;
> - page->frozen = 1;
> -out:
> + return page;
> +}
> +
> +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
> +{
> + struct page *page = __new_slab(s, flags, node);
> +
> + if (page) {
> + void *p;
> + int idx;
> + void *start = page_address(page);
> +
> + if (unlikely(s->flags & SLAB_POISON))
> + memset(start, POISON_INUSE,
> + PAGE_SIZE << compound_order(page));
I'm not sure, but isn't this poisoning also needed for
slab_array_alloc_from_page_allocator()?
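For illustration, folding it in would look roughly like this (a sketch of the function from this patch with the same SLAB_POISON handling added; the NULL check on __new_slab() is an extra assumption, since the posted version does not have one either):

	int slab_array_alloc_from_page_allocator(struct kmem_cache *s,
			gfp_t flags, size_t nr, void **p)
	{
		void **end = p + nr;
		int allocated = 0;

		while (end - p >= oo_objects(s->oo)) {
			struct page *page = __new_slab(s, flags, NUMA_NO_NODE);
			void *q;
			int i;

			if (!page)	/* assumed; not handled in the posted version */
				break;

			q = page_address(page);

			/* Mirror the SLAB_POISON handling done in new_slab() above */
			if (unlikely(s->flags & SLAB_POISON))
				memset(q, POISON_INUSE,
				       PAGE_SIZE << compound_order(page));

			/* Use all the objects */
			for (i = 0; i < page->objects; i++) {
				setup_object(s, page, q);
				*p++ = q;
				q += s->size;
			}

			page->inuse = page->objects;
			page->freelist = NULL;
			allocated += page->objects;
		}
		return allocated;
	}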
> +
> + for_each_object_idx(p, idx, s, start, page->objects) {
> + setup_object(s, page, p);
> + if (likely(idx < page->objects))
> + set_freepointer(s, p, p + s->size);
> + else
> + set_freepointer(s, p, NULL);
> + }
> +
> + page->freelist = start;
> + page->inuse = page->objects;
> + page->frozen = 1;
> + }
> return page;
> }
>
> @@ -2516,8 +2521,78 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trac
> #endif
> #endif
>
> +int slab_array_alloc_from_partial(struct kmem_cache *s,
> + size_t nr, void **p)
> +{
> + void **end = p + nr;
> + struct kmem_cache_node *n = get_node(s, numa_mem_id());
> + int allocated = 0;
> + unsigned long flags;
> + struct page *page, *page2;
> +
> + if (!n->nr_partial)
> + return 0;
> +
> +
> + spin_lock_irqsave(&n->list_lock, flags);
> + list_for_each_entry_safe(page, page2, &n->partial, lru) {
> + void *freelist;
> +
> + if (page->objects - page->inuse > end - p)
> + /* More objects free in page than we want */
> + break;
> + list_del(&page->lru);
> + slab_lock(page);
slab_lock() doesn't protect the freelist if CONFIG_HAVE_CMPXCHG_DOUBLE is
enabled. You should use the cmpxchg_double_slab() machinery instead.
An even better solution would be to use acquire_slab() rather than
reimplementing the freelist detaching.
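For illustration, a detach helper modeled on the existing get_freelist() pattern in slub.c could look like this (sketch only; it assumes the page has already been taken off the partial list and that interrupts are disabled, so the __ variant is usable):

	static inline void *detach_freelist(struct kmem_cache *s, struct page *page)
	{
		struct page new;
		unsigned long counters;
		void *freelist;

		do {
			freelist = page->freelist;
			counters = page->counters;
			new.counters = counters;
			/* Account every object as allocated and take the whole list */
			new.inuse = page->objects;
		} while (!__cmpxchg_double_slab(s, page, freelist, counters,
						NULL, new.counters,
						"detach_freelist"));

		return freelist;
	}

The loop in the patch would then do freelist = detach_freelist(s, page) instead of the slab_lock()-protected stores.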
> + freelist = page->freelist;
> + page->inuse = page->objects;
> + page->freelist = NULL;
> + slab_unlock(page);
> + /* Grab all available objects */
> + while (freelist) {
> + *p++ = freelist;
> + freelist = get_freepointer(s, freelist);
> + allocated++;
> + }
Fetching all objects while holding the node lock could result in enormous
lock contention. How about getting the free object pointers without holding
the node lock? We can temporarily store the head of each freelist in
the array p and then fetch each object pointer without holding the node lock.
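For illustration, the expansion could then run after the lock is dropped, roughly like this (sketch only; a separate heads array is used here instead of stashing the heads in p itself, and the function name is made up):

	/*
	 * Walk previously detached freelists into the output array.  The
	 * pages were taken off the partial list and had their freelist set
	 * to NULL under n->list_lock, so nobody else can touch these
	 * objects here.
	 */
	static int expand_freelist_heads(struct kmem_cache *s, void **heads,
					 int nr_heads, void **p)
	{
		int allocated = 0;
		int i;

		for (i = 0; i < nr_heads; i++) {
			void *object = heads[i];

			while (object) {
				*p++ = object;
				object = get_freepointer(s, object);
				allocated++;
			}
		}
		return allocated;
	}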
Thanks.
On Fri, 13 Feb 2015, Joonsoo Kim wrote:
> > + *p++ = freelist;
> > + freelist = get_freepointer(s, freelist);
> > + allocated++;
> > + }
>
> Fetching all objects while holding the node lock could result in enormous
> lock contention. How about getting the free object pointers without holding
> the node lock? We can temporarily store the head of each freelist in
> the array p and then fetch each object pointer without holding the node lock.
Could do that, but let's first see if there is really an issue. The other
cpus sharing the same partial lists presumably have cpu-local objects to get
through first before they hit this lock.
On Fri, Feb 13, 2015 at 09:49:24AM -0600, Christoph Lameter wrote:
> On Fri, 13 Feb 2015, Joonsoo Kim wrote:
>
> > > + *p++ = freelist;
> > > + freelist = get_freepointer(s, freelist);
> > > + allocated++;
> > > + }
> >
> > Fetching all objects while holding the node lock could result in enormous
> > lock contention. How about getting the free object pointers without holding
> > the node lock? We can temporarily store the head of each freelist in
> > the array p and then fetch each object pointer without holding the node lock.
>
>
> Could do that, but let's first see if there is really an issue. The other
> cpus sharing the same partial lists presumably have cpu-local objects to get
> through first before they hit this lock.
Okay.
Thanks.