2003-11-29 16:21:51

by Manfred Spraul

Subject: [CFT] kmem_cache_alloc_node

// $Header$
// Kernel Version:
// VERSION = 2
// PATCHLEVEL = 6
// SUBLEVEL = 0
// EXTRAVERSION = -test11
--- 2.6/include/linux/slab.h 2003-10-25 20:44:55.000000000 +0200
+++ build-2.6/include/linux/slab.h 2003-11-29 15:42:59.000000000 +0100
@@ -62,6 +62,7 @@
extern int kmem_cache_destroy(kmem_cache_t *);
extern int kmem_cache_shrink(kmem_cache_t *);
extern void *kmem_cache_alloc(kmem_cache_t *, int);
+extern void *kmem_cache_alloc_node(kmem_cache_t *, int);
extern void kmem_cache_free(kmem_cache_t *, void *);
extern unsigned int kmem_cache_size(kmem_cache_t *);

--- 2.6/mm/slab.c 2003-11-29 09:46:35.000000000 +0100
+++ build-2.6/mm/slab.c 2003-11-29 17:11:46.000000000 +0100
@@ -578,12 +578,26 @@
}
}

-/*
- * Note: if someone calls kmem_cache_alloc() on the new
- * cpu before the cpuup callback had a chance to allocate
- * the head arrays, it will oops.
- * Is CPU_ONLINE early enough?
- */
+static struct array_cache *alloc_arraycache(int cpu, int entries, int batchcount)
+{
+ int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
+ struct array_cache *nc = NULL;
+
+ if (cpu != -1) {
+ nc = kmem_cache_alloc_node(kmem_find_general_cachep(memsize, GFP_KERNEL),
+ cpu_to_node(cpu));
+ }
+ if (!nc)
+ nc = kmalloc(memsize, GFP_KERNEL);
+ if (nc) {
+ nc->avail = 0;
+ nc->limit = entries;
+ nc->batchcount = batchcount;
+ nc->touched = 0;
+ }
+ return nc;
+}
+
static int __devinit cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
@@ -595,25 +609,18 @@
case CPU_UP_PREPARE:
down(&cache_chain_sem);
list_for_each(p, &cache_chain) {
- int memsize;
+ kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
struct array_cache *nc;

- kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
- memsize = sizeof(void*)*cachep->limit+sizeof(struct array_cache);
- nc = kmalloc(memsize, GFP_KERNEL);
+ nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
if (!nc)
goto bad;
- nc->avail = 0;
- nc->limit = cachep->limit;
- nc->batchcount = cachep->batchcount;
- nc->touched = 0;

spin_lock_irq(&cachep->spinlock);
cachep->array[cpu] = nc;
cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
+ cachep->num;
spin_unlock_irq(&cachep->spinlock);
-
}
up(&cache_chain_sem);
break;
@@ -800,24 +807,27 @@
* did not request dmaable memory, we might get it, but that
* would be relatively rare and ignorable.
*/
-static inline void *kmem_getpages(kmem_cache_t *cachep, unsigned long flags)
+static void *kmem_getpages(kmem_cache_t *cachep, int flags, int nodeid)
{
- void *addr;
+ struct page *page;

flags |= cachep->gfpflags;
- if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- atomic_add(1<<cachep->gfporder, &slab_reclaim_pages);
- addr = (void*)__get_free_pages(flags, cachep->gfporder);
- if (addr) {
+ page = alloc_pages_node(nodeid, flags, cachep->gfporder);
+ if (page) {
int i = (1 << cachep->gfporder);
- struct page *page = virt_to_page(addr);
+ void *ptr;

+ add_page_state(nr_slab, i);
+ if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+ atomic_add(i, &slab_reclaim_pages);
+ ptr = page_address(page);
while (i--) {
SetPageSlab(page);
page++;
}
+ return ptr;
}
- return addr;
+ return NULL;
}

/*
@@ -1539,6 +1549,21 @@
}
}

+static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
+{
+ int i;
+ struct page *page;
+
+ /* Nasty!!!!!! I hope this is OK. */
+ i = 1 << cachep->gfporder;
+ page = virt_to_page(objp);
+ do {
+ SET_PAGE_CACHE(page, cachep);
+ SET_PAGE_SLAB(page, slabp);
+ page++;
+ } while (--i);
+}
+
/*
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
@@ -1546,10 +1571,9 @@
static int cache_grow (kmem_cache_t * cachep, int flags)
{
struct slab *slabp;
- struct page *page;
void *objp;
size_t offset;
- unsigned int i, local_flags;
+ int local_flags;
unsigned long ctor_flags;

/* Be lazy and only check for valid flags here,
@@ -1593,25 +1617,15 @@
*/
kmem_flagcheck(cachep, flags);

-
/* Get mem for the objs. */
- if (!(objp = kmem_getpages(cachep, flags)))
+ if (!(objp = kmem_getpages(cachep, flags, numa_node_id())))
goto failed;

/* Get slab management. */
if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
goto opps1;

- /* Nasty!!!!!! I hope this is OK. */
- i = 1 << cachep->gfporder;
- page = virt_to_page(objp);
- do {
- SET_PAGE_CACHE(page, cachep);
- SET_PAGE_SLAB(page, slabp);
- inc_page_state(nr_slab);
- page++;
- } while (--i);
-
+ set_slab_attr(cachep, slabp, objp);
cache_init_objs(cachep, slabp, ctor_flags);

if (local_flags & __GFP_WAIT)
@@ -2074,6 +2088,80 @@
EXPORT_SYMBOL(kmem_cache_alloc);

/**
+ * kmem_cache_alloc_node - Allocate an object on the specified node
+ * @cachep: The cache to allocate from.
+ * @nodeid: node number of the target node.
+ *
+ * Identical to kmem_cache_alloc, except that this function is slow
+ * and can sleep, and it will allocate memory on the given node, which
+ * can improve performance for cpu-bound structures.
+ */
+void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid)
+{
+ size_t offset;
+ void *objp;
+ struct slab *slabp;
+ kmem_bufctl_t next;
+
+ /* The main algorithms are not node aware, thus we have to cheat:
+ * We bypass all caches and allocate a new slab.
+ * The following code is a streamlined copy of cache_grow().
+ */
+
+ /* Get colour for the slab, and update the next value. */
+ spin_lock_irq(&cachep->spinlock);
+ offset = cachep->colour_next;
+ cachep->colour_next++;
+ if (cachep->colour_next >= cachep->colour)
+ cachep->colour_next = 0;
+ offset *= cachep->colour_off;
+ spin_unlock_irq(&cachep->spinlock);
+
+ /* Get mem for the objs. */
+ if (!(objp = kmem_getpages(cachep, GFP_KERNEL, nodeid)))
+ goto failed;
+
+ /* Get slab management. */
+ if (!(slabp = alloc_slabmgmt(cachep, objp, offset, GFP_KERNEL)))
+ goto opps1;
+
+ set_slab_attr(cachep, slabp, objp);
+ cache_init_objs(cachep, slabp, SLAB_CTOR_CONSTRUCTOR);
+
+ /* The first object is ours: */
+ objp = slabp->s_mem + slabp->free*cachep->objsize;
+ slabp->inuse++;
+ next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+#endif
+ slabp->free = next;
+
+ /* add the remaining objects into the cache */
+ spin_lock_irq(&cachep->spinlock);
+ check_slabp(cachep, slabp);
+ STATS_INC_GROWN(cachep);
+ /* Make slab active. */
+ if (slabp->free == BUFCTL_END) {
+ list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_full));
+ } else {
+ list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_partial));
+ list3_data(cachep)->free_objects += cachep->num-1;
+ }
+ spin_unlock_irq(&cachep->spinlock);
+ objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp, __builtin_return_address(0));
+ return objp;
+
+opps1:
+ kmem_freepages(cachep, objp);
+failed:
+ return NULL;
+
+}
+
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+/**
* kmalloc - allocate memory
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
@@ -2267,7 +2357,6 @@
new->new[smp_processor_id()] = old;
}

-
static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount, int shared)
{
struct ccupdate_struct new;
@@ -2276,19 +2365,11 @@

memset(&new.new,0,sizeof(new.new));
for (i = 0; i < NR_CPUS; i++) {
- struct array_cache *ccnew;
-
- ccnew = kmalloc(sizeof(void*)*limit+
- sizeof(struct array_cache), GFP_KERNEL);
- if (!ccnew) {
+ new.new[i] = alloc_arraycache(i, limit, batchcount);
+ if (!new.new[i]) {
for (i--; i >= 0; i--) kfree(new.new[i]);
return -ENOMEM;
}
- ccnew->avail = 0;
- ccnew->limit = limit;
- ccnew->batchcount = batchcount;
- ccnew->touched = 0;
- new.new[i] = ccnew;
}
new.cachep = cachep;

@@ -2310,14 +2391,9 @@
spin_unlock_irq(&cachep->spinlock);
kfree(ccold);
}
- new_shared = kmalloc(sizeof(void*)*batchcount*shared+
- sizeof(struct array_cache), GFP_KERNEL);
+ new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
if (new_shared) {
struct array_cache *old;
- new_shared->avail = 0;
- new_shared->limit = batchcount*shared;
- new_shared->batchcount = 0xbaadf00d;
- new_shared->touched = 0;

spin_lock_irq(&cachep->spinlock);
old = cachep->lists.shared;
--- 2.6/include/linux/page-flags.h 2003-10-25 20:44:06.000000000 +0200
+++ build-2.6/include/linux/page-flags.h 2003-11-29 14:45:57.000000000 +0100
@@ -133,6 +133,7 @@

#define inc_page_state(member) mod_page_state(member, 1UL)
#define dec_page_state(member) mod_page_state(member, 0UL - 1)
+#define add_page_state(member,delta) mod_page_state(member, (delta))
#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))



Attachments:
patch-slab-nodelink (8.71 kB)
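
For illustration, here is a minimal caller sketch (the cache foo_cachep and
struct foo are hypothetical, not part of the patch). It mirrors the pattern
alloc_arraycache() uses above: try kmem_cache_alloc_node() to get memory on
the cpu's home node, and fall back to an ordinary allocation if that fails:

	/* Hypothetical example, not from the patch: allocate a per-cpu
	 * structure on the home node of the given cpu, falling back to
	 * any node if the node-local allocation fails or no cpu is given.
	 */
	static struct foo *foo_alloc_for_cpu(int cpu)
	{
		struct foo *p = NULL;

		if (cpu != -1)
			p = kmem_cache_alloc_node(foo_cachep, cpu_to_node(cpu));
		if (!p)
			p = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
		return p;
	}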

2003-11-29 17:13:29

by Zwane Mwaikambo

Subject: Re: [CFT] kmem_cache_alloc_node

On Sat, 29 Nov 2003, Manfred Spraul wrote:

> I've written a prototype kmem_cache_alloc_node function: I'm not yet
> convinced that it's really necessary to guarantee that kmalloc and
> kmem_cache_alloc are strictly node-local (adds noticeable costs to the
> hot paths, and many objects will be touched by multiple nodes anyway),
> but at least the cpu bound data structures should be allocated from the
> right node.
> The attached patch adds a simple kmem_cache_alloc_node function and
> moves the cpu local structures within slab onto the right node.
> One problem is the bootstrap: there is an
> alloc_pages_node(,,cpu_to_node()) during CPU_UP_PREPARE: Does that work,
> or is that too early?
> Please test it, I don't have access to suitable hardware.
>
> The patch also includes fixes for two accounting bugs in error paths,
> I'll send them separately to Andrew.

Hi Manfred,
I just booted this patch on a 4-quad NUMAQ successfully; this was
only a test boot, I'll get around to having a closer look a bit later on.

Thanks