2005-05-11 15:23:11

by Christoph Lameter

Subject: NUMA aware slab allocator V2

The NUMA API change that introduced kmalloc_node was accepted last week by
Linus. Now it is possible to do slab allocations on a node to localize
memory structures. This API was used by the pageset localization patch and
the block layer localization patch now in mm. The existing kmalloc_node is
slow since it simply searches through all pages of the slab cache to find a
page that is on the requested node. The two patches do a one-time allocation
of slab structures at initialization, so for them the speed of kmalloc_node
does not matter.
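
For illustration, a node-local allocation with this API looks roughly like
the following (a minimal sketch, not taken from either of those patches; the
struct and function names are made up):

#include <linux/slab.h>
#include <linux/topology.h>

struct node_stats {
        unsigned long hits;
        unsigned long misses;
};

/* Place the per-cpu bookkeeping on the node that cpu belongs to, so later
 * accesses from that cpu stay node-local. */
static struct node_stats *alloc_node_stats(int cpu)
{
        return kmalloc_node(sizeof(struct node_stats), GFP_KERNEL,
                            cpu_to_node(cpu));
}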

This patch allows kmalloc_node to be as fast as kmalloc by introducing
node-specific page lists for partial, free and full slabs. Slab allocation
on a NUMA system improves enough that we see a performance gain of about 5%
in AIM7 with this patch alone.
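
Condensed from the patch below, the new per-node bookkeeping boils down to
the following (only the fields relevant to the partial/free/full split are
shown; the real structures carry a few more members):

#include <linux/list.h>
#include <linux/spinlock.h>

struct array_cache;     /* per-cpu / per-node object cache, see mm/slab.c */

struct kmem_list3 {
        struct list_head slabs_partial; /* slabs with free and used objects */
        struct list_head slabs_full;    /* slabs with no free objects */
        struct list_head slabs_free;    /* completely unused slabs */
        unsigned long free_objects;
        unsigned int free_limit;
        spinlock_t list_lock;           /* protects only this node's lists */
        struct array_cache *shared;     /* per-node shared object cache */
        struct array_cache **alien;     /* wrong-node objects waiting to go
                                           back home (CONFIG_NUMA only) */
};

Each kmem_cache then carries one such structure per node
(struct kmem_list3 *nodelists[MAX_NUMNODES]), so an allocation targeted at a
node only takes that node's list_lock instead of the cache-wide spinlock.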

More NUMA localizations are possible if kmalloc_node operates
as fast as kmalloc.

Test run on a 32p system with 32G RAM.

w/o patch
Tasks jobs/min jti jobs/min/task real cpu
1 485.36 100 485.3640 11.99 1.91 Sat Apr 30 14:01:51 2005
100 26582.63 88 265.8263 21.89 144.96 Sat Apr 30 14:02:14 2005
200 29866.83 81 149.3342 38.97 286.08 Sat Apr 30 14:02:53 2005
300 33127.16 78 110.4239 52.71 426.54 Sat Apr 30 14:03:46 2005
400 34889.47 80 87.2237 66.72 568.90 Sat Apr 30 14:04:53 2005
500 35654.34 76 71.3087 81.62 714.55 Sat Apr 30 14:06:15 2005
600 36460.83 75 60.7681 95.77 853.42 Sat Apr 30 14:07:51 2005
700 35957.00 75 51.3671 113.30 990.67 Sat Apr 30 14:09:45 2005
800 33380.65 73 41.7258 139.48 1140.86 Sat Apr 30 14:12:05 2005
900 35095.01 76 38.9945 149.25 1281.30 Sat Apr 30 14:14:35 2005
1000 36094.37 74 36.0944 161.24 1419.66 Sat Apr 30 14:17:17 2005

w/patch
Tasks jobs/min jti jobs/min/task real cpu
1 484.27 100 484.2736 12.02 1.93 Sat Apr 30 15:59:45 2005
100 28262.03 90 282.6203 20.59 143.57 Sat Apr 30 16:00:06 2005
200 32246.45 82 161.2322 36.10 282.89 Sat Apr 30 16:00:42 2005
300 37945.80 83 126.4860 46.01 418.75 Sat Apr 30 16:01:28 2005
400 40000.69 81 100.0017 58.20 561.48 Sat Apr 30 16:02:27 2005
500 40976.10 78 81.9522 71.02 696.95 Sat Apr 30 16:03:38 2005
600 41121.54 78 68.5359 84.92 834.86 Sat Apr 30 16:05:04 2005
700 44052.77 78 62.9325 92.48 971.53 Sat Apr 30 16:06:37 2005
800 41066.89 79 51.3336 113.38 1111.15 Sat Apr 30 16:08:31 2005
900 38918.77 79 43.2431 134.59 1252.57 Sat Apr 30 16:10:46 2005
1000 41842.21 76 41.8422 139.09 1392.33 Sat Apr 30 16:13:05 2005

These measurements were taken directly after boot and show an improvement greater than 5%.
However, the performance improvements shrink if the AIM7 runs are repeated
and settle down at around 5%.

Link to earlier discussions:
http://marc.theaimsgroup.com/?t=111094594500003&r=1&w=2

Changelog:
- Batching for freeing of wrong-node objects (alien caches)
- Locking changes and NUMA #ifdefs as requested by Manfred

Signed-off-by: Alok N Kataria <[email protected]>
Signed-off-by: Shobhit Dayal <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.11/mm/slab.c
===================================================================
--- linux-2.6.11.orig/mm/slab.c 2005-04-30 11:41:28.000000000 -0700
+++ linux-2.6.11/mm/slab.c 2005-05-04 09:18:16.000000000 -0700
@@ -75,6 +75,13 @@
*
* At present, each engine can be growing a cache. This should be blocked.
*
+ * 15 March 2005. NUMA slab allocator.
+ * Shobhit Dayal <[email protected]>
+ * Alok N Kataria <[email protected]>
+ *
+ * Modified the slab allocator to be node aware on NUMA systems.
+ * Each node has its own list of partial, free and full slabs.
+ * All object allocations for a node occur from node specific slab lists.
*/

#include <linux/config.h>
@@ -92,7 +99,7 @@
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
-
+#include <linux/nodemask.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -210,6 +217,9 @@ struct slab {
void *s_mem; /* including colour offset */
unsigned int inuse; /* num of objs active in slab */
kmem_bufctl_t free;
+#ifdef CONFIG_NUMA
+ unsigned short nodeid;
+#endif
};

/*
@@ -252,6 +262,10 @@ struct array_cache {
unsigned int limit;
unsigned int batchcount;
unsigned int touched;
+#ifdef CONFIG_NUMA
+ spinlock_t lock;
+#endif
+ void *entry[];
};

/* bootstrap: The caches do not work without cpuarrays anymore,
@@ -275,24 +289,77 @@ struct kmem_list3 {
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long free_objects;
- int free_touched;
unsigned long next_reap;
+ int free_touched;
+ unsigned int free_limit;
+ spinlock_t list_lock;
struct array_cache *shared;
+#ifdef CONFIG_NUMA
+ struct array_cache **alien;
+#endif
};

+/*
+ * Need this for bootstrapping a per node allocator.
+ */
+#define NUM_INIT_LISTS 3
+struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+struct kmem_list3 __initdata kmem64_list3[MAX_NUMNODES];
+
+#ifdef CONFIG_NUMA
+
#define LIST3_INIT(parent) \
- { \
- .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \
- .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \
- .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \
- }
+ do { \
+ INIT_LIST_HEAD(&(parent)->slabs_full); \
+ INIT_LIST_HEAD(&(parent)->slabs_partial); \
+ INIT_LIST_HEAD(&(parent)->slabs_free); \
+ (parent)->shared = NULL; \
+ (parent)->alien = NULL; \
+ (parent)->list_lock = SPIN_LOCK_UNLOCKED; \
+ (parent)->free_objects = 0; \
+ (parent)->free_touched = 0; \
+ } while(0)
+#else
+
+#define LIST3_INIT(parent) \
+ do { \
+ INIT_LIST_HEAD(&(parent)->slabs_full); \
+ INIT_LIST_HEAD(&(parent)->slabs_partial); \
+ INIT_LIST_HEAD(&(parent)->slabs_free); \
+ (parent)->shared = NULL; \
+ (parent)->list_lock = SPIN_LOCK_UNLOCKED; \
+ (parent)->free_objects = 0; \
+ (parent)->free_touched = 0; \
+ } while(0)
+#endif
+
+#define MAKE_LIST(cachep, listp, slab, nodeid) \
+ do { \
+ INIT_LIST_HEAD(listp); \
+ list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
+ }while(0)
+
+#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
+ do { \
+ MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
+ MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
+ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
+ }while(0)
+
#define list3_data(cachep) \
- (&(cachep)->lists)
+ ((cachep->nodelists[numa_node_id()]))

/* NUMA: per-node */
#define list3_data_ptr(cachep, ptr) \
list3_data(cachep)

+#ifdef CONFIG_NUMA
+#define is_node_online(node) node_online(node)
+#else
+#define is_node_online(node) \
+ ({ BUG_ON(node != 0); 1; })
+#endif /* CONFIG_NUMA */
+
/*
* kmem_cache_t
*
@@ -304,13 +371,12 @@ struct kmem_cache_s {
struct array_cache *array[NR_CPUS];
unsigned int batchcount;
unsigned int limit;
-/* 2) touched by every alloc & free from the backend */
- struct kmem_list3 lists;
- /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */
+ unsigned int shared;
unsigned int objsize;
+/* 2) touched by every alloc & free from the backend */
+ struct kmem_list3 *nodelists[MAX_NUMNODES];
unsigned int flags; /* constant flags */
unsigned int num; /* # of objs per slab */
- unsigned int free_limit; /* upper limit of objects in the lists */
spinlock_t spinlock;

/* 3) cache_grow/shrink */
@@ -347,6 +413,7 @@ struct kmem_cache_s {
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
+ unsigned long node_frees;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
@@ -382,6 +449,7 @@ struct kmem_cache_s {
} while (0)
#define STATS_INC_ERR(x) ((x)->errors++)
#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
+#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
#define STATS_SET_FREEABLE(x, i) \
do { if ((x)->max_freeable < i) \
(x)->max_freeable = i; \
@@ -400,6 +468,7 @@ struct kmem_cache_s {
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
+#define STATS_INC_NODEFREES(x) do { } while (0)
#define STATS_SET_FREEABLE(x, i) \
do { } while (0)

@@ -532,9 +601,9 @@ static struct arraycache_init initarray_

/* internal cache of cache description objs */
static kmem_cache_t cache_cache = {
- .lists = LIST3_INIT(cache_cache.lists),
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
+ .shared = 1,
.objsize = sizeof(kmem_cache_t),
.flags = SLAB_NO_REAP,
.spinlock = SPIN_LOCK_UNLOCKED,
@@ -567,16 +636,20 @@ static enum {
FULL
} g_cpucache_up;

+static enum {
+ CACHE_CACHE,
+ SIZE_32,
+ SIZE_DMA_32,
+ SIZE_64,
+ ALL
+} cpucache_up_64;
+
static DEFINE_PER_CPU(struct work_struct, reap_work);

static void free_block(kmem_cache_t* cachep, void** objpp, int len);
static void enable_cpucache (kmem_cache_t *cachep);
static void cache_reap (void *unused);
-
-static inline void **ac_entry(struct array_cache *ac)
-{
- return (void**)(ac+1);
-}
+static int __node_shrink(kmem_cache_t *cachep, int node);

static inline struct array_cache *ac_data(kmem_cache_t *cachep)
{
@@ -678,42 +751,151 @@ static struct array_cache *alloc_arrayca
int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
struct array_cache *nc = NULL;

- if (cpu == -1)
- nc = kmalloc(memsize, GFP_KERNEL);
- else
- nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
-
+ nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
if (nc) {
nc->avail = 0;
nc->limit = entries;
nc->batchcount = batchcount;
nc->touched = 0;
+#ifdef CONFIG_NUMA
+ spin_lock_init(&nc->lock);
+#endif
}
return nc;
}
+#ifdef CONFIG_NUMA
+static inline struct array_cache **alloc_alien_cache(int cpu, int limit)
+{
+ struct array_cache **ac_ptr;
+ int memsize = sizeof(void*)*MAX_NUMNODES;
+ int node = cpu_to_node(cpu);
+ int i;
+
+ if (limit > 1)
+ limit = 12;
+ ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+ if(ac_ptr) {
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (i == node) {
+ ac_ptr[i] = NULL;
+ continue;
+ }
+ ac_ptr[i] = alloc_arraycache(cpu, limit, 0xbaadf00d);
+ if(!ac_ptr[i]) {
+ for(i--; i <=0; i--)
+ kfree(ac_ptr[i]);
+ kfree(ac_ptr);
+ return NULL;
+ }
+ }
+ }
+ return ac_ptr;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+ int i;
+
+ if(!ac_ptr)
+ return;
+ for (i = 0; i < MAX_NUMNODES; i++)
+ kfree(ac_ptr[i]);
+
+ kfree(ac_ptr);
+}
+
+static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
+{
+ struct kmem_list3 *rl3 = cachep->nodelists[node];
+
+ if(ac->avail) {
+ spin_lock(&rl3->list_lock);
+ free_block(cachep, ac->entry, ac->avail);
+ ac->avail = 0;
+ spin_unlock(&rl3->list_lock);
+ }
+}
+
+static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
+{
+ int i=0;
+ struct array_cache *ac;
+ unsigned long flags;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ ac = l3->alien[i];
+ if(ac) {
+ spin_lock_irqsave(&ac->lock, flags);
+ __drain_alien_cache(cachep, ac, i);
+ spin_unlock_irqrestore(&ac->lock, flags);
+ }
+ }
+}
+#endif

static int __devinit cpuup_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
kmem_cache_t* cachep;
+ struct kmem_list3 *l3 = NULL;
+ int node = cpu_to_node(cpu);
+ int memsize = sizeof(struct kmem_list3);
+ struct array_cache *nc = NULL;

switch (action) {
case CPU_UP_PREPARE:
down(&cache_chain_sem);
+ /* we need to do this right in the begining since
+ * alloc_arraycache's are going to use this list.
+ * kmalloc_node allows us to add the slab to the right
+ * kmem_list3 and not this cpu's kmem_list3
+ */
+
list_for_each_entry(cachep, &cache_chain, next) {
- struct array_cache *nc;
+ /* setup the size64 kmemlist for hcpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ if (!cachep->nodelists[node]) {
+ if(!(l3 = kmalloc_node(memsize,
+ GFP_KERNEL, node)))
+ goto bad;
+ LIST3_INIT(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;

- nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
+ cachep->nodelists[node] = l3;
+ }
+
+ spin_lock_irq(&cachep->nodelists[node]->list_lock);
+ cachep->nodelists[node]->free_limit =
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+ }
+
+ /* Now we can go ahead with allocating the shared array's
+ & array cache's */
+ list_for_each_entry(cachep, &cache_chain, next) {
+ nc = alloc_arraycache(cpu, cachep->limit,
+ cachep->batchcount);
if (!nc)
goto bad;
-
- spin_lock_irq(&cachep->spinlock);
cachep->array[cpu] = nc;
- cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
- + cachep->num;
- spin_unlock_irq(&cachep->spinlock);

+ l3 = cachep->nodelists[node];
+ BUG_ON(!l3);
+ if(!l3->shared) {
+ if(!(nc = alloc_arraycache(cpu,
+ cachep->shared*cachep->batchcount,
+ 0xbaadf00d)))
+ goto bad;
+
+ /* we are serialised from CPU_DEAD or
+ CPU_UP_CANCELLED by the cpucontrol lock */
+ l3->shared = nc;
+ }
}
up(&cache_chain_sem);
break;
@@ -728,13 +910,53 @@ static int __devinit cpuup_callback(stru

list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
+ cpumask_t mask;

+ mask = node_to_cpumask(node);
spin_lock_irq(&cachep->spinlock);
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
- cachep->free_limit -= cachep->batchcount;
- free_block(cachep, ac_entry(nc), nc->avail);
+ l3 = cachep->nodelists[node];
+
+ if(!l3)
+ goto unlock_cache;
+
+ spin_lock(&l3->list_lock);
+
+ /* Free limit for this kmem_list3 */
+ l3->free_limit -= cachep->batchcount;
+ if(nc)
+ free_block(cachep, nc->entry, nc->avail);
+
+ if(!cpus_empty(mask)) {
+ spin_unlock(&l3->list_lock);
+ goto unlock_cache;
+ }
+
+ if(l3->shared) {
+ free_block(cachep, l3->shared->entry,
+ l3->shared->avail);
+ kfree(l3->shared);
+ l3->shared = NULL;
+ }
+#ifdef CONFIG_NUMA
+ if(l3->alien) {
+ drain_alien_cache(cachep, l3);
+ free_alien_cache(l3->alien);
+ l3->alien = NULL;
+ }
+#endif
+
+ /* free slabs belonging to this node */
+ if(__node_shrink(cachep, node)) {
+ cachep->nodelists[node] = NULL;
+ spin_unlock(&l3->list_lock);
+ kfree(l3);
+ }
+ else
+ spin_unlock(&l3->list_lock);
+unlock_cache:
spin_unlock_irq(&cachep->spinlock);
kfree(nc);
}
@@ -750,6 +972,25 @@ bad:

static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };

+/*
+ * swap the static kmem_list3 with kmalloced memory
+ */
+static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
+ int nodeid)
+{
+ struct kmem_list3 *ptr;
+
+ BUG_ON((cachep->nodelists[nodeid]) != list);
+ ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+ BUG_ON(!ptr);
+
+ local_irq_disable();
+ memcpy(ptr, list, sizeof(struct kmem_list3));
+ MAKE_ALL_LISTS(cachep, ptr, nodeid);
+ cachep->nodelists[nodeid] = ptr;
+ local_irq_enable();
+}
+
/* Initialisation.
* Called after the gfp() functions have been enabled, and before smp_init().
*/
@@ -758,7 +999,15 @@ void __init kmem_cache_init(void)
size_t left_over;
struct cache_sizes *sizes;
struct cache_names *names;
+ int i;

+ for(i = 0; i < NUM_INIT_LISTS; i++)
+ LIST3_INIT(&initkmem_list3[i]);
+
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ LIST3_INIT(&kmem64_list3[i]);
+ cache_cache.nodelists[i] = NULL;
+ }
/*
* Fragmentation resistance on low memory - only use bigger
* page orders on machines with more than 32MB of memory.
@@ -766,21 +1015,24 @@ void __init kmem_cache_init(void)
if (num_physpages > (32 << 20) >> PAGE_SHIFT)
slab_break_gfp_order = BREAK_GFP_ORDER_HI;

-
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
* 1) initialize the cache_cache cache: it contains the kmem_cache_t
* structures of all caches, except cache_cache itself: cache_cache
* is statically allocated.
- * Initially an __init data area is used for the head array, it's
- * replaced with a kmalloc allocated array at the end of the bootstrap.
+ * Initially an __init data area is used for the head array and the
+ * kmem_list3 structures, it's replaced with a kmalloc allocated
+ * array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
- * The kmem_cache_t for the new cache is allocated normally. An __init
- * data area is used for the head array.
- * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
+ * The kmem_cache_t for the new cache is allocated normally.
+ * An __init data area is used for the head array.
+ * 3) Create the remaining kmalloc caches, with minimally sized
+ * head arrays.
* 4) Replace the __init data head arrays for cache_cache and the first
* kmalloc cache with kmalloc allocated arrays.
- * 5) Resize the head arrays of the kmalloc caches to their final sizes.
+ * 5) Replace the __init data for kmem_list3 for cache_cache and
+ * the other cache's with kmalloc allocated memory.
+ * 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/

/* 1) create the cache_cache */
@@ -789,6 +1041,7 @@ void __init kmem_cache_init(void)
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
+ cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];

cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());

@@ -833,24 +1086,54 @@ void __init kmem_cache_init(void)
/* 4) Replace the bootstrap head arrays */
{
void * ptr;
-
+
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
local_irq_disable();
BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
- memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
+ memcpy(ptr, ac_data(&cache_cache),
+ sizeof(struct arraycache_init));
cache_cache.array[smp_processor_id()] = ptr;
local_irq_enable();
-
+
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
local_irq_disable();
- BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
+ BUG_ON(ac_data(malloc_sizes[0].cs_cachep)
+ != &initarray_generic.cache);
memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
sizeof(struct arraycache_init));
malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
+ }
+ /* 5) Replace the bootstrap kmem_list3's */
+ {
+ int i, j;
+ for (i=0; malloc_sizes[i].cs_size &&
+ (malloc_sizes[i].cs_size < sizeof(struct kmem_list3));
+ i++);
+
+ BUG_ON(!malloc_sizes[i].cs_size);
+ /* Replace the static kmem_list3 structures for the boot cpu */
+ init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
+ numa_node_id());
+ if(i) {
+ init_list(malloc_sizes[0].cs_cachep,
+ &initkmem_list3[SIZE_32],
+ numa_node_id());
+ init_list(malloc_sizes[0].cs_dmacachep,
+ &initkmem_list3[SIZE_DMA_32],
+ numa_node_id());
+ }
+
+ for (j=0; j < MAX_NUMNODES; j++) {
+ if(is_node_online(j))
+ init_list(malloc_sizes[i].cs_cachep,
+ &kmem64_list3[j], j);
+ }
local_irq_enable();
}

- /* 5) resize the head arrays to their final sizes */
+ /* 6) resize the head arrays to their final sizes */
{
kmem_cache_t *cachep;
down(&cache_chain_sem);
@@ -866,7 +1149,6 @@ void __init kmem_cache_init(void)
* that initializes ac_data for all new cpus
*/
register_cpu_notifier(&cpucache_notifier);
-

/* The reap timers are started later, with a module init call:
* That part of the kernel is not yet operational.
@@ -1163,6 +1445,21 @@ static void slab_destroy (kmem_cache_t *
}
}

+/* For setting up all the kmem_list3s for cache whose objsize is same
+ as size of kmem_list3. */
+static inline void set_up_list3s(kmem_cache_t *cachep)
+{
+ int i;
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ if(is_node_online(i)) {
+ cachep->nodelists[i] = &kmem64_list3[i];
+ cachep->nodelists[i]->next_reap = jiffies +
+ REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ }
+ }
+}
+
/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1418,10 +1715,6 @@ next:
cachep->gfpflags |= GFP_DMA;
spin_lock_init(&cachep->spinlock);
cachep->objsize = size;
- /* NUMA */
- INIT_LIST_HEAD(&cachep->lists.slabs_full);
- INIT_LIST_HEAD(&cachep->lists.slabs_partial);
- INIT_LIST_HEAD(&cachep->lists.slabs_free);

if (flags & CFLGS_OFF_SLAB)
cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
@@ -1436,28 +1729,66 @@ next:
enable_cpucache(cachep);
} else {
if (g_cpucache_up == NONE) {
+ int i;
/* Note: the first kmem_cache_create must create
* the cache that's used by kmalloc(24), otherwise
* the creation of further caches will BUG().
*/
- cachep->array[smp_processor_id()] = &initarray_generic.cache;
+ cachep->array[smp_processor_id()] =
+ &initarray_generic.cache;
+
+ /* If the cache that's used by
+ * kmalloc(sizeof(kmem_list3)) is the first cache,
+ * then we need to set up all its list3s, otherwise
+ * the creation of further caches will BUG().
+ */
+ for (i=0; malloc_sizes[i].cs_size &&
+ (malloc_sizes[i].cs_size <
+ sizeof(struct kmem_list3)); i++);
+ if(i == 0) {
+ set_up_list3s(cachep);
+ cpucache_up_64 = ALL;
+ }
+ else {
+ cachep->nodelists[numa_node_id()] =
+ &initkmem_list3[SIZE_32];
+ cpucache_up_64 = SIZE_DMA_32;
+ }
+
g_cpucache_up = PARTIAL;
} else {
- cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
+ cachep->array[smp_processor_id()] =
+ kmalloc(sizeof(struct arraycache_init),
+ GFP_KERNEL);
+ if(cpucache_up_64 == SIZE_DMA_32) {
+ cachep->nodelists[numa_node_id()] =
+ &initkmem_list3[SIZE_DMA_32];
+ cpucache_up_64 = SIZE_64;
+ }
+ else if(cpucache_up_64 == SIZE_64) {
+ set_up_list3s(cachep);
+ cpucache_up_64 = ALL;
+ }
+ else {
+ cachep->nodelists[numa_node_id()] =
+ kmalloc(sizeof(struct kmem_list3),
+ GFP_KERNEL);
+ LIST3_INIT(cachep->nodelists[numa_node_id()]);
+ }
}
+ cachep->nodelists[numa_node_id()]->next_reap =
+ jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+
BUG_ON(!ac_data(cachep));
+ BUG_ON(!cachep->nodelists[numa_node_id()]);
ac_data(cachep)->avail = 0;
ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
ac_data(cachep)->batchcount = 1;
ac_data(cachep)->touched = 0;
cachep->batchcount = 1;
cachep->limit = BOOT_CPUCACHE_ENTRIES;
- cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
- + cachep->num;
- }
-
- cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ }

/* Need the semaphore to access the chain. */
down(&cache_chain_sem);
@@ -1515,13 +1846,23 @@ static void check_spinlock_acquired(kmem
{
#ifdef CONFIG_SMP
check_irq_off();
- BUG_ON(spin_trylock(&cachep->spinlock));
+ BUG_ON(spin_trylock(&list3_data(cachep)->list_lock));
#endif
}
+
+static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
+{
+#ifdef CONFIG_SMP
+ check_irq_off();
+ BUG_ON(spin_trylock(&(cachep->nodelists[node])->list_lock));
+#endif
+}
+
#else
#define check_irq_off() do { } while(0)
#define check_irq_on() do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
+#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif

/*
@@ -1543,7 +1884,7 @@ static void smp_call_function_all_cpus(v
}

static void drain_array_locked(kmem_cache_t* cachep,
- struct array_cache *ac, int force);
+ struct array_cache *ac, int force, int node);

static void do_drain(void *arg)
{
@@ -1552,59 +1893,84 @@ static void do_drain(void *arg)

check_irq_off();
ac = ac_data(cachep);
- spin_lock(&cachep->spinlock);
- free_block(cachep, &ac_entry(ac)[0], ac->avail);
- spin_unlock(&cachep->spinlock);
+ spin_lock(&list3_data(cachep)->list_lock);
+ free_block(cachep, ac->entry, ac->avail);
+ spin_unlock(&list3_data(cachep)->list_lock);
ac->avail = 0;
}

static void drain_cpu_caches(kmem_cache_t *cachep)
{
+ struct kmem_list3 *l3;
+ int i;
+
smp_call_function_all_cpus(do_drain, cachep);
check_irq_on();
spin_lock_irq(&cachep->spinlock);
- if (cachep->lists.shared)
- drain_array_locked(cachep, cachep->lists.shared, 1);
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ l3 = cachep->nodelists[i];
+ if (l3) {
+ spin_lock(&l3->list_lock);
+ drain_array_locked(cachep, l3->shared, 1, i);
+ spin_unlock(&l3->list_lock);
+#ifdef CONFIG_NUMA
+ if(l3->alien)
+ drain_alien_cache(cachep, l3);
+#endif
+ }
+ }
spin_unlock_irq(&cachep->spinlock);
}

-
-/* NUMA shrink all list3s */
-static int __cache_shrink(kmem_cache_t *cachep)
+static int __node_shrink(kmem_cache_t *cachep, int node)
{
struct slab *slabp;
+ struct kmem_list3 *l3 = cachep->nodelists[node];
int ret;

- drain_cpu_caches(cachep);
-
- check_irq_on();
- spin_lock_irq(&cachep->spinlock);
-
for(;;) {
struct list_head *p;

- p = cachep->lists.slabs_free.prev;
- if (p == &cachep->lists.slabs_free)
+ p = l3->slabs_free.prev;
+ if (p == &l3->slabs_free)
break;

- slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
+ slabp = list_entry(l3->slabs_free.prev, struct slab, list);
#if DEBUG
if (slabp->inuse)
BUG();
#endif
list_del(&slabp->list);

- cachep->lists.free_objects -= cachep->num;
- spin_unlock_irq(&cachep->spinlock);
+ l3->free_objects -= cachep->num;
+ spin_unlock_irq(&l3->list_lock);
slab_destroy(cachep, slabp);
- spin_lock_irq(&cachep->spinlock);
+ spin_lock_irq(&l3->list_lock);
}
- ret = !list_empty(&cachep->lists.slabs_full) ||
- !list_empty(&cachep->lists.slabs_partial);
- spin_unlock_irq(&cachep->spinlock);
+ ret = !list_empty(&l3->slabs_full) ||
+ !list_empty(&l3->slabs_partial);
return ret;
}

+static int __cache_shrink(kmem_cache_t *cachep)
+{
+ int ret = 0, i = 0;
+ struct kmem_list3 *l3;
+
+ drain_cpu_caches(cachep);
+
+ check_irq_on();
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ l3 = cachep->nodelists[i];
+ if(l3) {
+ spin_lock_irq(&l3->list_lock);
+ ret += __node_shrink(cachep, i);
+ spin_unlock_irq(&l3->list_lock);
+ }
+ }
+ return (ret ? 1 : 0);
+}
+
/**
* kmem_cache_shrink - Shrink a cache.
* @cachep: The cache to shrink.
@@ -1641,6 +2007,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
int kmem_cache_destroy(kmem_cache_t * cachep)
{
int i;
+ struct kmem_list3 *l3;

if (!cachep || in_interrupt())
BUG();
@@ -1675,8 +2042,15 @@ int kmem_cache_destroy(kmem_cache_t * ca
kfree(cachep->array[i]);

/* NUMA: free the list3 structures */
- kfree(cachep->lists.shared);
- cachep->lists.shared = NULL;
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ if((l3 = cachep->nodelists[i])) {
+ kfree(l3->shared);
+#ifdef CONFIG_NUMA
+ free_alien_cache(l3->alien);
+#endif
+ kfree(l3);
+ }
+ }
kmem_cache_free(&cache_cache, cachep);

unlock_cpu_hotplug();
@@ -1795,6 +2169,7 @@ static int cache_grow(kmem_cache_t *cach
size_t offset;
unsigned int local_flags;
unsigned long ctor_flags;
+ struct kmem_list3 *l3;

/* Be lazy and only check for valid flags here,
* keeping it out of the critical path in kmem_cache_alloc().
@@ -1826,6 +2201,7 @@ static int cache_grow(kmem_cache_t *cach

spin_unlock(&cachep->spinlock);

+ check_irq_off();
if (local_flags & __GFP_WAIT)
local_irq_enable();

@@ -1837,8 +2213,9 @@ static int cache_grow(kmem_cache_t *cach
*/
kmem_flagcheck(cachep, flags);

-
- /* Get mem for the objs. */
+ /* Get mem for the objs.
+ * Attempt to allocate a physical page from 'nodeid',
+ */
if (!(objp = kmem_getpages(cachep, flags, nodeid)))
goto failed;

@@ -1846,6 +2223,9 @@ static int cache_grow(kmem_cache_t *cach
if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
goto opps1;

+#ifdef CONFIG_NUMA
+ slabp->nodeid = nodeid;
+#endif
set_slab_attr(cachep, slabp, objp);

cache_init_objs(cachep, slabp, ctor_flags);
@@ -1853,13 +2233,14 @@ static int cache_grow(kmem_cache_t *cach
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
- spin_lock(&cachep->spinlock);
+ l3 = cachep->nodelists[nodeid];
+ spin_lock(&l3->list_lock);

/* Make slab active. */
- list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
+ list_add_tail(&slabp->list, &(l3->slabs_free));
STATS_INC_GROWN(cachep);
- list3_data(cachep)->free_objects += cachep->num;
- spin_unlock(&cachep->spinlock);
+ l3->free_objects += cachep->num;
+ spin_unlock(&l3->list_lock);
return 1;
opps1:
kmem_freepages(cachep, objp);
@@ -1965,7 +2346,6 @@ static void check_slabp(kmem_cache_t *ca
kmem_bufctl_t i;
int entries = 0;

- check_spinlock_acquired(cachep);
/* Check slab's freelist to see if this obj is there. */
for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
entries++;
@@ -2010,8 +2390,9 @@ retry:
}
l3 = list3_data(cachep);

- BUG_ON(ac->avail > 0);
- spin_lock(&cachep->spinlock);
+ BUG_ON(ac->avail > 0 || !l3);
+ spin_lock(&l3->list_lock);
+
if (l3->shared) {
struct array_cache *shared_array = l3->shared;
if (shared_array->avail) {
@@ -2019,8 +2400,9 @@ retry:
batchcount = shared_array->avail;
shared_array->avail -= batchcount;
ac->avail = batchcount;
- memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
- sizeof(void*)*batchcount);
+ memcpy(ac->entry,
+ &(shared_array->entry[shared_array->avail]),
+ sizeof(void*)*batchcount);
shared_array->touched = 1;
goto alloc_done;
}
@@ -2047,7 +2429,8 @@ retry:
STATS_SET_HIGH(cachep);

/* get obj pointer */
- ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
+ ac->entry[ac->avail++] = slabp->s_mem +
+ slabp->free*cachep->objsize;

slabp->inuse++;
next = slab_bufctl(slabp)[slabp->free];
@@ -2069,12 +2452,12 @@ retry:
must_grow:
l3->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&cachep->spinlock);
+ spin_unlock(&l3->list_lock);

if (unlikely(!ac->avail)) {
int x;
- x = cache_grow(cachep, flags, -1);
-
+ x = cache_grow(cachep, flags, numa_node_id());
+
// cache_grow can reenable interrupts, then ac could change.
ac = ac_data(cachep);
if (!x && ac->avail == 0) // no objects in sight? abort
@@ -2084,7 +2467,7 @@ alloc_done:
goto retry;
}
ac->touched = 1;
- return ac_entry(ac)[--ac->avail];
+ return ac->entry[--ac->avail];
}

static inline void
@@ -2156,7 +2539,7 @@ static inline void *__cache_alloc(kmem_c
if (likely(ac->avail)) {
STATS_INC_ALLOCHIT(cachep);
ac->touched = 1;
- objp = ac_entry(ac)[--ac->avail];
+ objp = ac->entry[--ac->avail];
} else {
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags);
@@ -2166,29 +2549,102 @@ static inline void *__cache_alloc(kmem_c
return objp;
}

-/*
- * NUMA: different approach needed if the spinlock is moved into
- * the l3 structure
+#ifdef CONFIG_NUMA
+/*
+ * A interface to enable slab creation on nodeid
*/
+static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
+{
+ struct list_head *entry;
+ struct slab *slabp;
+ struct kmem_list3 *l3;
+ void *obj;
+ kmem_bufctl_t next;
+ int x;
+
+ l3 = cachep->nodelists[nodeid];
+ BUG_ON(!l3);
+
+retry:
+ spin_lock(&l3->list_lock);
+ entry = l3->slabs_partial.next;
+ if (entry == &l3->slabs_partial) {
+ l3->free_touched = 1;
+ entry = l3->slabs_free.next;
+ if (entry == &l3->slabs_free)
+ goto must_grow;
+ }
+
+ slabp = list_entry(entry, struct slab, list);
+ check_spinlock_acquired_node(cachep, nodeid);
+ check_slabp(cachep, slabp);
+
+ STATS_INC_NODEALLOCS(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);

+ BUG_ON(slabp->inuse == cachep->num);
+
+ /* get obj pointer */
+ obj = slabp->s_mem + slabp->free*cachep->objsize;
+ slabp->inuse++;
+ next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+#endif
+ slabp->free = next;
+ check_slabp(cachep, slabp);
+ l3->free_objects--;
+ /* move slabp to correct slabp list: */
+ list_del(&slabp->list);
+
+ if (slabp->free == BUFCTL_END) {
+ list_add(&slabp->list, &l3->slabs_full);
+ }
+ else {
+ list_add(&slabp->list, &l3->slabs_partial);
+ }
+
+ spin_unlock(&l3->list_lock);
+ goto done;
+
+must_grow:
+ spin_unlock(&l3->list_lock);
+ x = cache_grow(cachep, flags, nodeid);
+
+ if (!x)
+ return NULL;
+
+ goto retry;
+done:
+ return obj;
+}
+#endif
+
+/*
+ * Caller needs to acquire correct kmem_list's list_lock
+ */
static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
{
int i;
-
- check_spinlock_acquired(cachep);
-
- /* NUMA: move add into loop */
- cachep->lists.free_objects += nr_objects;
+ struct kmem_list3 *l3;

for (i = 0; i < nr_objects; i++) {
void *objp = objpp[i];
struct slab *slabp;
unsigned int objnr;
+ int nodeid = 0;

slabp = GET_PAGE_SLAB(virt_to_page(objp));
+#ifdef CONFIG_NUMA
+ nodeid = slabp->nodeid;
+#endif
+ l3 = cachep->nodelists[nodeid];
list_del(&slabp->list);
objnr = (objp - slabp->s_mem) / cachep->objsize;
+ check_spinlock_acquired_node(cachep, nodeid);
check_slabp(cachep, slabp);
+
#if DEBUG
if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
@@ -2200,24 +2656,23 @@ static void free_block(kmem_cache_t *cac
slabp->free = objnr;
STATS_DEC_ACTIVE(cachep);
slabp->inuse--;
+ l3->free_objects++;
check_slabp(cachep, slabp);

/* fixup slab chains */
if (slabp->inuse == 0) {
- if (cachep->lists.free_objects > cachep->free_limit) {
- cachep->lists.free_objects -= cachep->num;
+ if (l3->free_objects > l3->free_limit) {
+ l3->free_objects -= cachep->num;
slab_destroy(cachep, slabp);
} else {
- list_add(&slabp->list,
- &list3_data_ptr(cachep, objp)->slabs_free);
+ list_add(&slabp->list, &l3->slabs_free);
}
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&slabp->list,
- &list3_data_ptr(cachep, objp)->slabs_partial);
+ list_add_tail(&slabp->list, &l3->slabs_partial);
}
}
}
@@ -2225,36 +2680,38 @@ static void free_block(kmem_cache_t *cac
static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
{
int batchcount;
+ struct kmem_list3 *l3;

batchcount = ac->batchcount;
#if DEBUG
BUG_ON(!batchcount || batchcount > ac->avail);
#endif
check_irq_off();
- spin_lock(&cachep->spinlock);
- if (cachep->lists.shared) {
- struct array_cache *shared_array = cachep->lists.shared;
+ l3 = list3_data(cachep);
+ spin_lock(&l3->list_lock);
+ if (l3->shared) {
+ struct array_cache *shared_array = l3->shared;
int max = shared_array->limit-shared_array->avail;
if (max) {
if (batchcount > max)
batchcount = max;
- memcpy(&ac_entry(shared_array)[shared_array->avail],
- &ac_entry(ac)[0],
+ memcpy(&(shared_array->entry[shared_array->avail]),
+ ac->entry,
sizeof(void*)*batchcount);
shared_array->avail += batchcount;
goto free_done;
}
}

- free_block(cachep, &ac_entry(ac)[0], batchcount);
+ free_block(cachep, ac->entry, batchcount);
free_done:
#if STATS
{
int i = 0;
struct list_head *p;

- p = list3_data(cachep)->slabs_free.next;
- while (p != &(list3_data(cachep)->slabs_free)) {
+ p = l3->slabs_free.next;
+ while (p != &(l3->slabs_free)) {
struct slab *slabp;

slabp = list_entry(p, struct slab, list);
@@ -2266,12 +2723,13 @@ free_done:
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&cachep->spinlock);
+ spin_unlock(&l3->list_lock);
ac->avail -= batchcount;
- memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
+ memmove(ac->entry, &(ac->entry[batchcount]),
sizeof(void*)*ac->avail);
}

+
/*
* __cache_free
* Release an obj back to its cache. If the obj has a constructed
@@ -2286,14 +2744,47 @@ static inline void __cache_free(kmem_cac
check_irq_off();
objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));

+ /* Make sure we are not freeing a object from another
+ * node to the array cache on this cpu.
+ */
+#ifdef CONFIG_NUMA
+ {
+ struct slab *slabp;
+ slabp = GET_PAGE_SLAB(virt_to_page(objp));
+ if(unlikely(slabp->nodeid != numa_node_id())) {
+ struct array_cache *alien = NULL;
+ int nodeid = slabp->nodeid;
+ struct kmem_list3 *l3 = list3_data(cachep);
+
+ STATS_INC_NODEFREES(cachep);
+ if(l3->alien && l3->alien[nodeid]) {
+ alien = l3->alien[nodeid];
+ spin_lock(&alien->lock);
+ if(unlikely(alien->avail == alien->limit))
+ __drain_alien_cache(cachep,
+ alien, nodeid);
+ alien->entry[alien->avail++] = objp;
+ spin_unlock(&alien->lock);
+ }
+ else {
+ spin_lock(&(cachep->nodelists[nodeid])->
+ list_lock);
+ free_block(cachep, &objp, 1);
+ spin_unlock(&(cachep->nodelists[nodeid])->
+ list_lock);
+ }
+ return;
+ }
+ }
+#endif
if (likely(ac->avail < ac->limit)) {
STATS_INC_FREEHIT(cachep);
- ac_entry(ac)[ac->avail++] = objp;
+ ac->entry[ac->avail++] = objp;
return;
} else {
STATS_INC_FREEMISS(cachep);
cache_flusharray(cachep, ac);
- ac_entry(ac)[ac->avail++] = objp;
+ ac->entry[ac->avail++] = objp;
}
}

@@ -2363,78 +2854,24 @@ out:
* Identical to kmem_cache_alloc, except that this function is slow
* and can sleep. And it will allocate memory on the given node, which
* can improve the performance for cpu bound structures.
+ * New and improved: it will now make sure that the object gets
+ * put on the correct node list so that there is no false sharing.
*/
void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
{
- int loop;
- void *objp;
- struct slab *slabp;
- kmem_bufctl_t next;
-
- for (loop = 0;;loop++) {
- struct list_head *q;
-
- objp = NULL;
- check_irq_on();
- spin_lock_irq(&cachep->spinlock);
- /* walk through all partial and empty slab and find one
- * from the right node */
- list_for_each(q,&cachep->lists.slabs_partial) {
- slabp = list_entry(q, struct slab, list);
-
- if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
- loop > 2)
- goto got_slabp;
- }
- list_for_each(q, &cachep->lists.slabs_free) {
- slabp = list_entry(q, struct slab, list);
-
- if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
- loop > 2)
- goto got_slabp;
- }
- spin_unlock_irq(&cachep->spinlock);
-
- local_irq_disable();
- if (!cache_grow(cachep, flags, nodeid)) {
- local_irq_enable();
- return NULL;
- }
- local_irq_enable();
- }
-got_slabp:
- /* found one: allocate object */
- check_slabp(cachep, slabp);
- check_spinlock_acquired(cachep);
-
- STATS_INC_ALLOCED(cachep);
- STATS_INC_ACTIVE(cachep);
- STATS_SET_HIGH(cachep);
- STATS_INC_NODEALLOCS(cachep);
-
- objp = slabp->s_mem + slabp->free*cachep->objsize;
-
- slabp->inuse++;
- next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
- slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
-#endif
- slabp->free = next;
- check_slabp(cachep, slabp);
+ unsigned long save_flags;
+ void *ptr;

- /* move slabp to correct slabp list: */
- list_del(&slabp->list);
- if (slabp->free == BUFCTL_END)
- list_add(&slabp->list, &cachep->lists.slabs_full);
- else
- list_add(&slabp->list, &cachep->lists.slabs_partial);
+ if(nodeid == numa_node_id() || nodeid == -1)
+ return __cache_alloc(cachep, flags);

- list3_data(cachep)->free_objects--;
- spin_unlock_irq(&cachep->spinlock);
+ cache_alloc_debugcheck_before(cachep, flags);
+ local_irq_save(save_flags);
+ ptr = __cache_alloc_node(cachep, flags, nodeid);
+ local_irq_restore(save_flags);
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));

- objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
- __builtin_return_address(0));
- return objp;
+ return ptr;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);

@@ -2620,6 +3057,81 @@ unsigned int kmem_cache_size(kmem_cache_
}
EXPORT_SYMBOL(kmem_cache_size);

+/*
+ * This initializes kmem_list3 for all nodes.
+ */
+static int alloc_kmemlist(kmem_cache_t *cachep)
+{
+ int node, i;
+ struct kmem_list3 *l3;
+ int err = 0;
+
+ for(i=0; i < NR_CPUS; i++) {
+ if(cpu_online(i)) {
+ struct array_cache *nc = NULL, *new;
+#ifdef CONFIG_NUMA
+ struct array_cache **new_alien = NULL;
+#endif
+ node = cpu_to_node(i);
+#ifdef CONFIG_NUMA
+ if(!(new_alien = alloc_alien_cache(i, cachep->limit)))
+ goto fail;
+#endif
+ if(!(new = alloc_arraycache(i, (cachep->shared*
+ cachep->batchcount), 0xbaadf00d)))
+ goto fail;
+ if((l3 = cachep->nodelists[node])) {
+
+ spin_lock_irq(&l3->list_lock);
+
+ if((nc = cachep->nodelists[node]->shared))
+ free_block(cachep, nc->entry,
+ nc->avail);
+
+ l3->shared = new;
+#ifdef CONFIG_NUMA
+ if(!cachep->nodelists[node]->alien) {
+ l3->alien = new_alien;
+ new_alien = NULL;
+ }
+ l3->free_limit = (1 + nr_cpus_node(node))*
+ cachep->batchcount + cachep->num;
+#else
+ l3->free_limit = (1 + num_online_cpus())*
+ cachep->batchcount + cachep->num;
+#endif
+ spin_unlock_irq(&l3->list_lock);
+ kfree(nc);
+#ifdef CONFIG_NUMA
+ free_alien_cache(new_alien);
+#endif
+ continue;
+ }
+ if(!(l3 = kmalloc_node(sizeof(struct kmem_list3),
+ GFP_KERNEL, node)))
+ goto fail;
+
+ LIST3_INIT(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ l3->shared = new;
+#ifdef CONFIG_NUMA
+ l3->alien = new_alien;
+ l3->free_limit = (1 + nr_cpus_node(node))*
+ cachep->batchcount + cachep->num;
+#else
+ l3->free_limit = (1 + num_online_cpus())*
+ cachep->batchcount + cachep->num;
+#endif
+ cachep->nodelists[node] = l3;
+ }
+ }
+ return err;
+fail:
+ err = -ENOMEM;
+ return err;
+}
+
struct ccupdate_struct {
kmem_cache_t *cachep;
struct array_cache *new[NR_CPUS];
@@ -2642,8 +3154,7 @@ static int do_tune_cpucache(kmem_cache_t
int shared)
{
struct ccupdate_struct new;
- struct array_cache *new_shared;
- int i;
+ int i, err;

memset(&new.new,0,sizeof(new.new));
for (i = 0; i < NR_CPUS; i++) {
@@ -2660,36 +3171,30 @@ static int do_tune_cpucache(kmem_cache_t
new.cachep = cachep;

smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
-
+
check_irq_on();
spin_lock_irq(&cachep->spinlock);
cachep->batchcount = batchcount;
cachep->limit = limit;
- cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
+ cachep->shared = shared;
spin_unlock_irq(&cachep->spinlock);

for (i = 0; i < NR_CPUS; i++) {
struct array_cache *ccold = new.new[i];
if (!ccold)
continue;
- spin_lock_irq(&cachep->spinlock);
- free_block(cachep, ac_entry(ccold), ccold->avail);
- spin_unlock_irq(&cachep->spinlock);
+ spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+ free_block(cachep, ccold->entry, ccold->avail);
+ spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
kfree(ccold);
}
- new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
- if (new_shared) {
- struct array_cache *old;
-
- spin_lock_irq(&cachep->spinlock);
- old = cachep->lists.shared;
- cachep->lists.shared = new_shared;
- if (old)
- free_block(cachep, ac_entry(old), old->avail);
- spin_unlock_irq(&cachep->spinlock);
- kfree(old);
- }

+ err = alloc_kmemlist(cachep);
+ if (err) {
+ printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
+ cachep->name, -err);
+ BUG();
+ }
return 0;
}

@@ -2747,11 +3252,11 @@ static void enable_cpucache(kmem_cache_t
}

static void drain_array_locked(kmem_cache_t *cachep,
- struct array_cache *ac, int force)
+ struct array_cache *ac, int force, int node)
{
int tofree;

- check_spinlock_acquired(cachep);
+ check_spinlock_acquired_node(cachep, node);
if (ac->touched && !force) {
ac->touched = 0;
} else if (ac->avail) {
@@ -2759,9 +3264,9 @@ static void drain_array_locked(kmem_cach
if (tofree > ac->avail) {
tofree = (ac->avail+1)/2;
}
- free_block(cachep, ac_entry(ac), tofree);
+ free_block(cachep, ac->entry, tofree);
ac->avail -= tofree;
- memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
+ memmove(ac->entry, &(ac->entry[tofree]),
sizeof(void*)*ac->avail);
}
}
@@ -2780,6 +3285,7 @@ static void drain_array_locked(kmem_cach
static void cache_reap(void *unused)
{
struct list_head *walk;
+ struct kmem_list3 *l3;

if (down_trylock(&cache_chain_sem)) {
/* Give up. Setup the next iteration. */
@@ -2800,27 +3306,35 @@ static void cache_reap(void *unused)

check_irq_on();

- spin_lock_irq(&searchp->spinlock);
+ l3 = list3_data(searchp);
+#ifdef CONFIG_NUMA
+ if(l3->alien)
+ drain_alien_cache(searchp, l3);
+#endif
+
+ spin_lock_irq(&l3->list_lock);

- drain_array_locked(searchp, ac_data(searchp), 0);
+ drain_array_locked(searchp, ac_data(searchp), 0,
+ numa_node_id());

- if(time_after(searchp->lists.next_reap, jiffies))
+ if(time_after(l3->next_reap, jiffies))
goto next_unlock;

- searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3;

- if (searchp->lists.shared)
- drain_array_locked(searchp, searchp->lists.shared, 0);
+ if (l3->shared)
+ drain_array_locked(searchp, l3->shared, 0,
+ numa_node_id());

- if (searchp->lists.free_touched) {
- searchp->lists.free_touched = 0;
+ if (l3->free_touched) {
+ l3->free_touched = 0;
goto next_unlock;
}

- tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
+ tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
do {
- p = list3_data(searchp)->slabs_free.next;
- if (p == &(list3_data(searchp)->slabs_free))
+ p = l3->slabs_free.next;
+ if (p == &(l3->slabs_free))
break;

slabp = list_entry(p, struct slab, list);
@@ -2833,13 +3347,13 @@ static void cache_reap(void *unused)
* searchp cannot disappear, we hold
* cache_chain_lock
*/
- searchp->lists.free_objects -= searchp->num;
- spin_unlock_irq(&searchp->spinlock);
+ l3->free_objects -= searchp->num;
+ spin_unlock_irq(&l3->list_lock);
slab_destroy(searchp, slabp);
- spin_lock_irq(&searchp->spinlock);
+ spin_lock_irq(&l3->list_lock);
} while(--tofree > 0);
next_unlock:
- spin_unlock_irq(&searchp->spinlock);
+ spin_unlock_irq(&l3->list_lock);
next:
cond_resched();
}
@@ -2872,7 +3386,7 @@ static void *s_start(struct seq_file *m,
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#if STATS
seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
- " <error> <maxfreeable> <freelimit> <nodeallocs>");
+ " <error> <maxfreeable> <nodeallocs> <remotefrees>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
@@ -2907,39 +3421,53 @@ static int s_show(struct seq_file *m, vo
unsigned long active_objs;
unsigned long num_objs;
unsigned long active_slabs = 0;
- unsigned long num_slabs;
- const char *name;
+ unsigned long num_slabs, free_objects = 0, shared_avail = 0;
+ const char *name;
char *error = NULL;
+ int i;
+ struct kmem_list3 *l3;

check_irq_on();
spin_lock_irq(&cachep->spinlock);
active_objs = 0;
num_slabs = 0;
- list_for_each(q,&cachep->lists.slabs_full) {
- slabp = list_entry(q, struct slab, list);
- if (slabp->inuse != cachep->num && !error)
- error = "slabs_full accounting error";
- active_objs += cachep->num;
- active_slabs++;
- }
- list_for_each(q,&cachep->lists.slabs_partial) {
- slabp = list_entry(q, struct slab, list);
- if (slabp->inuse == cachep->num && !error)
- error = "slabs_partial inuse accounting error";
- if (!slabp->inuse && !error)
- error = "slabs_partial/inuse accounting error";
- active_objs += slabp->inuse;
- active_slabs++;
- }
- list_for_each(q,&cachep->lists.slabs_free) {
- slabp = list_entry(q, struct slab, list);
- if (slabp->inuse && !error)
- error = "slabs_free/inuse accounting error";
- num_slabs++;
+ for( i=0; i<MAX_NUMNODES; i++) {
+ l3 = cachep->nodelists[i];
+ if(!l3 || !is_node_online(i))
+ continue;
+
+ spin_lock(&l3->list_lock);
+
+ list_for_each(q,&l3->slabs_full) {
+ slabp = list_entry(q, struct slab, list);
+ if (slabp->inuse != cachep->num && !error)
+ error = "slabs_full accounting error";
+ active_objs += cachep->num;
+ active_slabs++;
+ }
+ list_for_each(q,&l3->slabs_partial) {
+ slabp = list_entry(q, struct slab, list);
+ if (slabp->inuse == cachep->num && !error)
+ error = "slabs_partial inuse accounting error";
+ if (!slabp->inuse && !error)
+ error = "slabs_partial/inuse accounting error";
+ active_objs += slabp->inuse;
+ active_slabs++;
+ }
+ list_for_each(q,&l3->slabs_free) {
+ slabp = list_entry(q, struct slab, list);
+ if (slabp->inuse && !error)
+ error = "slabs_free/inuse accounting error";
+ num_slabs++;
+ }
+ free_objects += l3->free_objects;
+ shared_avail += l3->shared->avail;
+
+ spin_unlock(&l3->list_lock);
}
num_slabs+=active_slabs;
num_objs = num_slabs*cachep->num;
- if (num_objs - active_objs != cachep->lists.free_objects && !error)
+ if (num_objs - active_objs != free_objects && !error)
error = "free_objects accounting error";

name = cachep->name;
@@ -2951,9 +3479,9 @@ static int s_show(struct seq_file *m, vo
cachep->num, (1<<cachep->gfporder));
seq_printf(m, " : tunables %4u %4u %4u",
cachep->limit, cachep->batchcount,
- cachep->lists.shared->limit/cachep->batchcount);
- seq_printf(m, " : slabdata %6lu %6lu %6u",
- active_slabs, num_slabs, cachep->lists.shared->avail);
+ cachep->shared);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu",
+ active_slabs, num_slabs, shared_avail);
#if STATS
{ /* list3 stats */
unsigned long high = cachep->high_mark;
@@ -2962,12 +3490,13 @@ static int s_show(struct seq_file *m, vo
unsigned long reaped = cachep->reaped;
unsigned long errors = cachep->errors;
unsigned long max_freeable = cachep->max_freeable;
- unsigned long free_limit = cachep->free_limit;
unsigned long node_allocs = cachep->node_allocs;
+ unsigned long node_frees = cachep->node_frees;

- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
- allocs, high, grown, reaped, errors,
- max_freeable, free_limit, node_allocs);
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
+ %4lu %4lu %4lu %4lu",
+ allocs, high, grown, reaped, errors,
+ max_freeable, node_allocs, node_frees);
}
/* cpu stats */
{
@@ -3048,7 +3577,8 @@ ssize_t slabinfo_write(struct file *file
shared < 0) {
res = -EINVAL;
} else {
- res = do_tune_cpucache(cachep, limit, batchcount, shared);
+ res = do_tune_cpucache(cachep, limit,
+ batchcount, shared);
}
break;
}


2005-05-11 15:49:03

by Jack Steiner

Subject: Re: NUMA aware slab allocator V2

On Wed, May 11, 2005 at 08:17:08AM -0700, Christoph Lameter wrote:
> The NUMA API change that introduced kmalloc_node was accepted last week by
> Linus. Now it is possible to do slab allocations on a node to localize
> memory structures. This API was used by the pageset localization patch and
> the block layer localization patch now in mm. The existing kmalloc_node is
> slow since it simply searches through all pages of the slab to find a page
> that is on the node requested. The two patches do a one time allocation of
> slab structures at initialization and therefore the speed of kmalloc node
> does not matter.

Christoph -

The results look good. One suggestion though. When you make a series of AIM7
runs, include more points at the low end of the curve. We need to verify
that changes don't cause regressions for small numbers of users.
Even better, if a change helps the low end, that is important to know, too.


> +
> + if(ac->avail) {
> + spin_lock(&rl3->list_lock);
> + free_block(cachep, ac->entry, ac->avail);
> + ac->avail = 0;
> + spin_unlock(&rl3->list_lock);
> + }
> +}
> +
> +static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
> +{
> + int i=0;
> + struct array_cache *ac;
> + unsigned long flags;
> +
> + for (i = 0; i < MAX_NUMNODES; i++) {
> + ac = l3->alien[i];
> + if(ac) {
> + spin_lock_irqsave(&ac->lock, flags);
> + __drain_alien_cache(cachep, ac, i);
> + spin_unlock_irqrestore(&ac->lock, flags);
> + }
> + }
> +}
> +#endif
>
> static int __devinit cpuup_callback(struct notifier_block *nfb,
> unsigned long action, void *hcpu)
> {
> long cpu = (long)hcpu;
> kmem_cache_t* cachep;
> + struct kmem_list3 *l3 = NULL;
> + int node = cpu_to_node(cpu);
> + int memsize = sizeof(struct kmem_list3);
> + struct array_cache *nc = NULL;
>
> switch (action) {
> case CPU_UP_PREPARE:
> down(&cache_chain_sem);
> + /* we need to do this right in the begining since
> + * alloc_arraycache's are going to use this list.
> + * kmalloc_node allows us to add the slab to the right
> + * kmem_list3 and not this cpu's kmem_list3
> + */
> +
> list_for_each_entry(cachep, &cache_chain, next) {
> - struct array_cache *nc;
> + /* setup the size64 kmemlist for hcpu before we can
> + * begin anything. Make sure some other cpu on this
> + * node has not already allocated this
> + */
> + if (!cachep->nodelists[node]) {
> + if(!(l3 = kmalloc_node(memsize,
> + GFP_KERNEL, node)))
> + goto bad;
> + LIST3_INIT(l3);
> + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
> + ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
>
> - nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
> + cachep->nodelists[node] = l3;
> + }
> +
> + spin_lock_irq(&cachep->nodelists[node]->list_lock);
> + cachep->nodelists[node]->free_limit =
> + (1 + nr_cpus_node(node)) *
> + cachep->batchcount + cachep->num;
> + spin_unlock_irq(&cachep->nodelists[node]->list_lock);
> + }
> +
> + /* Now we can go ahead with allocating the shared array's
> + & array cache's */
> + list_for_each_entry(cachep, &cache_chain, next) {
> + nc = alloc_arraycache(cpu, cachep->limit,
> + cachep->batchcount);
> if (!nc)
> goto bad;
> -
> - spin_lock_irq(&cachep->spinlock);
> cachep->array[cpu] = nc;
> - cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
> - + cachep->num;
> - spin_unlock_irq(&cachep->spinlock);
>
> + l3 = cachep->nodelists[node];
> + BUG_ON(!l3);
> + if(!l3->shared) {
> + if(!(nc = alloc_arraycache(cpu,
> + cachep->shared*cachep->batchcount,
> + 0xbaadf00d)))
> + goto bad;
> +
> + /* we are serialised from CPU_DEAD or
> + CPU_UP_CANCELLED by the cpucontrol lock */
> + l3->shared = nc;
> + }
> }
> up(&cache_chain_sem);
> break;
> @@ -728,13 +910,53 @@ static int __devinit cpuup_callback(stru
>
> list_for_each_entry(cachep, &cache_chain, next) {
> struct array_cache *nc;
> + cpumask_t mask;
>
> + mask = node_to_cpumask(node);
> spin_lock_irq(&cachep->spinlock);
> /* cpu is dead; no one can alloc from it. */
> nc = cachep->array[cpu];
> cachep->array[cpu] = NULL;
> - cachep->free_limit -= cachep->batchcount;
> - free_block(cachep, ac_entry(nc), nc->avail);
> + l3 = cachep->nodelists[node];
> +
> + if(!l3)
> + goto unlock_cache;
> +
> + spin_lock(&l3->list_lock);
> +
> + /* Free limit for this kmem_list3 */
> + l3->free_limit -= cachep->batchcount;
> + if(nc)
> + free_block(cachep, nc->entry, nc->avail);
> +
> + if(!cpus_empty(mask)) {
> + spin_unlock(&l3->list_lock);
> + goto unlock_cache;
> + }
> +
> + if(l3->shared) {
> + free_block(cachep, l3->shared->entry,
> + l3->shared->avail);
> + kfree(l3->shared);
> + l3->shared = NULL;
> + }
> +#ifdef CONFIG_NUMA
> + if(l3->alien) {
> + drain_alien_cache(cachep, l3);
> + free_alien_cache(l3->alien);
> + l3->alien = NULL;
> + }
> +#endif
> +
> + /* free slabs belonging to this node */
> + if(__node_shrink(cachep, node)) {
> + cachep->nodelists[node] = NULL;
> + spin_unlock(&l3->list_lock);
> + kfree(l3);
> + }
> + else
> + spin_unlock(&l3->list_lock);
> +unlock_cache:
> spin_unlock_irq(&cachep->spinlock);
> kfree(nc);
> }
> @@ -750,6 +972,25 @@ bad:
>
> static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
>
> +/*
> + * swap the static kmem_list3 with kmalloced memory
> + */
> +static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
> + int nodeid)
> +{
> + struct kmem_list3 *ptr;
> +
> + BUG_ON((cachep->nodelists[nodeid]) != list);
> + ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
> + BUG_ON(!ptr);
> +
> + local_irq_disable();
> + memcpy(ptr, list, sizeof(struct kmem_list3));
> + MAKE_ALL_LISTS(cachep, ptr, nodeid);
> + cachep->nodelists[nodeid] = ptr;
> + local_irq_enable();
> +}
> +
> /* Initialisation.
> * Called after the gfp() functions have been enabled, and before smp_init().
> */
> @@ -758,7 +999,15 @@ void __init kmem_cache_init(void)
> size_t left_over;
> struct cache_sizes *sizes;
> struct cache_names *names;
> + int i;
>
> + for(i = 0; i < NUM_INIT_LISTS; i++)
> + LIST3_INIT(&initkmem_list3[i]);
> +
> + for(i = 0; i < MAX_NUMNODES; i++) {
> + LIST3_INIT(&kmem64_list3[i]);
> + cache_cache.nodelists[i] = NULL;
> + }
> /*
> * Fragmentation resistance on low memory - only use bigger
> * page orders on machines with more than 32MB of memory.
> @@ -766,21 +1015,24 @@ void __init kmem_cache_init(void)
> if (num_physpages > (32 << 20) >> PAGE_SHIFT)
> slab_break_gfp_order = BREAK_GFP_ORDER_HI;
>
> -
> /* Bootstrap is tricky, because several objects are allocated
> * from caches that do not exist yet:
> * 1) initialize the cache_cache cache: it contains the kmem_cache_t
> * structures of all caches, except cache_cache itself: cache_cache
> * is statically allocated.
> - * Initially an __init data area is used for the head array, it's
> - * replaced with a kmalloc allocated array at the end of the bootstrap.
> + * Initially an __init data area is used for the head array and the
> + * kmem_list3 structures, it's replaced with a kmalloc allocated
> + * array at the end of the bootstrap.
> * 2) Create the first kmalloc cache.
> - * The kmem_cache_t for the new cache is allocated normally. An __init
> - * data area is used for the head array.
> - * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
> + * The kmem_cache_t for the new cache is allocated normally.
> + * An __init data area is used for the head array.
> + * 3) Create the remaining kmalloc caches, with minimally sized
> + * head arrays.
> * 4) Replace the __init data head arrays for cache_cache and the first
> * kmalloc cache with kmalloc allocated arrays.
> - * 5) Resize the head arrays of the kmalloc caches to their final sizes.
> + * 5) Replace the __init data for kmem_list3 for cache_cache and
> + * the other cache's with kmalloc allocated memory.
> + * 6) Resize the head arrays of the kmalloc caches to their final sizes.
> */
>
> /* 1) create the cache_cache */
> @@ -789,6 +1041,7 @@ void __init kmem_cache_init(void)
> list_add(&cache_cache.next, &cache_chain);
> cache_cache.colour_off = cache_line_size();
> cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
> + cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
>
> cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
>
> @@ -833,24 +1086,54 @@ void __init kmem_cache_init(void)
> /* 4) Replace the bootstrap head arrays */
> {
> void * ptr;
> -
> +
> ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
> +
> local_irq_disable();
> BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
> - memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
> + memcpy(ptr, ac_data(&cache_cache),
> + sizeof(struct arraycache_init));
> cache_cache.array[smp_processor_id()] = ptr;
> local_irq_enable();
> -
> +
> ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
> +
> local_irq_disable();
> - BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
> + BUG_ON(ac_data(malloc_sizes[0].cs_cachep)
> + != &initarray_generic.cache);
> memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
> sizeof(struct arraycache_init));
> malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
> + }
> + /* 5) Replace the bootstrap kmem_list3's */
> + {
> + int i, j;
> + for (i=0; malloc_sizes[i].cs_size &&
> + (malloc_sizes[i].cs_size < sizeof(struct kmem_list3));
> + i++);
> +
> + BUG_ON(!malloc_sizes[i].cs_size);
> + /* Replace the static kmem_list3 structures for the boot cpu */
> + init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
> + numa_node_id());
> + if(i) {
> + init_list(malloc_sizes[0].cs_cachep,
> + &initkmem_list3[SIZE_32],
> + numa_node_id());
> + init_list(malloc_sizes[0].cs_dmacachep,
> + &initkmem_list3[SIZE_DMA_32],
> + numa_node_id());
> + }
> +
> + for (j=0; j < MAX_NUMNODES; j++) {
> + if(is_node_online(j))
> + init_list(malloc_sizes[i].cs_cachep,
> + &kmem64_list3[j], j);
> + }
> local_irq_enable();
> }
>
> - /* 5) resize the head arrays to their final sizes */
> + /* 6) resize the head arrays to their final sizes */
> {
> kmem_cache_t *cachep;
> down(&cache_chain_sem);
> @@ -866,7 +1149,6 @@ void __init kmem_cache_init(void)
> * that initializes ac_data for all new cpus
> */
> register_cpu_notifier(&cpucache_notifier);
> -
>
> /* The reap timers are started later, with a module init call:
> * That part of the kernel is not yet operational.
> @@ -1163,6 +1445,21 @@ static void slab_destroy (kmem_cache_t *
> }
> }
>
> +/* For setting up all the kmem_list3s for cache whose objsize is same
> + as size of kmem_list3. */
> +static inline void set_up_list3s(kmem_cache_t *cachep)
> +{
> + int i;
> + for(i = 0; i < MAX_NUMNODES; i++) {
> + if(is_node_online(i)) {
> + cachep->nodelists[i] = &kmem64_list3[i];
> + cachep->nodelists[i]->next_reap = jiffies +
> + REAPTIMEOUT_LIST3 +
> + ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
> + }
> + }
> +}
> +
> /**
> * kmem_cache_create - Create a cache.
> * @name: A string which is used in /proc/slabinfo to identify this cache.
> @@ -1418,10 +1715,6 @@ next:
> cachep->gfpflags |= GFP_DMA;
> spin_lock_init(&cachep->spinlock);
> cachep->objsize = size;
> - /* NUMA */
> - INIT_LIST_HEAD(&cachep->lists.slabs_full);
> - INIT_LIST_HEAD(&cachep->lists.slabs_partial);
> - INIT_LIST_HEAD(&cachep->lists.slabs_free);
>
> if (flags & CFLGS_OFF_SLAB)
> cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
> @@ -1436,28 +1729,66 @@ next:
> enable_cpucache(cachep);
> } else {
> if (g_cpucache_up == NONE) {
> + int i;
> /* Note: the first kmem_cache_create must create
> * the cache that's used by kmalloc(24), otherwise
> * the creation of further caches will BUG().
> */
> - cachep->array[smp_processor_id()] = &initarray_generic.cache;
> + cachep->array[smp_processor_id()] =
> + &initarray_generic.cache;
> +
> + /* If the cache that's used by
> + * kmalloc(sizeof(kmem_list3)) is the first cache,
> + * then we need to set up all its list3s, otherwise
> + * the creation of further caches will BUG().
> + */
> + for (i=0; malloc_sizes[i].cs_size &&
> + (malloc_sizes[i].cs_size <
> + sizeof(struct kmem_list3)); i++);
> + if(i == 0) {
> + set_up_list3s(cachep);
> + cpucache_up_64 = ALL;
> + }
> + else {
> + cachep->nodelists[numa_node_id()] =
> + &initkmem_list3[SIZE_32];
> + cpucache_up_64 = SIZE_DMA_32;
> + }
> +
> g_cpucache_up = PARTIAL;
> } else {
> - cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
> + cachep->array[smp_processor_id()] =
> + kmalloc(sizeof(struct arraycache_init),
> + GFP_KERNEL);
> + if(cpucache_up_64 == SIZE_DMA_32) {
> + cachep->nodelists[numa_node_id()] =
> + &initkmem_list3[SIZE_DMA_32];
> + cpucache_up_64 = SIZE_64;
> + }
> + else if(cpucache_up_64 == SIZE_64) {
> + set_up_list3s(cachep);
> + cpucache_up_64 = ALL;
> + }
> + else {
> + cachep->nodelists[numa_node_id()] =
> + kmalloc(sizeof(struct kmem_list3),
> + GFP_KERNEL);
> + LIST3_INIT(cachep->nodelists[numa_node_id()]);
> + }
> }
> + cachep->nodelists[numa_node_id()]->next_reap =
> + jiffies + REAPTIMEOUT_LIST3 +
> + ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
> +
> BUG_ON(!ac_data(cachep));
> + BUG_ON(!cachep->nodelists[numa_node_id()]);
> ac_data(cachep)->avail = 0;
> ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
> ac_data(cachep)->batchcount = 1;
> ac_data(cachep)->touched = 0;
> cachep->batchcount = 1;
> cachep->limit = BOOT_CPUCACHE_ENTRIES;
> - cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
> - + cachep->num;
> - }
> -
> - cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
> - ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
> + }
>
> /* Need the semaphore to access the chain. */
> down(&cache_chain_sem);
> @@ -1515,13 +1846,23 @@ static void check_spinlock_acquired(kmem
> {
> #ifdef CONFIG_SMP
> check_irq_off();
> - BUG_ON(spin_trylock(&cachep->spinlock));
> + BUG_ON(spin_trylock(&list3_data(cachep)->list_lock));
> #endif
> }
> +
> +static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
> +{
> +#ifdef CONFIG_SMP
> + check_irq_off();
> + BUG_ON(spin_trylock(&(cachep->nodelists[node])->list_lock));
> +#endif
> +}
> +
> #else
> #define check_irq_off() do { } while(0)
> #define check_irq_on() do { } while(0)
> #define check_spinlock_acquired(x) do { } while(0)
> +#define check_spinlock_acquired_node(x, y) do { } while(0)
> #endif
>
> /*
> @@ -1543,7 +1884,7 @@ static void smp_call_function_all_cpus(v
> }
>
> static void drain_array_locked(kmem_cache_t* cachep,
> - struct array_cache *ac, int force);
> + struct array_cache *ac, int force, int node);
>
> static void do_drain(void *arg)
> {
> @@ -1552,59 +1893,84 @@ static void do_drain(void *arg)
>
> check_irq_off();
> ac = ac_data(cachep);
> - spin_lock(&cachep->spinlock);
> - free_block(cachep, &ac_entry(ac)[0], ac->avail);
> - spin_unlock(&cachep->spinlock);
> + spin_lock(&list3_data(cachep)->list_lock);
> + free_block(cachep, ac->entry, ac->avail);
> + spin_unlock(&list3_data(cachep)->list_lock);
> ac->avail = 0;
> }
>
> static void drain_cpu_caches(kmem_cache_t *cachep)
> {
> + struct kmem_list3 *l3;
> + int i;
> +
> smp_call_function_all_cpus(do_drain, cachep);
> check_irq_on();
> spin_lock_irq(&cachep->spinlock);
> - if (cachep->lists.shared)
> - drain_array_locked(cachep, cachep->lists.shared, 1);
> + for(i = 0; i < MAX_NUMNODES; i++) {
> + l3 = cachep->nodelists[i];
> + if (l3) {
> + spin_lock(&l3->list_lock);
> + drain_array_locked(cachep, l3->shared, 1, i);
> + spin_unlock(&l3->list_lock);
> +#ifdef CONFIG_NUMA
> + if(l3->alien)
> + drain_alien_cache(cachep, l3);
> +#endif
> + }
> + }
> spin_unlock_irq(&cachep->spinlock);
> }
>
> -
> -/* NUMA shrink all list3s */
> -static int __cache_shrink(kmem_cache_t *cachep)
> +static int __node_shrink(kmem_cache_t *cachep, int node)
> {
> struct slab *slabp;
> + struct kmem_list3 *l3 = cachep->nodelists[node];
> int ret;
>
> - drain_cpu_caches(cachep);
> -
> - check_irq_on();
> - spin_lock_irq(&cachep->spinlock);
> -
> for(;;) {
> struct list_head *p;
>
> - p = cachep->lists.slabs_free.prev;
> - if (p == &cachep->lists.slabs_free)
> + p = l3->slabs_free.prev;
> + if (p == &l3->slabs_free)
> break;
>
> - slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
> + slabp = list_entry(l3->slabs_free.prev, struct slab, list);
> #if DEBUG
> if (slabp->inuse)
> BUG();
> #endif
> list_del(&slabp->list);
>
> - cachep->lists.free_objects -= cachep->num;
> - spin_unlock_irq(&cachep->spinlock);
> + l3->free_objects -= cachep->num;
> + spin_unlock_irq(&l3->list_lock);
> slab_destroy(cachep, slabp);
> - spin_lock_irq(&cachep->spinlock);
> + spin_lock_irq(&l3->list_lock);
> }
> - ret = !list_empty(&cachep->lists.slabs_full) ||
> - !list_empty(&cachep->lists.slabs_partial);
> - spin_unlock_irq(&cachep->spinlock);
> + ret = !list_empty(&l3->slabs_full) ||
> + !list_empty(&l3->slabs_partial);
> return ret;
> }
>
> +static int __cache_shrink(kmem_cache_t *cachep)
> +{
> + int ret = 0, i = 0;
> + struct kmem_list3 *l3;
> +
> + drain_cpu_caches(cachep);
> +
> + check_irq_on();
> + for (i = 0; i < MAX_NUMNODES; i++) {
> + l3 = cachep->nodelists[i];
> + if(l3) {
> + spin_lock_irq(&l3->list_lock);
> + ret += __node_shrink(cachep, i);
> + spin_unlock_irq(&l3->list_lock);
> + }
> + }
> + return (ret ? 1 : 0);
> +}
> +
> /**
> * kmem_cache_shrink - Shrink a cache.
> * @cachep: The cache to shrink.
> @@ -1641,6 +2007,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
> int kmem_cache_destroy(kmem_cache_t * cachep)
> {
> int i;
> + struct kmem_list3 *l3;
>
> if (!cachep || in_interrupt())
> BUG();
> @@ -1675,8 +2042,15 @@ int kmem_cache_destroy(kmem_cache_t * ca
> kfree(cachep->array[i]);
>
> /* NUMA: free the list3 structures */
> - kfree(cachep->lists.shared);
> - cachep->lists.shared = NULL;
> + for(i = 0; i < MAX_NUMNODES; i++) {
> + if((l3 = cachep->nodelists[i])) {
> + kfree(l3->shared);
> +#ifdef CONFIG_NUMA
> + free_alien_cache(l3->alien);
> +#endif
> + kfree(l3);
> + }
> + }
> kmem_cache_free(&cache_cache, cachep);
>
> unlock_cpu_hotplug();
> @@ -1795,6 +2169,7 @@ static int cache_grow(kmem_cache_t *cach
> size_t offset;
> unsigned int local_flags;
> unsigned long ctor_flags;
> + struct kmem_list3 *l3;
>
> /* Be lazy and only check for valid flags here,
> * keeping it out of the critical path in kmem_cache_alloc().
> @@ -1826,6 +2201,7 @@ static int cache_grow(kmem_cache_t *cach
>
> spin_unlock(&cachep->spinlock);
>
> + check_irq_off();
> if (local_flags & __GFP_WAIT)
> local_irq_enable();
>
> @@ -1837,8 +2213,9 @@ static int cache_grow(kmem_cache_t *cach
> */
> kmem_flagcheck(cachep, flags);
>
> -
> - /* Get mem for the objs. */
> + /* Get mem for the objs.
> + * Attempt to allocate a physical page from 'nodeid',
> + */
> if (!(objp = kmem_getpages(cachep, flags, nodeid)))
> goto failed;
>
> @@ -1846,6 +2223,9 @@ static int cache_grow(kmem_cache_t *cach
> if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
> goto opps1;
>
> +#ifdef CONFIG_NUMA
> + slabp->nodeid = nodeid;
> +#endif
> set_slab_attr(cachep, slabp, objp);
>
> cache_init_objs(cachep, slabp, ctor_flags);
> @@ -1853,13 +2233,14 @@ static int cache_grow(kmem_cache_t *cach
> if (local_flags & __GFP_WAIT)
> local_irq_disable();
> check_irq_off();
> - spin_lock(&cachep->spinlock);
> + l3 = cachep->nodelists[nodeid];
> + spin_lock(&l3->list_lock);
>
> /* Make slab active. */
> - list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
> + list_add_tail(&slabp->list, &(l3->slabs_free));
> STATS_INC_GROWN(cachep);
> - list3_data(cachep)->free_objects += cachep->num;
> - spin_unlock(&cachep->spinlock);
> + l3->free_objects += cachep->num;
> + spin_unlock(&l3->list_lock);
> return 1;
> opps1:
> kmem_freepages(cachep, objp);
> @@ -1965,7 +2346,6 @@ static void check_slabp(kmem_cache_t *ca
> kmem_bufctl_t i;
> int entries = 0;
>
> - check_spinlock_acquired(cachep);
> /* Check slab's freelist to see if this obj is there. */
> for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
> entries++;
> @@ -2010,8 +2390,9 @@ retry:
> }
> l3 = list3_data(cachep);
>
> - BUG_ON(ac->avail > 0);
> - spin_lock(&cachep->spinlock);
> + BUG_ON(ac->avail > 0 || !l3);
> + spin_lock(&l3->list_lock);
> +
> if (l3->shared) {
> struct array_cache *shared_array = l3->shared;
> if (shared_array->avail) {
> @@ -2019,8 +2400,9 @@ retry:
> batchcount = shared_array->avail;
> shared_array->avail -= batchcount;
> ac->avail = batchcount;
> - memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
> - sizeof(void*)*batchcount);
> + memcpy(ac->entry,
> + &(shared_array->entry[shared_array->avail]),
> + sizeof(void*)*batchcount);
> shared_array->touched = 1;
> goto alloc_done;
> }
> @@ -2047,7 +2429,8 @@ retry:
> STATS_SET_HIGH(cachep);
>
> /* get obj pointer */
> - ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
> + ac->entry[ac->avail++] = slabp->s_mem +
> + slabp->free*cachep->objsize;
>
> slabp->inuse++;
> next = slab_bufctl(slabp)[slabp->free];
> @@ -2069,12 +2452,12 @@ retry:
> must_grow:
> l3->free_objects -= ac->avail;
> alloc_done:
> - spin_unlock(&cachep->spinlock);
> + spin_unlock(&l3->list_lock);
>
> if (unlikely(!ac->avail)) {
> int x;
> - x = cache_grow(cachep, flags, -1);
> -
> + x = cache_grow(cachep, flags, numa_node_id());
> +
> // cache_grow can reenable interrupts, then ac could change.
> ac = ac_data(cachep);
> if (!x && ac->avail == 0) // no objects in sight? abort
> @@ -2084,7 +2467,7 @@ alloc_done:
> goto retry;
> }
> ac->touched = 1;
> - return ac_entry(ac)[--ac->avail];
> + return ac->entry[--ac->avail];
> }
>
> static inline void
> @@ -2156,7 +2539,7 @@ static inline void *__cache_alloc(kmem_c
> if (likely(ac->avail)) {
> STATS_INC_ALLOCHIT(cachep);
> ac->touched = 1;
> - objp = ac_entry(ac)[--ac->avail];
> + objp = ac->entry[--ac->avail];
> } else {
> STATS_INC_ALLOCMISS(cachep);
> objp = cache_alloc_refill(cachep, flags);
> @@ -2166,29 +2549,102 @@ static inline void *__cache_alloc(kmem_c
> return objp;
> }
>
> -/*
> - * NUMA: different approach needed if the spinlock is moved into
> - * the l3 structure
> +#ifdef CONFIG_NUMA
> +/*
> + * A interface to enable slab creation on nodeid
> */
> +static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
> +{
> + struct list_head *entry;
> + struct slab *slabp;
> + struct kmem_list3 *l3;
> + void *obj;
> + kmem_bufctl_t next;
> + int x;
> +
> + l3 = cachep->nodelists[nodeid];
> + BUG_ON(!l3);
> +
> +retry:
> + spin_lock(&l3->list_lock);
> + entry = l3->slabs_partial.next;
> + if (entry == &l3->slabs_partial) {
> + l3->free_touched = 1;
> + entry = l3->slabs_free.next;
> + if (entry == &l3->slabs_free)
> + goto must_grow;
> + }
> +
> + slabp = list_entry(entry, struct slab, list);
> + check_spinlock_acquired_node(cachep, nodeid);
> + check_slabp(cachep, slabp);
> +
> + STATS_INC_NODEALLOCS(cachep);
> + STATS_INC_ACTIVE(cachep);
> + STATS_SET_HIGH(cachep);
>
> + BUG_ON(slabp->inuse == cachep->num);
> +
> + /* get obj pointer */
> + obj = slabp->s_mem + slabp->free*cachep->objsize;
> + slabp->inuse++;
> + next = slab_bufctl(slabp)[slabp->free];
> +#if DEBUG
> + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
> +#endif
> + slabp->free = next;
> + check_slabp(cachep, slabp);
> + l3->free_objects--;
> + /* move slabp to correct slabp list: */
> + list_del(&slabp->list);
> +
> + if (slabp->free == BUFCTL_END) {
> + list_add(&slabp->list, &l3->slabs_full);
> + }
> + else {
> + list_add(&slabp->list, &l3->slabs_partial);
> + }
> +
> + spin_unlock(&l3->list_lock);
> + goto done;
> +
> +must_grow:
> + spin_unlock(&l3->list_lock);
> + x = cache_grow(cachep, flags, nodeid);
> +
> + if (!x)
> + return NULL;
> +
> + goto retry;
> +done:
> + return obj;
> +}
> +#endif
> +
> +/*
> + * Caller needs to acquire correct kmem_list's list_lock
> + */
> static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
> {
> int i;
> -
> - check_spinlock_acquired(cachep);
> -
> - /* NUMA: move add into loop */
> - cachep->lists.free_objects += nr_objects;
> + struct kmem_list3 *l3;
>
> for (i = 0; i < nr_objects; i++) {
> void *objp = objpp[i];
> struct slab *slabp;
> unsigned int objnr;
> + int nodeid = 0;
>
> slabp = GET_PAGE_SLAB(virt_to_page(objp));
> +#ifdef CONFIG_NUMA
> + nodeid = slabp->nodeid;
> +#endif
> + l3 = cachep->nodelists[nodeid];
> list_del(&slabp->list);
> objnr = (objp - slabp->s_mem) / cachep->objsize;
> + check_spinlock_acquired_node(cachep, nodeid);
> check_slabp(cachep, slabp);
> +
> #if DEBUG
> if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
> printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
> @@ -2200,24 +2656,23 @@ static void free_block(kmem_cache_t *cac
> slabp->free = objnr;
> STATS_DEC_ACTIVE(cachep);
> slabp->inuse--;
> + l3->free_objects++;
> check_slabp(cachep, slabp);
>
> /* fixup slab chains */
> if (slabp->inuse == 0) {
> - if (cachep->lists.free_objects > cachep->free_limit) {
> - cachep->lists.free_objects -= cachep->num;
> + if (l3->free_objects > l3->free_limit) {
> + l3->free_objects -= cachep->num;
> slab_destroy(cachep, slabp);
> } else {
> - list_add(&slabp->list,
> - &list3_data_ptr(cachep, objp)->slabs_free);
> + list_add(&slabp->list, &l3->slabs_free);
> }
> } else {
> /* Unconditionally move a slab to the end of the
> * partial list on free - maximum time for the
> * other objects to be freed, too.
> */
> - list_add_tail(&slabp->list,
> - &list3_data_ptr(cachep, objp)->slabs_partial);
> + list_add_tail(&slabp->list, &l3->slabs_partial);
> }
> }
> }
> @@ -2225,36 +2680,38 @@ static void free_block(kmem_cache_t *cac
> static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
> {
> int batchcount;
> + struct kmem_list3 *l3;
>
> batchcount = ac->batchcount;
> #if DEBUG
> BUG_ON(!batchcount || batchcount > ac->avail);
> #endif
> check_irq_off();
> - spin_lock(&cachep->spinlock);
> - if (cachep->lists.shared) {
> - struct array_cache *shared_array = cachep->lists.shared;
> + l3 = list3_data(cachep);
> + spin_lock(&l3->list_lock);
> + if (l3->shared) {
> + struct array_cache *shared_array = l3->shared;
> int max = shared_array->limit-shared_array->avail;
> if (max) {
> if (batchcount > max)
> batchcount = max;
> - memcpy(&ac_entry(shared_array)[shared_array->avail],
> - &ac_entry(ac)[0],
> + memcpy(&(shared_array->entry[shared_array->avail]),
> + ac->entry,
> sizeof(void*)*batchcount);
> shared_array->avail += batchcount;
> goto free_done;
> }
> }
>
> - free_block(cachep, &ac_entry(ac)[0], batchcount);
> + free_block(cachep, ac->entry, batchcount);
> free_done:
> #if STATS
> {
> int i = 0;
> struct list_head *p;
>
> - p = list3_data(cachep)->slabs_free.next;
> - while (p != &(list3_data(cachep)->slabs_free)) {
> + p = l3->slabs_free.next;
> + while (p != &(l3->slabs_free)) {
> struct slab *slabp;
>
> slabp = list_entry(p, struct slab, list);
> @@ -2266,12 +2723,13 @@ free_done:
> STATS_SET_FREEABLE(cachep, i);
> }
> #endif
> - spin_unlock(&cachep->spinlock);
> + spin_unlock(&l3->list_lock);
> ac->avail -= batchcount;
> - memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
> + memmove(ac->entry, &(ac->entry[batchcount]),
> sizeof(void*)*ac->avail);
> }
>
> +
> /*
> * __cache_free
> * Release an obj back to its cache. If the obj has a constructed
> @@ -2286,14 +2744,47 @@ static inline void __cache_free(kmem_cac
> check_irq_off();
> objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
>
> + /* Make sure we are not freeing a object from another
> + * node to the array cache on this cpu.
> + */
> +#ifdef CONFIG_NUMA
> + {
> + struct slab *slabp;
> + slabp = GET_PAGE_SLAB(virt_to_page(objp));
> + if(unlikely(slabp->nodeid != numa_node_id())) {
> + struct array_cache *alien = NULL;
> + int nodeid = slabp->nodeid;
> + struct kmem_list3 *l3 = list3_data(cachep);
> +
> + STATS_INC_NODEFREES(cachep);
> + if(l3->alien && l3->alien[nodeid]) {
> + alien = l3->alien[nodeid];
> + spin_lock(&alien->lock);
> + if(unlikely(alien->avail == alien->limit))
> + __drain_alien_cache(cachep,
> + alien, nodeid);
> + alien->entry[alien->avail++] = objp;
> + spin_unlock(&alien->lock);
> + }
> + else {
> + spin_lock(&(cachep->nodelists[nodeid])->
> + list_lock);
> + free_block(cachep, &objp, 1);
> + spin_unlock(&(cachep->nodelists[nodeid])->
> + list_lock);
> + }
> + return;
> + }
> + }
> +#endif
> if (likely(ac->avail < ac->limit)) {
> STATS_INC_FREEHIT(cachep);
> - ac_entry(ac)[ac->avail++] = objp;
> + ac->entry[ac->avail++] = objp;
> return;
> } else {
> STATS_INC_FREEMISS(cachep);
> cache_flusharray(cachep, ac);
> - ac_entry(ac)[ac->avail++] = objp;
> + ac->entry[ac->avail++] = objp;
> }
> }
>
> @@ -2363,78 +2854,24 @@ out:
> * Identical to kmem_cache_alloc, except that this function is slow
> * and can sleep. And it will allocate memory on the given node, which
> * can improve the performance for cpu bound structures.
> + * New and improved: it will now make sure that the object gets
> + * put on the correct node list so that there is no false sharing.
> */
> void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
> {
> - int loop;
> - void *objp;
> - struct slab *slabp;
> - kmem_bufctl_t next;
> -
> - for (loop = 0;;loop++) {
> - struct list_head *q;
> -
> - objp = NULL;
> - check_irq_on();
> - spin_lock_irq(&cachep->spinlock);
> - /* walk through all partial and empty slab and find one
> - * from the right node */
> - list_for_each(q,&cachep->lists.slabs_partial) {
> - slabp = list_entry(q, struct slab, list);
> -
> - if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
> - loop > 2)
> - goto got_slabp;
> - }
> - list_for_each(q, &cachep->lists.slabs_free) {
> - slabp = list_entry(q, struct slab, list);
> -
> - if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
> - loop > 2)
> - goto got_slabp;
> - }
> - spin_unlock_irq(&cachep->spinlock);
> -
> - local_irq_disable();
> - if (!cache_grow(cachep, flags, nodeid)) {
> - local_irq_enable();
> - return NULL;
> - }
> - local_irq_enable();
> - }
> -got_slabp:
> - /* found one: allocate object */
> - check_slabp(cachep, slabp);
> - check_spinlock_acquired(cachep);
> -
> - STATS_INC_ALLOCED(cachep);
> - STATS_INC_ACTIVE(cachep);
> - STATS_SET_HIGH(cachep);
> - STATS_INC_NODEALLOCS(cachep);
> -
> - objp = slabp->s_mem + slabp->free*cachep->objsize;
> -
> - slabp->inuse++;
> - next = slab_bufctl(slabp)[slabp->free];
> -#if DEBUG
> - slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
> -#endif
> - slabp->free = next;
> - check_slabp(cachep, slabp);
> + unsigned long save_flags;
> + void *ptr;
>
> - /* move slabp to correct slabp list: */
> - list_del(&slabp->list);
> - if (slabp->free == BUFCTL_END)
> - list_add(&slabp->list, &cachep->lists.slabs_full);
> - else
> - list_add(&slabp->list, &cachep->lists.slabs_partial);
> + if(nodeid == numa_node_id() || nodeid == -1)
> + return __cache_alloc(cachep, flags);
>
> - list3_data(cachep)->free_objects--;
> - spin_unlock_irq(&cachep->spinlock);
> + cache_alloc_debugcheck_before(cachep, flags);
> + local_irq_save(save_flags);
> + ptr = __cache_alloc_node(cachep, flags, nodeid);
> + local_irq_restore(save_flags);
> + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));
>
> - objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
> - __builtin_return_address(0));
> - return objp;
> + return ptr;
> }
> EXPORT_SYMBOL(kmem_cache_alloc_node);
>
> @@ -2620,6 +3057,81 @@ unsigned int kmem_cache_size(kmem_cache_
> }
> EXPORT_SYMBOL(kmem_cache_size);
>
> +/*
> + * This initializes kmem_list3 for all nodes.
> + */
> +static int alloc_kmemlist(kmem_cache_t *cachep)
> +{
> + int node, i;
> + struct kmem_list3 *l3;
> + int err = 0;
> +
> + for(i=0; i < NR_CPUS; i++) {
> + if(cpu_online(i)) {
> + struct array_cache *nc = NULL, *new;
> +#ifdef CONFIG_NUMA
> + struct array_cache **new_alien = NULL;
> +#endif
> + node = cpu_to_node(i);
> +#ifdef CONFIG_NUMA
> + if(!(new_alien = alloc_alien_cache(i, cachep->limit)))
> + goto fail;
> +#endif
> + if(!(new = alloc_arraycache(i, (cachep->shared*
> + cachep->batchcount), 0xbaadf00d)))
> + goto fail;
> + if((l3 = cachep->nodelists[node])) {
> +
> + spin_lock_irq(&l3->list_lock);
> +
> + if((nc = cachep->nodelists[node]->shared))
> + free_block(cachep, nc->entry,
> + nc->avail);
> +
> + l3->shared = new;
> +#ifdef CONFIG_NUMA
> + if(!cachep->nodelists[node]->alien) {
> + l3->alien = new_alien;
> + new_alien = NULL;
> + }
> + l3->free_limit = (1 + nr_cpus_node(node))*
> + cachep->batchcount + cachep->num;
> +#else
> + l3->free_limit = (1 + num_online_cpus())*
> + cachep->batchcount + cachep->num;
> +#endif
> + spin_unlock_irq(&l3->list_lock);
> + kfree(nc);
> +#ifdef CONFIG_NUMA
> + free_alien_cache(new_alien);
> +#endif
> + continue;
> + }
> + if(!(l3 = kmalloc_node(sizeof(struct kmem_list3),
> + GFP_KERNEL, node)))
> + goto fail;
> +
> + LIST3_INIT(l3);
> + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
> + ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
> + l3->shared = new;
> +#ifdef CONFIG_NUMA
> + l3->alien = new_alien;
> + l3->free_limit = (1 + nr_cpus_node(node))*
> + cachep->batchcount + cachep->num;
> +#else
> + l3->free_limit = (1 + num_online_cpus())*
> + cachep->batchcount + cachep->num;
> +#endif
> + cachep->nodelists[node] = l3;
> + }
> + }
> + return err;
> +fail:
> + err = -ENOMEM;
> + return err;
> +}
> +
> struct ccupdate_struct {
> kmem_cache_t *cachep;
> struct array_cache *new[NR_CPUS];
> @@ -2642,8 +3154,7 @@ static int do_tune_cpucache(kmem_cache_t
> int shared)
> {
> struct ccupdate_struct new;
> - struct array_cache *new_shared;
> - int i;
> + int i, err;
>
> memset(&new.new,0,sizeof(new.new));
> for (i = 0; i < NR_CPUS; i++) {
> @@ -2660,36 +3171,30 @@ static int do_tune_cpucache(kmem_cache_t
> new.cachep = cachep;
>
> smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
> -
> +
> check_irq_on();
> spin_lock_irq(&cachep->spinlock);
> cachep->batchcount = batchcount;
> cachep->limit = limit;
> - cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
> + cachep->shared = shared;
> spin_unlock_irq(&cachep->spinlock);
>
> for (i = 0; i < NR_CPUS; i++) {
> struct array_cache *ccold = new.new[i];
> if (!ccold)
> continue;
> - spin_lock_irq(&cachep->spinlock);
> - free_block(cachep, ac_entry(ccold), ccold->avail);
> - spin_unlock_irq(&cachep->spinlock);
> + spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
> + free_block(cachep, ccold->entry, ccold->avail);
> + spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
> kfree(ccold);
> }
> - new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
> - if (new_shared) {
> - struct array_cache *old;
> -
> - spin_lock_irq(&cachep->spinlock);
> - old = cachep->lists.shared;
> - cachep->lists.shared = new_shared;
> - if (old)
> - free_block(cachep, ac_entry(old), old->avail);
> - spin_unlock_irq(&cachep->spinlock);
> - kfree(old);
> - }
>
> + err = alloc_kmemlist(cachep);
> + if (err) {
> + printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
> + cachep->name, -err);
> + BUG();
> + }
> return 0;
> }
>
> @@ -2747,11 +3252,11 @@ static void enable_cpucache(kmem_cache_t
> }
>
> static void drain_array_locked(kmem_cache_t *cachep,
> - struct array_cache *ac, int force)
> + struct array_cache *ac, int force, int node)
> {
> int tofree;
>
> - check_spinlock_acquired(cachep);
> + check_spinlock_acquired_node(cachep, node);
> if (ac->touched && !force) {
> ac->touched = 0;
> } else if (ac->avail) {
> @@ -2759,9 +3264,9 @@ static void drain_array_locked(kmem_cach
> if (tofree > ac->avail) {
> tofree = (ac->avail+1)/2;
> }
> - free_block(cachep, ac_entry(ac), tofree);
> + free_block(cachep, ac->entry, tofree);
> ac->avail -= tofree;
> - memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
> + memmove(ac->entry, &(ac->entry[tofree]),
> sizeof(void*)*ac->avail);
> }
> }
> @@ -2780,6 +3285,7 @@ static void drain_array_locked(kmem_cach
> static void cache_reap(void *unused)
> {
> struct list_head *walk;
> + struct kmem_list3 *l3;
>
> if (down_trylock(&cache_chain_sem)) {
> /* Give up. Setup the next iteration. */
> @@ -2800,27 +3306,35 @@ static void cache_reap(void *unused)
>
> check_irq_on();
>
> - spin_lock_irq(&searchp->spinlock);
> + l3 = list3_data(searchp);
> +#ifdef CONFIG_NUMA
> + if(l3->alien)
> + drain_alien_cache(searchp, l3);
> +#endif
> +
> + spin_lock_irq(&l3->list_lock);
>
> - drain_array_locked(searchp, ac_data(searchp), 0);
> + drain_array_locked(searchp, ac_data(searchp), 0,
> + numa_node_id());
>
> - if(time_after(searchp->lists.next_reap, jiffies))
> + if(time_after(l3->next_reap, jiffies))
> goto next_unlock;
>
> - searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
> + l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
>
> - if (searchp->lists.shared)
> - drain_array_locked(searchp, searchp->lists.shared, 0);
> + if (l3->shared)
> + drain_array_locked(searchp, l3->shared, 0,
> + numa_node_id());
>
> - if (searchp->lists.free_touched) {
> - searchp->lists.free_touched = 0;
> + if (l3->free_touched) {
> + l3->free_touched = 0;
> goto next_unlock;
> }
>
> - tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
> + tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
> do {
> - p = list3_data(searchp)->slabs_free.next;
> - if (p == &(list3_data(searchp)->slabs_free))
> + p = l3->slabs_free.next;
> + if (p == &(l3->slabs_free))
> break;
>
> slabp = list_entry(p, struct slab, list);
> @@ -2833,13 +3347,13 @@ static void cache_reap(void *unused)
> * searchp cannot disappear, we hold
> * cache_chain_lock
> */
> - searchp->lists.free_objects -= searchp->num;
> - spin_unlock_irq(&searchp->spinlock);
> + l3->free_objects -= searchp->num;
> + spin_unlock_irq(&l3->list_lock);
> slab_destroy(searchp, slabp);
> - spin_lock_irq(&searchp->spinlock);
> + spin_lock_irq(&l3->list_lock);
> } while(--tofree > 0);
> next_unlock:
> - spin_unlock_irq(&searchp->spinlock);
> + spin_unlock_irq(&l3->list_lock);
> next:
> cond_resched();
> }
> @@ -2872,7 +3386,7 @@ static void *s_start(struct seq_file *m,
> seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
> #if STATS
> seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
> - " <error> <maxfreeable> <freelimit> <nodeallocs>");
> + " <error> <maxfreeable> <nodeallocs> <remotefrees>");
> seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
> #endif
> seq_putc(m, '\n');
> @@ -2907,39 +3421,53 @@ static int s_show(struct seq_file *m, vo
> unsigned long active_objs;
> unsigned long num_objs;
> unsigned long active_slabs = 0;
> - unsigned long num_slabs;
> - const char *name;
> + unsigned long num_slabs, free_objects = 0, shared_avail = 0;
> + const char *name;
> char *error = NULL;
> + int i;
> + struct kmem_list3 *l3;
>
> check_irq_on();
> spin_lock_irq(&cachep->spinlock);
> active_objs = 0;
> num_slabs = 0;
> - list_for_each(q,&cachep->lists.slabs_full) {
> - slabp = list_entry(q, struct slab, list);
> - if (slabp->inuse != cachep->num && !error)
> - error = "slabs_full accounting error";
> - active_objs += cachep->num;
> - active_slabs++;
> - }
> - list_for_each(q,&cachep->lists.slabs_partial) {
> - slabp = list_entry(q, struct slab, list);
> - if (slabp->inuse == cachep->num && !error)
> - error = "slabs_partial inuse accounting error";
> - if (!slabp->inuse && !error)
> - error = "slabs_partial/inuse accounting error";
> - active_objs += slabp->inuse;
> - active_slabs++;
> - }
> - list_for_each(q,&cachep->lists.slabs_free) {
> - slabp = list_entry(q, struct slab, list);
> - if (slabp->inuse && !error)
> - error = "slabs_free/inuse accounting error";
> - num_slabs++;
> + for( i=0; i<MAX_NUMNODES; i++) {
> + l3 = cachep->nodelists[i];
> + if(!l3 || !is_node_online(i))
> + continue;
> +
> + spin_lock(&l3->list_lock);
> +
> + list_for_each(q,&l3->slabs_full) {
> + slabp = list_entry(q, struct slab, list);
> + if (slabp->inuse != cachep->num && !error)
> + error = "slabs_full accounting error";
> + active_objs += cachep->num;
> + active_slabs++;
> + }
> + list_for_each(q,&l3->slabs_partial) {
> + slabp = list_entry(q, struct slab, list);
> + if (slabp->inuse == cachep->num && !error)
> + error = "slabs_partial inuse accounting error";
> + if (!slabp->inuse && !error)
> + error = "slabs_partial/inuse accounting error";
> + active_objs += slabp->inuse;
> + active_slabs++;
> + }
> + list_for_each(q,&l3->slabs_free) {
> + slabp = list_entry(q, struct slab, list);
> + if (slabp->inuse && !error)
> + error = "slabs_free/inuse accounting error";
> + num_slabs++;
> + }
> + free_objects += l3->free_objects;
> + shared_avail += l3->shared->avail;
> +
> + spin_unlock(&l3->list_lock);
> }
> num_slabs+=active_slabs;
> num_objs = num_slabs*cachep->num;
> - if (num_objs - active_objs != cachep->lists.free_objects && !error)
> + if (num_objs - active_objs != free_objects && !error)
> error = "free_objects accounting error";
>
> name = cachep->name;
> @@ -2951,9 +3479,9 @@ static int s_show(struct seq_file *m, vo
> cachep->num, (1<<cachep->gfporder));
> seq_printf(m, " : tunables %4u %4u %4u",
> cachep->limit, cachep->batchcount,
> - cachep->lists.shared->limit/cachep->batchcount);
> - seq_printf(m, " : slabdata %6lu %6lu %6u",
> - active_slabs, num_slabs, cachep->lists.shared->avail);
> + cachep->shared);
> + seq_printf(m, " : slabdata %6lu %6lu %6lu",
> + active_slabs, num_slabs, shared_avail);
> #if STATS
> { /* list3 stats */
> unsigned long high = cachep->high_mark;
> @@ -2962,12 +3490,13 @@ static int s_show(struct seq_file *m, vo
> unsigned long reaped = cachep->reaped;
> unsigned long errors = cachep->errors;
> unsigned long max_freeable = cachep->max_freeable;
> - unsigned long free_limit = cachep->free_limit;
> unsigned long node_allocs = cachep->node_allocs;
> + unsigned long node_frees = cachep->node_frees;
>
> - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
> - allocs, high, grown, reaped, errors,
> - max_freeable, free_limit, node_allocs);
> + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
> + %4lu %4lu %4lu %4lu",
> + allocs, high, grown, reaped, errors,
> + max_freeable, node_allocs, node_frees);
> }
> /* cpu stats */
> {
> @@ -3048,7 +3577,8 @@ ssize_t slabinfo_write(struct file *file
> shared < 0) {
> res = -EINVAL;
> } else {
> - res = do_tune_cpucache(cachep, limit, batchcount, shared);
> + res = do_tune_cpucache(cachep, limit,
> + batchcount, shared);
> }
> break;
> }

--
Thanks

Jack Steiner ([email protected]) 651-683-5302
Principal Engineer SGI - Silicon Graphics, Inc.


2005-05-12 07:05:32

by Andrew Morton

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

Christoph Lameter <[email protected]> wrote:
>
> This patch allows kmalloc_node to be as fast as kmalloc by introducing
> node specific page lists for partial, free and full slabs.

This patch causes the ppc64 G5 to lock up fairly early in boot. It's
pretty much a default config:
http://www.zip.com.au/~akpm/linux/patches/stuff/config-pmac

No serial port, no debug environment, but no useful-looking error messages
either. See http://www.zip.com.au/~akpm/linux/patches/stuff/dsc02516.jpg

Also, the patch came through with all the "^ $" lines converted to
completely empty lines - probably your email client is trying to be clever.
Please send yourself a patch, check that it applies?

Finally, I do intend to merge up the various slab patches which are in -mm,
so if you could base further work on top of those it would simplify life,
thanks.

2005-05-12 09:39:48

by Niraj kumar

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

On 5/12/05, Andrew Morton <[email protected]> wrote:
> Christoph Lameter <[email protected]> wrote:
> >
> > This patch allows kmalloc_node to be as fast as kmalloc by introducing
> > node specific page lists for partial, free and full slabs.
>
> This patch causes the ppc64 G5 to lock up fairly early in boot. It's
> pretty much a default config:
> http://www.zip.com.au/~akpm/linux/patches/stuff/config-pmac
>
> No serial port, no debug environment, but no useful-looking error messages
> either. See http://www.zip.com.au/~akpm/linux/patches/stuff/dsc02516.jpg

The image shows that the kernel command line option "quiet" was used.
We can probably get some more info if booted without "quiet".

Niraj

2005-05-12 20:02:34

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

On Thu, 12 May 2005, Andrew Morton wrote:

> Christoph Lameter <[email protected]> wrote:
> >
> > This patch allows kmalloc_node to be as fast as kmalloc by introducing
> > node specific page lists for partial, free and full slabs.
>
> This patch causes the ppc64 G5 to lock up fairly early in boot. It's
> pretty much a default config:
> http://www.zip.com.au/~akpm/linux/patches/stuff/config-pmac
>
> No serial port, no debug environment, but no useful-looking error messages
> either. See http://www.zip.com.au/~akpm/linux/patches/stuff/dsc02516.jpg

I got rc4-mm1 and booted it on an x86_64 machine with a similar
configuration (no NUMA but SMP, numa slab uncommented), but multiple
configurations worked fine (apart from another error, in which the NMI
handler attempts to initialize a nonexistent second cpu, as I described
in another email to you). I have no ppc64 available.

Could we boot the box without quiet so that we can get better debug
messages? Did the box boot okay without the patch?

> Finally, I do intend to merge up the various slab patches which are in -mm,
> so if you could base further work on top of those it would simplify life,
> thanks.

Ok.

2005-05-12 20:22:00

by Andrew Morton

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

Christoph Lameter <[email protected]> wrote:
>
> On Thu, 12 May 2005, Andrew Morton wrote:
>
> > Christoph Lameter <[email protected]> wrote:
> > >
> > > This patch allows kmalloc_node to be as fast as kmalloc by introducing
> > > node specific page lists for partial, free and full slabs.
> >
> > This patch causes the ppc64 G5 to lock up fairly early in boot. It's
> > pretty much a default config:
> > http://www.zip.com.au/~akpm/linux/patches/stuff/config-pmac
> >
> > No serial port, no debug environment, but no useful-looking error messages
> > either. See http://www.zip.com.au/~akpm/linux/patches/stuff/dsc02516.jpg
>
> I got rc4-mm1 and booted it on an x86_64 machines with similar
> configuration (no NUMA but SMP, numa slab uncommented) but multiple
> configurations worked fine (apart from another error attempting to
> initialize a nonexistand second cpu by the NMI handler that I described
> in another email to you). I have no ppc64 available.
>
> Could we boot the box without quiet so that we can get better debug
> messages?

OK, I'll try that, but I doubt if it'll give much more info.

> Did the box boot okay without the patch?

Yup, I tested base 2.6.12-rc4 and 2.6.12-rc4+the-patch-you-sent.

2005-05-12 21:49:21

by Robin Holt

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

Christoph,

Can you let me know when this is in so I can modify the ia64 pgalloc.h
to not use the quicklists any longer?

Thanks,
Robin

2005-05-13 07:07:35

by Andrew Morton

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

Christoph Lameter <[email protected]> wrote:
>
> Could we boot the box without quiet so that we can get better debug
> messages?

It didn't produce anything interesting. For some reason the console output
stops when start_kernel() runs console_init() (I guess it all comes out
later) so the machine is running blind when we run kmem_cache_init().
Irritating. I just moved the console_init() call to happen later on.

It's going BUG() in kmem_cache_init()->set_up_list3s->is_node_online
because for some reason the !CONFIG_NUMA ppc build has MAX_NUMNODES=16,
even though there's only one node.

Doing

#define is_node_online(node) node_online(node)

unconditionally fixes that up (your patch should be using
for_each_online_node() everywhere?) but it oopses later - I think it's the
first time kmem_cache_alloc() is called.
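
For reference, a minimal sketch of what that suggestion would look like
against the V2 set_up_list3s() quoted earlier in this thread. This is
illustrative only, not code from any posted patch: the loop over
MAX_NUMNODES guarded by is_node_online() is simply replaced with
for_each_online_node(), which also removes the !CONFIG_NUMA
BUG_ON(node != 0) path that trips when MAX_NUMNODES > 1 on a
single-node build:

/* Sketch: V2's set_up_list3s() rewritten with for_each_online_node().
 * kmem64_list3[] and REAPTIMEOUT_LIST3 are the names used in the V2 patch.
 */
static inline void set_up_list3s(kmem_cache_t *cachep)
{
        int node;

        for_each_online_node(node) {
                cachep->nodelists[node] = &kmem64_list3[node];
                cachep->nodelists[node]->next_reap = jiffies +
                        REAPTIMEOUT_LIST3 +
                        ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
        }
}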

2005-05-13 11:21:36

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

On Fri, 13 May 2005, Andrew Morton wrote:

> It didn't produce anything interesting. For some reason the console output
> stops when start_kernel() runs console_init() (I guess it all comes out
> later) so the machine is running blind when we run kmem_cache_init().
> Irritating. I just moved the console_init() call to happen later on.
>
> It's going BUG() in kmem_cache_init()->set_up_list3s->is_node_online
> because for some reason the !CONFIG_NUMA ppc build has MAX_NUMNODES=16,
> even though there's only one node.

Yuck.

The definition for the number of NUMA nodes is dependent on
CONFIG_FLATMEM instead of CONFIG_NUMA in mm.
CONFIG_FLATMEM is not set on ppc64 because CONFIG_DISCONTIG is set! And
consequently nodes exist in a non NUMA config.

s/CONFIG_NUMA/CONFIG_FLATMEM/ ??

2005-05-13 11:34:15

by Andrew Morton

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

Christoph Lameter <[email protected]> wrote:
>
> On Fri, 13 May 2005, Andrew Morton wrote:
>
> > It didn't produce anything interesting. For some reason the console output
> > stops when start_kernel() runs console_init() (I guess it all comes out
> > later) so the machine is running blind when we run kmem_cache_init().
> > Irritating. I just moved the console_init() call to happen later on.
> >
> > It's going BUG() in kmem_cache_init()->set_up_list3s->is_node_online
> > because for some reason the !CONFIG_NUMA ppc build has MAX_NUMNODES=16,
> > even though there's only one node.
>
> Yuck.
>
> The definition for the number of NUMA nodes is dependent on
> CONFIG_FLATMEM instead of CONFIG_NUMA in mm.
> CONFIG_FLATMEM is not set on ppc64 because CONFIG_DISCONTIG is set! And
> consequently nodes exist in a non NUMA config.

I was testing 2.6.12-rc4 base.


2005-05-13 11:38:16

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

On Fri, 13 May 2005, Andrew Morton wrote:

> > The definition for the number of NUMA nodes is dependent on
> > CONFIG_FLATMEM instead of CONFIG_NUMA in mm.
> > CONFIG_FLATMEM is not set on ppc64 because CONFIG_DISCONTIG is set! And
> > consequently nodes exist in a non NUMA config.
>
> I was testing 2.6.12-rc4 base.

There we still have the notion of nodes depending on CONFIG_DISCONTIG and
not on CONFIG_NUMA. The node stuff needs to be

#ifdef CONFIG_FLATMEM

or

#ifdef CONFIG_DISCONTIG

??

2005-05-13 13:46:37

by Dave Hansen

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

On Fri, 2005-05-13 at 04:21 -0700, Christoph Lameter wrote:
> The definition for the number of NUMA nodes is dependent on
> CONFIG_FLATMEM instead of CONFIG_NUMA in mm.
> CONFIG_FLATMEM is not set on ppc64 because CONFIG_DISCONTIG is set! And
> consequently nodes exist in a non NUMA config.
>
> s/CONFIG_NUMA/CONFIG_FLATMEM/ ??

FLATMEM effectively means that you have a contiguous, single mem_map[];
it isn't directly related to NUMA.

Could you point me to the code that you're looking at? We shouldn't
have the number of NUMA nodes depend on CONFIG_FLATMEM, at least not
directly.

-- Dave

2005-05-13 13:59:08

by Dave Hansen

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

On Fri, 2005-05-13 at 04:37 -0700, Christoph Lameter wrote:
> On Fri, 13 May 2005, Andrew Morton wrote:
> > > The definition for the number of NUMA nodes is dependent on
> > > CONFIG_FLATMEM instead of CONFIG_NUMA in mm.
> > > CONFIG_FLATMEM is not set on ppc64 because CONFIG_DISCONTIG is set! And
> > > consequently nodes exist in a non NUMA config.
> >
> > I was testing 2.6.12-rc4 base.
>
> There we still have the notion of nodes depending on CONFIG_DISCONTIG and
> not on CONFIG_NUMA. The node stuff needs to be
>
> #ifdef CONFIG_FLATMEM
>
> or
>
> #ifdef CONFIG_DISCONTIG

I think I found the problem. Could you try the attached patch?

As I said before, FLATMEM really refers to things like the
mem_map[] or max_mapnr.

CONFIG_NEED_MULTIPLE_NODES is what gets turned on for DISCONTIG or for
NUMA. We'll slowly be removing all of the DISCONTIG cases, so
eventually it will merge back to be one with NUMA.

-- Dave

--- clean/include/linux/numa.h.orig 2005-05-13 06:44:56.000000000 -0700
+++ clean/include/linux/numa.h 2005-05-13 06:52:05.000000000 -0700
@@ -3,7 +3,7 @@

#include <linux/config.h>

-#ifndef CONFIG_FLATMEM
+#ifdef CONFIG_NEED_MULTIPLE_NODES
#include <asm/numnodes.h>
#endif



2005-05-13 16:21:05

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

On Fri, 13 May 2005, Dave Hansen wrote:

> I think I found the problem. Could you try the attached patch?

Ok. That is a part of the problem. The other issue that I saw while
testing is that the new slab allocator fails on 64-bit non-NUMA platforms
because the bootstrap does not work right. The size of struct kmem_list3
may grow beyond 64 bytes (with preempt etc. enabled, which increases the
size of spinlock_t), which requires an additional slab to be handled in a
special way during bootstrap. I hope I will have an updated patch soon.
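
The constraint described here is visible in the V2 bootstrap quoted above,
which hard-wires bootstrap states for cache_cache, the 32-byte kmalloc
caches and a single size-64 step, and so assumes struct kmem_list3 fits in
64 bytes. As a rough illustration of the generic lookup this implies -- the
helper name below is made up and this is not the actual V3 bootstrap code --
the "find the kmalloc cache big enough to hold a kmem_list3" loop that
already appears twice in the V2 patch can be factored out:

/* Sketch only: index of the first kmalloc cache whose objects can hold
 * a struct kmem_list3.  With preempt or spinlock debugging enabled the
 * struct can exceed 64 bytes, so the index cannot be hard-coded; every
 * cache up to and including this one needs a statically allocated
 * bootstrap list3.
 */
static __init int kmem_list3_index(void)
{
        int i = 0;

        while (malloc_sizes[i].cs_size &&
               malloc_sizes[i].cs_size < sizeof(struct kmem_list3))
                i++;
        BUG_ON(!malloc_sizes[i].cs_size);   /* no kmalloc cache large enough */
        return i;
}

One implication is that the number of statically reserved bootstrap lists
has to follow from this index rather than from the fixed NUM_INIT_LISTS of 3
used in V2.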

2005-05-14 01:29:49

by Christoph Lameter

[permalink] [raw]
Subject: NUMA aware slab allocator V3

The NUMA API change that introduced kmalloc_node was accepted last week by
Linus. Now it is possible to do slab allocations on a node to localize
memory structures. This API was used by the pageset localization patch and
the block layer localization patch now in mm. The existing kmalloc_node is
slow since it simply searches through all pages of the slab to find a page
that is on the node requested. The two patches do a one time allocation of
slab structures at initialization and therefore the speed of kmalloc node
does not matter.

This patch allows kmalloc_node to be as fast as kmalloc by introducing
node specific page lists for partial, free and full slabs. Slab allocation
improves in a NUMA system so that we are seeing a performance gain in
AIM7 of about 5% with this patch alone.

More NUMA localizations are possible if kmalloc_node operates
in a fast way like kmalloc.

Test run on a 32p systems with 32G Ram.

w/o patch
Tasks jobs/min jti jobs/min/task real cpu
1 485.36 100 485.3640 11.99 1.91 Sat Apr 30 14:01:51 2005
100 26582.63 88 265.8263 21.89 144.96 Sat Apr 30 14:02:14 2005
200 29866.83 81 149.3342 38.97 286.08 Sat Apr 30 14:02:53 2005
300 33127.16 78 110.4239 52.71 426.54 Sat Apr 30 14:03:46 2005
400 34889.47 80 87.2237 66.72 568.90 Sat Apr 30 14:04:53 2005
500 35654.34 76 71.3087 81.62 714.55 Sat Apr 30 14:06:15 2005
600 36460.83 75 60.7681 95.77 853.42 Sat Apr 30 14:07:51 2005
700 35957.00 75 51.3671 113.30 990.67 Sat Apr 30 14:09:45 2005
800 33380.65 73 41.7258 139.48 1140.86 Sat Apr 30 14:12:05 2005
900 35095.01 76 38.9945 149.25 1281.30 Sat Apr 30 14:14:35 2005
1000 36094.37 74 36.0944 161.24 1419.66 Sat Apr 30 14:17:17 2005

w/patch
Tasks jobs/min jti jobs/min/task real cpu
1 484.27 100 484.2736 12.02 1.93 Sat Apr 30 15:59:45 2005
100 28262.03 90 282.6203 20.59 143.57 Sat Apr 30 16:00:06 2005
200 32246.45 82 161.2322 36.10 282.89 Sat Apr 30 16:00:42 2005
300 37945.80 83 126.4860 46.01 418.75 Sat Apr 30 16:01:28 2005
400 40000.69 81 100.0017 58.20 561.48 Sat Apr 30 16:02:27 2005
500 40976.10 78 81.9522 71.02 696.95 Sat Apr 30 16:03:38 2005
600 41121.54 78 68.5359 84.92 834.86 Sat Apr 30 16:05:04 2005
700 44052.77 78 62.9325 92.48 971.53 Sat Apr 30 16:06:37 2005
800 41066.89 79 51.3336 113.38 1111.15 Sat Apr 30 16:08:31 2005
900 38918.77 79 43.2431 134.59 1252.57 Sat Apr 30 16:10:46 2005
1000 41842.21 76 41.8422 139.09 1392.33 Sat Apr 30 16:13:05 2005

These are measurements taken directly after boot and show an improvement greater than 5%.
However, the performance improvement diminishes if the AIM7 runs are repeated
and settles down at around 5%.

Link to earlier discussions:
http://marc.theaimsgroup.com/?t=111094594500003&r=1&w=2

Changelog V2-V3:
- Made to patch against 2.6.12-rc4-mm1
- Revised the bootstrap mechanism so that larger kmem_list3 structs can be
supported. Use a generic solution so that the right slab can be found
for the internal structs.
- Use for_each_online_node

Changelog V1-V2:
- Batching for freeing of wrong-node objects (alien caches)
- Locking changes and NUMA #ifdefs as requested by Manfred

Signed-off-by: Alok N Kataria <[email protected]>
Signed-off-by: Shobhit Dayal <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.12-rc4/mm/slab.c
===================================================================
--- linux-2.6.12-rc4.orig/mm/slab.c 2005-05-13 11:15:34.000000000 -0700
+++ linux-2.6.12-rc4/mm/slab.c 2005-05-13 16:09:39.000000000 -0700
@@ -75,6 +75,14 @@
*
* At present, each engine can be growing a cache. This should be blocked.
*
+ * 15 March 2005. NUMA slab allocator.
+ * Shobhit Dayal <[email protected]>
+ * Alok N Kataria <[email protected]>
+ * Christoph Lameter <[email protected]>
+ *
+ * Modified the slab allocator to be node aware on NUMA systems.
+ * Each node has its own list of partial, free and full slabs.
+ * All object allocations for a node occur from node specific slab lists.
*/

#include <linux/config.h>
@@ -93,13 +101,24 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
-
+#include <linux/nodemask.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>

/*
+ * Some Linux kernels currently have weird notions of NUMA. Make sure that
+ * there is only a single node if CONFIG_NUMA is not set. Remove this check
+ * after the situation has stabilized.
+ */
+#ifndef CONFIG_NUMA
+#if MAX_NUMNODES != 1
+#error "Broken Configuration: CONFIG_NUMA not set but MAX_NUMNODES !=1 !!"
+#endif
+#endif
+
+/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
* SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -211,6 +230,9 @@
void *s_mem; /* including colour offset */
unsigned int inuse; /* num of objs active in slab */
kmem_bufctl_t free;
+#ifdef CONFIG_NUMA
+ unsigned short nodeid;
+#endif
};

/*
@@ -253,6 +275,10 @@
unsigned int limit;
unsigned int batchcount;
unsigned int touched;
+#ifdef CONFIG_NUMA
+ spinlock_t lock;
+#endif
+ void *entry[];
};

/* bootstrap: The caches do not work without cpuarrays anymore,
@@ -265,35 +291,113 @@
};

/*
- * The slab lists of all objects.
- * Hopefully reduce the internal fragmentation
- * NUMA: The spinlock could be moved from the kmem_cache_t
- * into this structure, too. Figure out what causes
- * fewer cross-node spinlock operations.
+ * The slab lists for all objects.
*/
struct kmem_list3 {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long free_objects;
- int free_touched;
unsigned long next_reap;
+ int free_touched;
+ unsigned int free_limit;
+ spinlock_t list_lock;
struct array_cache *shared;
+#ifdef CONFIG_NUMA
+ struct array_cache **alien;
+#endif
};

-#define LIST3_INIT(parent) \
- { \
- .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \
- .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \
- .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \
+/*
+ * Need this for bootstrapping a per node allocator.
+ */
+#define NUM_INIT_LISTS (2 + MAX_NUMNODES)
+struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+#define CACHE_CACHE 0
+#define SIZE_AC 1
+#define SIZE_L3 2
+
+/*
+ * This function may be completely optimized away if
+ * a constant is passed to it. Mostly the same as
+ * what is in linux/slab.h except it returns an
+ * index.
+ */
+static inline int index_of(const size_t size)
+{
+ int i = 0;
+
+#define CACHE(x) \
+ if (size <=x) \
+ return i; \
+ else \
+ i++;
+#include "linux/kmalloc_sizes.h"
+#undef CACHE
+ {
+ extern void __bad_size(void);
+ __bad_size();
+ return 0;
}
+}
+
+#define INDEX_AC index_of(sizeof(struct array_cache))
+#define INDEX_L3 index_of(sizeof(struct kmem_list3))
+
+#ifdef CONFIG_NUMA
+
+#define LIST3_INIT(parent) \
+ do { \
+ INIT_LIST_HEAD(&(parent)->slabs_full); \
+ INIT_LIST_HEAD(&(parent)->slabs_partial); \
+ INIT_LIST_HEAD(&(parent)->slabs_free); \
+ (parent)->shared = NULL; \
+ (parent)->alien = NULL; \
+ (parent)->list_lock = SPIN_LOCK_UNLOCKED; \
+ (parent)->free_objects = 0; \
+ (parent)->free_touched = 0; \
+ } while(0)
+#else
+
+#define LIST3_INIT(parent) \
+ do { \
+ INIT_LIST_HEAD(&(parent)->slabs_full); \
+ INIT_LIST_HEAD(&(parent)->slabs_partial); \
+ INIT_LIST_HEAD(&(parent)->slabs_free); \
+ (parent)->shared = NULL; \
+ (parent)->list_lock = SPIN_LOCK_UNLOCKED; \
+ (parent)->free_objects = 0; \
+ (parent)->free_touched = 0; \
+ } while(0)
+#endif
+
+#define MAKE_LIST(cachep, listp, slab, nodeid) \
+ do { \
+ INIT_LIST_HEAD(listp); \
+ list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
+ }while(0)
+
+#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
+ do { \
+ MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
+ MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
+ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
+ }while(0)
+
#define list3_data(cachep) \
- (&(cachep)->lists)
+ ((cachep->nodelists[numa_node_id()]))

/* NUMA: per-node */
#define list3_data_ptr(cachep, ptr) \
list3_data(cachep)

+#ifdef CONFIG_NUMA
+#define is_node_online(node) node_online(node)
+#else
+#define is_node_online(node) \
+ (node == 0)
+#endif /* CONFIG_NUMA */
+
/*
* kmem_cache_t
*
@@ -305,13 +409,12 @@
struct array_cache *array[NR_CPUS];
unsigned int batchcount;
unsigned int limit;
-/* 2) touched by every alloc & free from the backend */
- struct kmem_list3 lists;
- /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */
+ unsigned int shared;
unsigned int objsize;
+/* 2) touched by every alloc & free from the backend */
+ struct kmem_list3 *nodelists[MAX_NUMNODES];
unsigned int flags; /* constant flags */
unsigned int num; /* # of objs per slab */
- unsigned int free_limit; /* upper limit of objects in the lists */
spinlock_t spinlock;

/* 3) cache_grow/shrink */
@@ -348,6 +451,7 @@
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
+ unsigned long node_frees;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
@@ -385,6 +489,7 @@
} while (0)
#define STATS_INC_ERR(x) ((x)->errors++)
#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
+#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
#define STATS_SET_FREEABLE(x, i) \
do { if ((x)->max_freeable < i) \
(x)->max_freeable = i; \
@@ -403,6 +508,7 @@
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
+#define STATS_INC_NODEFREES(x) do { } while (0)
#define STATS_SET_FREEABLE(x, i) \
do { } while (0)

@@ -535,9 +641,9 @@

/* internal cache of cache description objs */
static kmem_cache_t cache_cache = {
- .lists = LIST3_INIT(cache_cache.lists),
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
+ .shared = 1,
.objsize = sizeof(kmem_cache_t),
.flags = SLAB_NO_REAP,
.spinlock = SPIN_LOCK_UNLOCKED,
@@ -565,7 +671,8 @@
*/
static enum {
NONE,
- PARTIAL,
+ PARTIAL_AC,
+ PARTIAL_L3,
FULL
} g_cpucache_up;

@@ -574,11 +681,7 @@
static void free_block(kmem_cache_t* cachep, void** objpp, int len);
static void enable_cpucache (kmem_cache_t *cachep);
static void cache_reap (void *unused);
-
-static inline void **ac_entry(struct array_cache *ac)
-{
- return (void**)(ac+1);
-}
+static int __node_shrink(kmem_cache_t *cachep, int node);

static inline struct array_cache *ac_data(kmem_cache_t *cachep)
{
@@ -680,42 +783,151 @@
int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
struct array_cache *nc = NULL;

- if (cpu == -1)
- nc = kmalloc(memsize, GFP_KERNEL);
- else
- nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
-
+ nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
if (nc) {
nc->avail = 0;
nc->limit = entries;
nc->batchcount = batchcount;
nc->touched = 0;
+#ifdef CONFIG_NUMA
+ spin_lock_init(&nc->lock);
+#endif
}
return nc;
}
+#ifdef CONFIG_NUMA
+static inline struct array_cache **alloc_alien_cache(int cpu, int limit)
+{
+ struct array_cache **ac_ptr;
+ int memsize = sizeof(void*)*MAX_NUMNODES;
+ int node = cpu_to_node(cpu);
+ int i;
+
+ if (limit > 1)
+ limit = 12;
+ ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+ if(ac_ptr) {
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (i == node) {
+ ac_ptr[i] = NULL;
+ continue;
+ }
+ ac_ptr[i] = alloc_arraycache(cpu, limit, 0xbaadf00d);
+ if(!ac_ptr[i]) {
+ for(i--; i <=0; i--)
+ kfree(ac_ptr[i]);
+ kfree(ac_ptr);
+ return NULL;
+ }
+ }
+ }
+ return ac_ptr;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+ int i;
+
+ if(!ac_ptr)
+ return;
+ for (i = 0; i < MAX_NUMNODES; i++)
+ kfree(ac_ptr[i]);
+
+ kfree(ac_ptr);
+}
+
+static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
+{
+ struct kmem_list3 *rl3 = cachep->nodelists[node];
+
+ if(ac->avail) {
+ spin_lock(&rl3->list_lock);
+ free_block(cachep, ac->entry, ac->avail);
+ ac->avail = 0;
+ spin_unlock(&rl3->list_lock);
+ }
+}
+
+static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
+{
+ int i=0;
+ struct array_cache *ac;
+ unsigned long flags;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ ac = l3->alien[i];
+ if(ac) {
+ spin_lock_irqsave(&ac->lock, flags);
+ __drain_alien_cache(cachep, ac, i);
+ spin_unlock_irqrestore(&ac->lock, flags);
+ }
+ }
+}
+#endif

static int __devinit cpuup_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
kmem_cache_t* cachep;
+ struct kmem_list3 *l3 = NULL;
+ int node = cpu_to_node(cpu);
+ int memsize = sizeof(struct kmem_list3);
+ struct array_cache *nc = NULL;

switch (action) {
case CPU_UP_PREPARE:
down(&cache_chain_sem);
+ /* we need to do this right in the begining since
+ * alloc_arraycache's are going to use this list.
+ * kmalloc_node allows us to add the slab to the right
+ * kmem_list3 and not this cpu's kmem_list3
+ */
+
list_for_each_entry(cachep, &cache_chain, next) {
- struct array_cache *nc;
+ /* setup the size64 kmemlist for hcpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ if (!cachep->nodelists[node]) {
+ if(!(l3 = kmalloc_node(memsize,
+ GFP_KERNEL, node)))
+ goto bad;
+ LIST3_INIT(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;

- nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
+ cachep->nodelists[node] = l3;
+ }
+
+ spin_lock_irq(&cachep->nodelists[node]->list_lock);
+ cachep->nodelists[node]->free_limit =
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+ }
+
+ /* Now we can go ahead with allocating the shared array's
+ & array cache's */
+ list_for_each_entry(cachep, &cache_chain, next) {
+ nc = alloc_arraycache(cpu, cachep->limit,
+ cachep->batchcount);
if (!nc)
goto bad;
-
- spin_lock_irq(&cachep->spinlock);
cachep->array[cpu] = nc;
- cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
- + cachep->num;
- spin_unlock_irq(&cachep->spinlock);

+ l3 = cachep->nodelists[node];
+ BUG_ON(!l3);
+ if(!l3->shared) {
+ if(!(nc = alloc_arraycache(cpu,
+ cachep->shared*cachep->batchcount,
+ 0xbaadf00d)))
+ goto bad;
+
+ /* we are serialised from CPU_DEAD or
+ CPU_UP_CANCELLED by the cpucontrol lock */
+ l3->shared = nc;
+ }
}
up(&cache_chain_sem);
break;
@@ -730,13 +942,53 @@

list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
+ cpumask_t mask;

+ mask = node_to_cpumask(node);
spin_lock_irq(&cachep->spinlock);
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
- cachep->free_limit -= cachep->batchcount;
- free_block(cachep, ac_entry(nc), nc->avail);
+ l3 = cachep->nodelists[node];
+
+ if(!l3)
+ goto unlock_cache;
+
+ spin_lock(&l3->list_lock);
+
+ /* Free limit for this kmem_list3 */
+ l3->free_limit -= cachep->batchcount;
+ if(nc)
+ free_block(cachep, nc->entry, nc->avail);
+
+ if(!cpus_empty(mask)) {
+ spin_unlock(&l3->list_lock);
+ goto unlock_cache;
+ }
+
+ if(l3->shared) {
+ free_block(cachep, l3->shared->entry,
+ l3->shared->avail);
+ kfree(l3->shared);
+ l3->shared = NULL;
+ }
+#ifdef CONFIG_NUMA
+ if(l3->alien) {
+ drain_alien_cache(cachep, l3);
+ free_alien_cache(l3->alien);
+ l3->alien = NULL;
+ }
+#endif
+
+ /* free slabs belonging to this node */
+ if(__node_shrink(cachep, node)) {
+ cachep->nodelists[node] = NULL;
+ spin_unlock(&l3->list_lock);
+ kfree(l3);
+ }
+ else
+ spin_unlock(&l3->list_lock);
+unlock_cache:
spin_unlock_irq(&cachep->spinlock);
kfree(nc);
}
@@ -752,6 +1004,25 @@

static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };

+/*
+ * swap the static kmem_list3 with kmalloced memory
+ */
+static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
+ int nodeid)
+{
+ struct kmem_list3 *ptr;
+
+ BUG_ON((cachep->nodelists[nodeid]) != list);
+ ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+ BUG_ON(!ptr);
+
+ local_irq_disable();
+ memcpy(ptr, list, sizeof(struct kmem_list3));
+ MAKE_ALL_LISTS(cachep, ptr, nodeid);
+ cachep->nodelists[nodeid] = ptr;
+ local_irq_enable();
+}
+
/* Initialisation.
* Called after the gfp() functions have been enabled, and before smp_init().
*/
@@ -760,6 +1031,13 @@
size_t left_over;
struct cache_sizes *sizes;
struct cache_names *names;
+ int i;
+
+ for(i = 0; i < NUM_INIT_LISTS; i++) {
+ LIST3_INIT(&initkmem_list3[i]);
+ if (i < MAX_NUMNODES)
+ cache_cache.nodelists[i] = NULL;
+ }

/*
* Fragmentation resistance on low memory - only use bigger
@@ -768,21 +1046,24 @@
if (num_physpages > (32 << 20) >> PAGE_SHIFT)
slab_break_gfp_order = BREAK_GFP_ORDER_HI;

-
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
* 1) initialize the cache_cache cache: it contains the kmem_cache_t
* structures of all caches, except cache_cache itself: cache_cache
* is statically allocated.
- * Initially an __init data area is used for the head array, it's
- * replaced with a kmalloc allocated array at the end of the bootstrap.
+ * Initially an __init data area is used for the head array and the
+ * kmem_list3 structures, it's replaced with a kmalloc allocated
+ * array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
- * The kmem_cache_t for the new cache is allocated normally. An __init
- * data area is used for the head array.
- * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
+ * The kmem_cache_t for the new cache is allocated normally.
+ * An __init data area is used for the head array.
+ * 3) Create the remaining kmalloc caches, with minimally sized
+ * head arrays.
* 4) Replace the __init data head arrays for cache_cache and the first
* kmalloc cache with kmalloc allocated arrays.
- * 5) Resize the head arrays of the kmalloc caches to their final sizes.
+ * 5) Replace the __init data for kmem_list3 for cache_cache and
+ * the other cache's with kmalloc allocated memory.
+ * 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/

/* 1) create the cache_cache */
@@ -791,6 +1072,7 @@
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
+ cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];

cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());

@@ -808,15 +1090,30 @@
sizes = malloc_sizes;
names = cache_names;

+ /* Initialize the caches that provide memory for the array cache
+ * and the kmem_list3 structures first.
+ * Without this, further allocations will bug
+ **/
+
+ sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
+ sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+
+ if (INDEX_AC != INDEX_L3)
+ sizes[INDEX_L3].cs_cachep = kmem_cache_create(names[INDEX_L3].name,
+ sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+
while (sizes->cs_size != ULONG_MAX) {
/* For performance, all the general caches are L1 aligned.
* This should be particularly beneficial on SMP boxes, as it
* eliminates "false sharing".
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches. */
- sizes->cs_cachep = kmem_cache_create(names->name,
- sizes->cs_size, ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+ if(!sizes->cs_cachep)
+ sizes->cs_cachep = kmem_cache_create(names->name,
+ sizes->cs_size, ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

/* Inc off-slab bufctl limit until the ceiling is hit. */
if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -835,24 +1132,46 @@
/* 4) Replace the bootstrap head arrays */
{
void * ptr;
-
+
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
local_irq_disable();
BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
- memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
+ memcpy(ptr, ac_data(&cache_cache),
+ sizeof(struct arraycache_init));
cache_cache.array[smp_processor_id()] = ptr;
local_irq_enable();
-
+
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
local_irq_disable();
- BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
- memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
+ BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
+ != &initarray_generic.cache);
+ memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
sizeof(struct arraycache_init));
- malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
+ malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = ptr;
local_irq_enable();
}
+ /* 5) Replace the bootstrap kmem_list3's */
+ {
+ int j;
+ /* Replace the static kmem_list3 structures for the boot cpu */
+ init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
+ numa_node_id());
+
+ for (j=0; j < MAX_NUMNODES; j++) {
+ if(is_node_online(j))
+ init_list(malloc_sizes[INDEX_L3].cs_cachep,
+ &initkmem_list3[SIZE_L3+j], j);
+ }
+ if (INDEX_AC != INDEX_L3) {
+ init_list(malloc_sizes[INDEX_AC].cs_cachep,
+ &initkmem_list3[SIZE_AC],
+ numa_node_id());
+ }
+ }

- /* 5) resize the head arrays to their final sizes */
+ /* 6) resize the head arrays to their final sizes */
{
kmem_cache_t *cachep;
down(&cache_chain_sem);
@@ -868,7 +1187,6 @@
* that initializes ac_data for all new cpus
*/
register_cpu_notifier(&cpucache_notifier);
-

/* The reap timers are started later, with a module init call:
* That part of the kernel is not yet operational.
@@ -1165,6 +1483,21 @@
}
}

+/* For setting up all the kmem_list3s for cache whose objsize is same
+ as size of kmem_list3. */
+static inline void set_up_list3s(kmem_cache_t *cachep)
+{
+ int i;
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ if(is_node_online(i)) {
+ cachep->nodelists[i] = &initkmem_list3[SIZE_L3+i];
+ cachep->nodelists[i]->next_reap = jiffies +
+ REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ }
+ }
+}
+
/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1420,10 +1753,6 @@
cachep->gfpflags |= GFP_DMA;
spin_lock_init(&cachep->spinlock);
cachep->objsize = size;
- /* NUMA */
- INIT_LIST_HEAD(&cachep->lists.slabs_full);
- INIT_LIST_HEAD(&cachep->lists.slabs_partial);
- INIT_LIST_HEAD(&cachep->lists.slabs_free);

if (flags & CFLGS_OFF_SLAB)
cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
@@ -1442,24 +1771,51 @@
* the cache that's used by kmalloc(24), otherwise
* the creation of further caches will BUG().
*/
- cachep->array[smp_processor_id()] = &initarray_generic.cache;
- g_cpucache_up = PARTIAL;
+ cachep->array[smp_processor_id()] =
+ &initarray_generic.cache;
+
+ /* If the cache that's used by
+ * kmalloc(sizeof(kmem_list3)) is the first cache,
+ * then we need to set up all its list3s, otherwise
+ * the creation of further caches will BUG().
+ */
+ if (INDEX_AC == INDEX_L3) {
+ set_up_list3s(cachep);
+ g_cpucache_up = PARTIAL_L3;
+ } else {
+ cachep->nodelists[numa_node_id()] =
+ &initkmem_list3[SIZE_AC];
+ g_cpucache_up = PARTIAL_AC;
+ }
} else {
- cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
+ cachep->array[smp_processor_id()] =
+ kmalloc(sizeof(struct arraycache_init),
+ GFP_KERNEL);
+
+ if (g_cpucache_up == PARTIAL_AC) {
+ set_up_list3s(cachep);
+ g_cpucache_up = PARTIAL_L3;
+ } else {
+ cachep->nodelists[numa_node_id()] =
+ kmalloc(sizeof(struct kmem_list3),
+ GFP_KERNEL);
+ LIST3_INIT(cachep->nodelists[numa_node_id()]);
+ }
}
+ cachep->nodelists[numa_node_id()]->next_reap =
+ jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+
BUG_ON(!ac_data(cachep));
+ BUG_ON(!cachep->nodelists[numa_node_id()]);
ac_data(cachep)->avail = 0;
ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
ac_data(cachep)->batchcount = 1;
ac_data(cachep)->touched = 0;
cachep->batchcount = 1;
cachep->limit = BOOT_CPUCACHE_ENTRIES;
- cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
- + cachep->num;
}

- cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep/L1_CACHE_BYTES)%REAPTIMEOUT_LIST3;
#if DEBUG
cachep->redzonetest = jiffies + REDZONETIMEOUT +
((unsigned long)cachep/L1_CACHE_BYTES)%REDZONETIMEOUT;
@@ -1521,13 +1877,23 @@
{
#ifdef CONFIG_SMP
check_irq_off();
- BUG_ON(spin_trylock(&cachep->spinlock));
+ BUG_ON(spin_trylock(&list3_data(cachep)->list_lock));
+#endif
+}
+
+static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
+{
+#ifdef CONFIG_SMP
+ check_irq_off();
+ BUG_ON(spin_trylock(&(cachep->nodelists[node])->list_lock));
#endif
}
+
#else
#define check_irq_off() do { } while(0)
#define check_irq_on() do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
+#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif

/*
@@ -1549,7 +1915,7 @@
}

static void drain_array_locked(kmem_cache_t* cachep,
- struct array_cache *ac, int force);
+ struct array_cache *ac, int force, int node);

static void do_drain(void *arg)
{
@@ -1558,59 +1924,84 @@

check_irq_off();
ac = ac_data(cachep);
- spin_lock(&cachep->spinlock);
- free_block(cachep, &ac_entry(ac)[0], ac->avail);
- spin_unlock(&cachep->spinlock);
+ spin_lock(&list3_data(cachep)->list_lock);
+ free_block(cachep, ac->entry, ac->avail);
+ spin_unlock(&list3_data(cachep)->list_lock);
ac->avail = 0;
}

static void drain_cpu_caches(kmem_cache_t *cachep)
{
+ struct kmem_list3 *l3;
+ int i;
+
smp_call_function_all_cpus(do_drain, cachep);
check_irq_on();
spin_lock_irq(&cachep->spinlock);
- if (cachep->lists.shared)
- drain_array_locked(cachep, cachep->lists.shared, 1);
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ l3 = cachep->nodelists[i];
+ if (l3) {
+ spin_lock(&l3->list_lock);
+ drain_array_locked(cachep, l3->shared, 1, i);
+ spin_unlock(&l3->list_lock);
+#ifdef CONFIG_NUMA
+ if(l3->alien)
+ drain_alien_cache(cachep, l3);
+#endif
+ }
+ }
spin_unlock_irq(&cachep->spinlock);
}

-
-/* NUMA shrink all list3s */
-static int __cache_shrink(kmem_cache_t *cachep)
+static int __node_shrink(kmem_cache_t *cachep, int node)
{
struct slab *slabp;
+ struct kmem_list3 *l3 = cachep->nodelists[node];
int ret;

- drain_cpu_caches(cachep);
-
- check_irq_on();
- spin_lock_irq(&cachep->spinlock);
-
for(;;) {
struct list_head *p;

- p = cachep->lists.slabs_free.prev;
- if (p == &cachep->lists.slabs_free)
+ p = l3->slabs_free.prev;
+ if (p == &l3->slabs_free)
break;

- slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
+ slabp = list_entry(l3->slabs_free.prev, struct slab, list);
#if DEBUG
if (slabp->inuse)
BUG();
#endif
list_del(&slabp->list);

- cachep->lists.free_objects -= cachep->num;
- spin_unlock_irq(&cachep->spinlock);
+ l3->free_objects -= cachep->num;
+ spin_unlock_irq(&l3->list_lock);
slab_destroy(cachep, slabp);
- spin_lock_irq(&cachep->spinlock);
+ spin_lock_irq(&l3->list_lock);
}
- ret = !list_empty(&cachep->lists.slabs_full) ||
- !list_empty(&cachep->lists.slabs_partial);
- spin_unlock_irq(&cachep->spinlock);
+ ret = !list_empty(&l3->slabs_full) ||
+ !list_empty(&l3->slabs_partial);
return ret;
}

+static int __cache_shrink(kmem_cache_t *cachep)
+{
+ int ret = 0, i = 0;
+ struct kmem_list3 *l3;
+
+ drain_cpu_caches(cachep);
+
+ check_irq_on();
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ l3 = cachep->nodelists[i];
+ if (l3) {
+ spin_lock_irq(&l3->list_lock);
+ ret += __node_shrink(cachep, i);
+ spin_unlock_irq(&l3->list_lock);
+ }
+ }
+ return (ret ? 1 : 0);
+}
+
/**
* kmem_cache_shrink - Shrink a cache.
* @cachep: The cache to shrink.
@@ -1647,6 +2038,7 @@
int kmem_cache_destroy(kmem_cache_t * cachep)
{
int i;
+ struct kmem_list3 *l3;

if (!cachep || in_interrupt())
BUG();
@@ -1681,8 +2073,15 @@
kfree(cachep->array[i]);

/* NUMA: free the list3 structures */
- kfree(cachep->lists.shared);
- cachep->lists.shared = NULL;
+ for(i = 0; i < MAX_NUMNODES; i++) {
+ if ((l3 = cachep->nodelists[i])) {
+ kfree(l3->shared);
+#ifdef CONFIG_NUMA
+ free_alien_cache(l3->alien);
+#endif
+ kfree(l3);
+ }
+ }
kmem_cache_free(&cache_cache, cachep);

unlock_cpu_hotplug();
@@ -1801,6 +2200,7 @@
size_t offset;
unsigned int local_flags;
unsigned long ctor_flags;
+ struct kmem_list3 *l3;

/* Be lazy and only check for valid flags here,
* keeping it out of the critical path in kmem_cache_alloc().
@@ -1832,6 +2232,7 @@

spin_unlock(&cachep->spinlock);

+ check_irq_off();
if (local_flags & __GFP_WAIT)
local_irq_enable();

@@ -1843,8 +2244,9 @@
*/
kmem_flagcheck(cachep, flags);

-
- /* Get mem for the objs. */
+ /* Get mem for the objs.
+ * Attempt to allocate a physical page from 'nodeid',
+ */
if (!(objp = kmem_getpages(cachep, flags, nodeid)))
goto failed;

@@ -1852,6 +2254,9 @@
if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
goto opps1;

+#ifdef CONFIG_NUMA
+ slabp->nodeid = nodeid;
+#endif
set_slab_attr(cachep, slabp, objp);

cache_init_objs(cachep, slabp, ctor_flags);
@@ -1859,13 +2264,14 @@
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
- spin_lock(&cachep->spinlock);
+ l3 = cachep->nodelists[nodeid];
+ spin_lock(&l3->list_lock);

/* Make slab active. */
- list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
+ list_add_tail(&slabp->list, &(l3->slabs_free));
STATS_INC_GROWN(cachep);
- list3_data(cachep)->free_objects += cachep->num;
- spin_unlock(&cachep->spinlock);
+ l3->free_objects += cachep->num;
+ spin_unlock(&l3->list_lock);
return 1;
opps1:
kmem_freepages(cachep, objp);
@@ -1971,7 +2377,6 @@
kmem_bufctl_t i;
int entries = 0;

- check_spinlock_acquired(cachep);
/* Check slab's freelist to see if this obj is there. */
for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
entries++;
@@ -2016,8 +2421,9 @@
}
l3 = list3_data(cachep);

- BUG_ON(ac->avail > 0);
- spin_lock(&cachep->spinlock);
+ BUG_ON(ac->avail > 0 || !l3);
+ spin_lock(&l3->list_lock);
+
if (l3->shared) {
struct array_cache *shared_array = l3->shared;
if (shared_array->avail) {
@@ -2025,8 +2431,9 @@
batchcount = shared_array->avail;
shared_array->avail -= batchcount;
ac->avail = batchcount;
- memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
- sizeof(void*)*batchcount);
+ memcpy(ac->entry,
+ &(shared_array->entry[shared_array->avail]),
+ sizeof(void*)*batchcount);
shared_array->touched = 1;
goto alloc_done;
}
@@ -2053,7 +2460,8 @@
STATS_SET_HIGH(cachep);

/* get obj pointer */
- ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
+ ac->entry[ac->avail++] = slabp->s_mem +
+ slabp->free*cachep->objsize;

slabp->inuse++;
next = slab_bufctl(slabp)[slabp->free];
@@ -2075,12 +2483,12 @@
must_grow:
l3->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&cachep->spinlock);
+ spin_unlock(&l3->list_lock);

if (unlikely(!ac->avail)) {
int x;
- x = cache_grow(cachep, flags, -1);
-
+ x = cache_grow(cachep, flags, numa_node_id());
+
// cache_grow can reenable interrupts, then ac could change.
ac = ac_data(cachep);
if (!x && ac->avail == 0) // no objects in sight? abort
@@ -2090,7 +2498,7 @@
goto retry;
}
ac->touched = 1;
- return ac_entry(ac)[--ac->avail];
+ return ac->entry[--ac->avail];
}

static inline void
@@ -2171,7 +2579,7 @@
if (likely(ac->avail)) {
STATS_INC_ALLOCHIT(cachep);
ac->touched = 1;
- objp = ac_entry(ac)[--ac->avail];
+ objp = ac->entry[--ac->avail];
} else {
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags);
@@ -2181,29 +2589,102 @@
return objp;
}

-/*
- * NUMA: different approach needed if the spinlock is moved into
- * the l3 structure
+#ifdef CONFIG_NUMA
+/*
+ * A interface to enable slab creation on nodeid
*/
+static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
+{
+ struct list_head *entry;
+ struct slab *slabp;
+ struct kmem_list3 *l3;
+ void *obj;
+ kmem_bufctl_t next;
+ int x;

+ l3 = cachep->nodelists[nodeid];
+ BUG_ON(!l3);
+
+retry:
+ spin_lock(&l3->list_lock);
+ entry = l3->slabs_partial.next;
+ if (entry == &l3->slabs_partial) {
+ l3->free_touched = 1;
+ entry = l3->slabs_free.next;
+ if (entry == &l3->slabs_free)
+ goto must_grow;
+ }
+
+ slabp = list_entry(entry, struct slab, list);
+ check_spinlock_acquired_node(cachep, nodeid);
+ check_slabp(cachep, slabp);
+
+ STATS_INC_NODEALLOCS(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ BUG_ON(slabp->inuse == cachep->num);
+
+ /* get obj pointer */
+ obj = slabp->s_mem + slabp->free*cachep->objsize;
+ slabp->inuse++;
+ next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_ALLOC;
+#endif
+ slabp->free = next;
+ check_slabp(cachep, slabp);
+ l3->free_objects--;
+ /* move slabp to correct slabp list: */
+ list_del(&slabp->list);
+
+ if (slabp->free == BUFCTL_END) {
+ list_add(&slabp->list, &l3->slabs_full);
+ } else {
+ list_add(&slabp->list, &l3->slabs_partial);
+ }
+
+ spin_unlock(&l3->list_lock);
+ goto done;
+
+must_grow:
+ spin_unlock(&l3->list_lock);
+ x = cache_grow(cachep, flags, nodeid);
+
+ if (!x)
+ return NULL;
+
+ goto retry;
+done:
+ return obj;
+}
+#endif
+
+/*
+ * Caller needs to acquire correct kmem_list's list_lock
+ */
static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
{
int i;
-
- check_spinlock_acquired(cachep);
-
- /* NUMA: move add into loop */
- cachep->lists.free_objects += nr_objects;
+ struct kmem_list3 *l3;

for (i = 0; i < nr_objects; i++) {
void *objp = objpp[i];
struct slab *slabp;
unsigned int objnr;
+ int nodeid = 0;

slabp = GET_PAGE_SLAB(virt_to_page(objp));
+#ifdef CONFIG_NUMA
+ nodeid = slabp->nodeid;
+#endif
+ l3 = cachep->nodelists[nodeid];
list_del(&slabp->list);
objnr = (objp - slabp->s_mem) / cachep->objsize;
+ check_spinlock_acquired_node(cachep, nodeid);
check_slabp(cachep, slabp);
+
+
#if 0 /* disabled, not compatible with leak detection */
if (slab_bufctl(slabp)[objnr] != BUFCTL_ALLOC) {
printk(KERN_ERR "slab: double free detected in cache "
@@ -2215,24 +2696,23 @@
slabp->free = objnr;
STATS_DEC_ACTIVE(cachep);
slabp->inuse--;
+ l3->free_objects++;
check_slabp(cachep, slabp);

/* fixup slab chains */
if (slabp->inuse == 0) {
- if (cachep->lists.free_objects > cachep->free_limit) {
- cachep->lists.free_objects -= cachep->num;
+ if (l3->free_objects > l3->free_limit) {
+ l3->free_objects -= cachep->num;
slab_destroy(cachep, slabp);
} else {
- list_add(&slabp->list,
- &list3_data_ptr(cachep, objp)->slabs_free);
+ list_add(&slabp->list, &l3->slabs_free);
}
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&slabp->list,
- &list3_data_ptr(cachep, objp)->slabs_partial);
+ list_add_tail(&slabp->list, &l3->slabs_partial);
}
}
}
@@ -2240,36 +2720,38 @@
static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
{
int batchcount;
+ struct kmem_list3 *l3;

batchcount = ac->batchcount;
#if DEBUG
BUG_ON(!batchcount || batchcount > ac->avail);
#endif
check_irq_off();
- spin_lock(&cachep->spinlock);
- if (cachep->lists.shared) {
- struct array_cache *shared_array = cachep->lists.shared;
+ l3 = list3_data(cachep);
+ spin_lock(&l3->list_lock);
+ if (l3->shared) {
+ struct array_cache *shared_array = l3->shared;
int max = shared_array->limit-shared_array->avail;
if (max) {
if (batchcount > max)
batchcount = max;
- memcpy(&ac_entry(shared_array)[shared_array->avail],
- &ac_entry(ac)[0],
+ memcpy(&(shared_array->entry[shared_array->avail]),
+ ac->entry,
sizeof(void*)*batchcount);
shared_array->avail += batchcount;
goto free_done;
}
}

- free_block(cachep, &ac_entry(ac)[0], batchcount);
+ free_block(cachep, ac->entry, batchcount);
free_done:
#if STATS
{
int i = 0;
struct list_head *p;

- p = list3_data(cachep)->slabs_free.next;
- while (p != &(list3_data(cachep)->slabs_free)) {
+ p = l3->slabs_free.next;
+ while (p != &(l3->slabs_free)) {
struct slab *slabp;

slabp = list_entry(p, struct slab, list);
@@ -2281,12 +2763,13 @@
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&cachep->spinlock);
+ spin_unlock(&l3->list_lock);
ac->avail -= batchcount;
- memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
+ memmove(ac->entry, &(ac->entry[batchcount]),
sizeof(void*)*ac->avail);
}

+
/*
* __cache_free
* Release an obj back to its cache. If the obj has a constructed
@@ -2301,14 +2784,47 @@
check_irq_off();
objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));

+ /* Make sure we are not freeing a object from another
+ * node to the array cache on this cpu.
+ */
+#ifdef CONFIG_NUMA
+ {
+ struct slab *slabp;
+ slabp = GET_PAGE_SLAB(virt_to_page(objp));
+ if (unlikely(slabp->nodeid != numa_node_id())) {
+ struct array_cache *alien = NULL;
+ int nodeid = slabp->nodeid;
+ struct kmem_list3 *l3 = list3_data(cachep);
+
+ STATS_INC_NODEFREES(cachep);
+ if (l3->alien && l3->alien[nodeid]) {
+ alien = l3->alien[nodeid];
+ spin_lock(&alien->lock);
+ if (unlikely(alien->avail == alien->limit))
+ __drain_alien_cache(cachep,
+ alien, nodeid);
+ alien->entry[alien->avail++] = objp;
+ spin_unlock(&alien->lock);
+ }
+ else {
+ spin_lock(&(cachep->nodelists[nodeid])->
+ list_lock);
+ free_block(cachep, &objp, 1);
+ spin_unlock(&(cachep->nodelists[nodeid])->
+ list_lock);
+ }
+ return;
+ }
+ }
+#endif
if (likely(ac->avail < ac->limit)) {
STATS_INC_FREEHIT(cachep);
- ac_entry(ac)[ac->avail++] = objp;
+ ac->entry[ac->avail++] = objp;
return;
} else {
STATS_INC_FREEMISS(cachep);
cache_flusharray(cachep, ac);
- ac_entry(ac)[ac->avail++] = objp;
+ ac->entry[ac->avail++] = objp;
}
}

@@ -2378,78 +2894,24 @@
* Identical to kmem_cache_alloc, except that this function is slow
* and can sleep. And it will allocate memory on the given node, which
* can improve the performance for cpu bound structures.
+ * New and improved: it will now make sure that the object gets
+ * put on the correct node list so that there is no false sharing.
*/
void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
{
- int loop;
- void *objp;
- struct slab *slabp;
- kmem_bufctl_t next;
-
- for (loop = 0;;loop++) {
- struct list_head *q;
-
- objp = NULL;
- check_irq_on();
- spin_lock_irq(&cachep->spinlock);
- /* walk through all partial and empty slab and find one
- * from the right node */
- list_for_each(q,&cachep->lists.slabs_partial) {
- slabp = list_entry(q, struct slab, list);
-
- if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
- loop > 2)
- goto got_slabp;
- }
- list_for_each(q, &cachep->lists.slabs_free) {
- slabp = list_entry(q, struct slab, list);
-
- if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
- loop > 2)
- goto got_slabp;
- }
- spin_unlock_irq(&cachep->spinlock);
-
- local_irq_disable();
- if (!cache_grow(cachep, flags, nodeid)) {
- local_irq_enable();
- return NULL;
- }
- local_irq_enable();
- }
-got_slabp:
- /* found one: allocate object */
- check_slabp(cachep, slabp);
- check_spinlock_acquired(cachep);
-
- STATS_INC_ALLOCED(cachep);
- STATS_INC_ACTIVE(cachep);
- STATS_SET_HIGH(cachep);
- STATS_INC_NODEALLOCS(cachep);
-
- objp = slabp->s_mem + slabp->free*cachep->objsize;
+ unsigned long save_flags;
+ void *ptr;

- slabp->inuse++;
- next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
- slab_bufctl(slabp)[slabp->free] = BUFCTL_ALLOC;
-#endif
- slabp->free = next;
- check_slabp(cachep, slabp);
+ if (nodeid == numa_node_id() || nodeid == -1)
+ return __cache_alloc(cachep, flags);

- /* move slabp to correct slabp list: */
- list_del(&slabp->list);
- if (slabp->free == BUFCTL_END)
- list_add(&slabp->list, &cachep->lists.slabs_full);
- else
- list_add(&slabp->list, &cachep->lists.slabs_partial);
-
- list3_data(cachep)->free_objects--;
- spin_unlock_irq(&cachep->spinlock);
+ cache_alloc_debugcheck_before(cachep, flags);
+ local_irq_save(save_flags);
+ ptr = __cache_alloc_node(cachep, flags, nodeid);
+ local_irq_restore(save_flags);
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));

- objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
- __builtin_return_address(0));
- return objp;
+ return ptr;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);

@@ -2679,43 +3141,126 @@
{
struct list_head *q;
struct slab *slabp;
+ int i;
+ struct kmem_list3 *l3;

check_spinlock_acquired(cachep);

- list_for_each(q,&cachep->lists.slabs_full) {
- slabp = list_entry(q, struct slab, list);
+ for( i=0; i<MAX_NUMNODES; i++) {
+ l3 = cachep->nodelists[i];
+ if (!l3 || !is_node_online(i))
+ continue;
+
+ list_for_each(q,&l3->slabs_full) {
+ slabp = list_entry(q, struct slab, list);

- if (slabp->inuse != cachep->num) {
- printk(KERN_INFO "slab %s: wrong slabp found in full slab chain at %p (%d/%d).\n",
- cachep->name, slabp, slabp->inuse, cachep->num);
+ if (slabp->inuse != cachep->num) {
+ printk(KERN_INFO "slab %s: wrong slabp found in full slab chain at %p (%d/%d).\n",
+ cachep->name, slabp, slabp->inuse, cachep->num);
+ }
+ check_slabp(cachep, slabp);
+ check_slabuse(cachep, slabp);
}
- check_slabp(cachep, slabp);
- check_slabuse(cachep, slabp);
- }
- list_for_each(q,&cachep->lists.slabs_partial) {
- slabp = list_entry(q, struct slab, list);
+ list_for_each(q,&l3->slabs_partial) {
+ slabp = list_entry(q, struct slab, list);

- if (slabp->inuse == cachep->num || slabp->inuse == 0) {
- printk(KERN_INFO "slab %s: wrong slab found in partial chain at %p (%d/%d).\n",
- cachep->name, slabp, slabp->inuse, cachep->num);
+ if (slabp->inuse == cachep->num || slabp->inuse == 0) {
+ printk(KERN_INFO "slab %s: wrong slab found in partial chain at %p (%d/%d).\n",
+ cachep->name, slabp, slabp->inuse, cachep->num);
+ }
+ check_slabp(cachep, slabp);
+ check_slabuse(cachep, slabp);
}
- check_slabp(cachep, slabp);
- check_slabuse(cachep, slabp);
- }
- list_for_each(q,&cachep->lists.slabs_free) {
- slabp = list_entry(q, struct slab, list);
+ list_for_each(q,&l3->slabs_free) {
+ slabp = list_entry(q, struct slab, list);

- if (slabp->inuse != 0) {
- printk(KERN_INFO "slab %s: wrong slab found in free chain at %p (%d/%d).\n",
- cachep->name, slabp, slabp->inuse, cachep->num);
+ if (slabp->inuse != 0) {
+ printk(KERN_INFO "slab %s: wrong slab found in free chain at %p (%d/%d).\n",
+ cachep->name, slabp, slabp->inuse, cachep->num);
+ }
+ check_slabp(cachep, slabp);
+ check_slabuse(cachep, slabp);
}
- check_slabp(cachep, slabp);
- check_slabuse(cachep, slabp);
}
}

#endif

+/*
+ * This initializes kmem_list3 for all nodes.
+ */
+static int alloc_kmemlist(kmem_cache_t *cachep)
+{
+ int node, i;
+ struct kmem_list3 *l3;
+ int err = 0;
+
+ for(i=0; i < NR_CPUS; i++) {
+ if (cpu_online(i)) {
+ struct array_cache *nc = NULL, *new;
+#ifdef CONFIG_NUMA
+ struct array_cache **new_alien = NULL;
+#endif
+ node = cpu_to_node(i);
+#ifdef CONFIG_NUMA
+ if (!(new_alien = alloc_alien_cache(i, cachep->limit)))
+ goto fail;
+#endif
+ if (!(new = alloc_arraycache(i, (cachep->shared*
+ cachep->batchcount), 0xbaadf00d)))
+ goto fail;
+ if ((l3 = cachep->nodelists[node])) {
+
+ spin_lock_irq(&l3->list_lock);
+
+ if ((nc = cachep->nodelists[node]->shared))
+ free_block(cachep, nc->entry,
+ nc->avail);
+
+ l3->shared = new;
+#ifdef CONFIG_NUMA
+ if (!cachep->nodelists[node]->alien) {
+ l3->alien = new_alien;
+ new_alien = NULL;
+ }
+ l3->free_limit = (1 + nr_cpus_node(node))*
+ cachep->batchcount + cachep->num;
+#else
+ l3->free_limit = (1 + num_online_cpus())*
+ cachep->batchcount + cachep->num;
+#endif
+ spin_unlock_irq(&l3->list_lock);
+ kfree(nc);
+#ifdef CONFIG_NUMA
+ free_alien_cache(new_alien);
+#endif
+ continue;
+ }
+ if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
+ GFP_KERNEL, node)))
+ goto fail;
+
+ LIST3_INIT(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ l3->shared = new;
+#ifdef CONFIG_NUMA
+ l3->alien = new_alien;
+ l3->free_limit = (1 + nr_cpus_node(node))*
+ cachep->batchcount + cachep->num;
+#else
+ l3->free_limit = (1 + num_online_cpus())*
+ cachep->batchcount + cachep->num;
+#endif
+ cachep->nodelists[node] = l3;
+ }
+ }
+ return err;
+fail:
+ err = -ENOMEM;
+ return err;
+}
+
struct ccupdate_struct {
kmem_cache_t *cachep;
struct array_cache *new[NR_CPUS];
@@ -2738,8 +3283,7 @@
int shared)
{
struct ccupdate_struct new;
- struct array_cache *new_shared;
- int i;
+ int i, err;

memset(&new.new,0,sizeof(new.new));
for (i = 0; i < NR_CPUS; i++) {
@@ -2756,36 +3300,30 @@
new.cachep = cachep;

smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
-
+
check_irq_on();
spin_lock_irq(&cachep->spinlock);
cachep->batchcount = batchcount;
cachep->limit = limit;
- cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
+ cachep->shared = shared;
spin_unlock_irq(&cachep->spinlock);

for (i = 0; i < NR_CPUS; i++) {
struct array_cache *ccold = new.new[i];
if (!ccold)
continue;
- spin_lock_irq(&cachep->spinlock);
- free_block(cachep, ac_entry(ccold), ccold->avail);
- spin_unlock_irq(&cachep->spinlock);
+ spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+ free_block(cachep, ccold->entry, ccold->avail);
+ spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
kfree(ccold);
}
- new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
- if (new_shared) {
- struct array_cache *old;
-
- spin_lock_irq(&cachep->spinlock);
- old = cachep->lists.shared;
- cachep->lists.shared = new_shared;
- if (old)
- free_block(cachep, ac_entry(old), old->avail);
- spin_unlock_irq(&cachep->spinlock);
- kfree(old);
- }

+ err = alloc_kmemlist(cachep);
+ if (err) {
+ printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
+ cachep->name, -err);
+ BUG();
+ }
return 0;
}

@@ -2843,11 +3381,11 @@
}

static void drain_array_locked(kmem_cache_t *cachep,
- struct array_cache *ac, int force)
+ struct array_cache *ac, int force, int node)
{
int tofree;

- check_spinlock_acquired(cachep);
+ check_spinlock_acquired_node(cachep, node);
if (ac->touched && !force) {
ac->touched = 0;
} else if (ac->avail) {
@@ -2855,9 +3393,9 @@
if (tofree > ac->avail) {
tofree = (ac->avail+1)/2;
}
- free_block(cachep, ac_entry(ac), tofree);
+ free_block(cachep, ac->entry, tofree);
ac->avail -= tofree;
- memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
+ memmove(ac->entry, &(ac->entry[tofree]),
sizeof(void*)*ac->avail);
}
}
@@ -2876,6 +3414,7 @@
static void cache_reap(void *unused)
{
struct list_head *walk;
+ struct kmem_list3 *l3;

if (down_trylock(&cache_chain_sem)) {
/* Give up. Setup the next iteration. */
@@ -2896,33 +3435,40 @@

check_irq_on();

- spin_lock_irq(&searchp->spinlock);
+ l3 = list3_data(searchp);
+#ifdef CONFIG_NUMA
+ if (l3->alien)
+ drain_alien_cache(searchp, l3);
+#endif
+ spin_lock_irq(&l3->list_lock);

- drain_array_locked(searchp, ac_data(searchp), 0);
+ drain_array_locked(searchp, ac_data(searchp), 0,
+ numa_node_id());

#if DEBUG
- if(time_before(searchp->redzonetest, jiffies)) {
+ if (time_before(searchp->redzonetest, jiffies)) {
searchp->redzonetest = jiffies + REDZONETIMEOUT;
check_redzone(searchp);
}
#endif
- if(time_after(searchp->lists.next_reap, jiffies))
+ if (time_after(l3->next_reap, jiffies))
goto next_unlock;

- searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3;

- if (searchp->lists.shared)
- drain_array_locked(searchp, searchp->lists.shared, 0);
+ if (l3->shared)
+ drain_array_locked(searchp, l3->shared, 0,
+ numa_node_id());

- if (searchp->lists.free_touched) {
- searchp->lists.free_touched = 0;
+ if (l3->free_touched) {
+ l3->free_touched = 0;
goto next_unlock;
}

- tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
+ tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
do {
- p = list3_data(searchp)->slabs_free.next;
- if (p == &(list3_data(searchp)->slabs_free))
+ p = l3->slabs_free.next;
+ if (p == &(l3->slabs_free))
break;

slabp = list_entry(p, struct slab, list);
@@ -2935,13 +3481,13 @@
* searchp cannot disappear, we hold
* cache_chain_lock
*/
- searchp->lists.free_objects -= searchp->num;
- spin_unlock_irq(&searchp->spinlock);
+ l3->free_objects -= searchp->num;
+ spin_unlock_irq(&l3->list_lock);
slab_destroy(searchp, slabp);
- spin_lock_irq(&searchp->spinlock);
+ spin_lock_irq(&l3->list_lock);
} while(--tofree > 0);
next_unlock:
- spin_unlock_irq(&searchp->spinlock);
+ spin_unlock_irq(&l3->list_lock);
next:
cond_resched();
}
@@ -2974,7 +3520,7 @@
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#if STATS
seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
- " <error> <maxfreeable> <freelimit> <nodeallocs>");
+ " <error> <maxfreeable> <nodeallocs> <remotefrees>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
@@ -3009,39 +3555,53 @@
unsigned long active_objs;
unsigned long num_objs;
unsigned long active_slabs = 0;
- unsigned long num_slabs;
- const char *name;
+ unsigned long num_slabs, free_objects = 0, shared_avail = 0;
+ const char *name;
char *error = NULL;
+ int i;
+ struct kmem_list3 *l3;

check_irq_on();
spin_lock_irq(&cachep->spinlock);
active_objs = 0;
num_slabs = 0;
- list_for_each(q,&cachep->lists.slabs_full) {
- slabp = list_entry(q, struct slab, list);
- if (slabp->inuse != cachep->num && !error)
- error = "slabs_full accounting error";
- active_objs += cachep->num;
- active_slabs++;
- }
- list_for_each(q,&cachep->lists.slabs_partial) {
- slabp = list_entry(q, struct slab, list);
- if (slabp->inuse == cachep->num && !error)
- error = "slabs_partial inuse accounting error";
- if (!slabp->inuse && !error)
- error = "slabs_partial/inuse accounting error";
- active_objs += slabp->inuse;
- active_slabs++;
- }
- list_for_each(q,&cachep->lists.slabs_free) {
- slabp = list_entry(q, struct slab, list);
- if (slabp->inuse && !error)
- error = "slabs_free/inuse accounting error";
- num_slabs++;
+ for( i=0; i<MAX_NUMNODES; i++) {
+ l3 = cachep->nodelists[i];
+ if (!l3 || !is_node_online(i))
+ continue;
+
+ spin_lock(&l3->list_lock);
+
+ list_for_each(q,&l3->slabs_full) {
+ slabp = list_entry(q, struct slab, list);
+ if (slabp->inuse != cachep->num && !error)
+ error = "slabs_full accounting error";
+ active_objs += cachep->num;
+ active_slabs++;
+ }
+ list_for_each(q,&l3->slabs_partial) {
+ slabp = list_entry(q, struct slab, list);
+ if (slabp->inuse == cachep->num && !error)
+ error = "slabs_partial inuse accounting error";
+ if (!slabp->inuse && !error)
+ error = "slabs_partial/inuse accounting error";
+ active_objs += slabp->inuse;
+ active_slabs++;
+ }
+ list_for_each(q,&l3->slabs_free) {
+ slabp = list_entry(q, struct slab, list);
+ if (slabp->inuse && !error)
+ error = "slabs_free/inuse accounting error";
+ num_slabs++;
+ }
+ free_objects += l3->free_objects;
+ shared_avail += l3->shared->avail;
+
+ spin_unlock(&l3->list_lock);
}
num_slabs+=active_slabs;
num_objs = num_slabs*cachep->num;
- if (num_objs - active_objs != cachep->lists.free_objects && !error)
+ if (num_objs - active_objs != free_objects && !error)
error = "free_objects accounting error";

name = cachep->name;
@@ -3053,9 +3613,9 @@
cachep->num, (1<<cachep->gfporder));
seq_printf(m, " : tunables %4u %4u %4u",
cachep->limit, cachep->batchcount,
- cachep->lists.shared->limit/cachep->batchcount);
- seq_printf(m, " : slabdata %6lu %6lu %6u",
- active_slabs, num_slabs, cachep->lists.shared->avail);
+ cachep->shared);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu",
+ active_slabs, num_slabs, shared_avail);
#if STATS
{ /* list3 stats */
unsigned long high = cachep->high_mark;
@@ -3064,12 +3624,13 @@
unsigned long reaped = cachep->reaped;
unsigned long errors = cachep->errors;
unsigned long max_freeable = cachep->max_freeable;
- unsigned long free_limit = cachep->free_limit;
unsigned long node_allocs = cachep->node_allocs;
+ unsigned long node_frees = cachep->node_frees;

- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
- allocs, high, grown, reaped, errors,
- max_freeable, free_limit, node_allocs);
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
+ %4lu %4lu %4lu %4lu",
+ allocs, high, grown, reaped, errors,
+ max_freeable, node_allocs, node_frees);
}
/* cpu stats */
{
@@ -3112,19 +3673,27 @@
{
#if DEBUG
struct list_head *q;
+ int node;
+ struct kmem_list3 *l3;

check_irq_on();
spin_lock_irq(&cachep->spinlock);
- list_for_each(q,&cachep->lists.slabs_full) {
- struct slab *slabp;
- int i;
- slabp = list_entry(q, struct slab, list);
- for (i = 0; i < cachep->num; i++) {
- unsigned long sym = slab_bufctl(slabp)[i];
+ for( node=0; node<MAX_NUMNODES; node++) {
+ l3 = cachep->nodelists[node];
+ if (!l3 || !is_node_online(node))
+ continue;

- printk("obj %p/%d: %p", slabp, i, (void *)sym);
- print_symbol(" <%s>", sym);
- printk("\n");
+ list_for_each(q,&l3->slabs_full) {
+ struct slab *slabp;
+ int i;
+ slabp = list_entry(q, struct slab, list);
+ for (i = 0; i < cachep->num; i++) {
+ unsigned long sym = slab_bufctl(slabp)[i];
+
+ printk("obj %p/%d: %p", slabp, i, (void *)sym);
+ print_symbol(" <%s>", sym);
+ printk("\n");
+ }
}
}
spin_unlock_irq(&cachep->spinlock);

2005-05-14 07:43:13

by Andrew Morton

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph Lameter <[email protected]> wrote:
>
> This patch allows kmalloc_node to be as fast as kmalloc by introducing
> node specific page lists for partial, free and full slabs.

Oh drat - what happened to all the coding-style fixups? Redone patch
below. Please merge - slab.c is already not a nice place to visit.

> +#ifndef CONFIG_NUMA
> +#if MAX_NUMNODES != 1
> +#error "Broken Configuration: CONFIG_NUMA not set but MAX_NUMNODES !=1 !!"
> +#endif
> +#endif

Well, that's going to make it fail to compile at all on ppc64.

> {
> #ifdef CONFIG_SMP
> check_irq_off();
> - BUG_ON(spin_trylock(&cachep->spinlock));
> + BUG_ON(spin_trylock(&list3_data(cachep)->list_lock));
> +#endif

We can use assert_spin_locked() here now, btw.
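
For what it's worth, a sketch of that check using assert_spin_locked()
(assuming the helper is available in this tree; this is not part of the
posted patch):

static void check_spinlock_acquired(kmem_cache_t *cachep)
{
#ifdef CONFIG_SMP
	check_irq_off();
	/* Replaces the BUG_ON(spin_trylock(...)) idiom quoted above. */
	assert_spin_locked(&list3_data(cachep)->list_lock);
#endif
}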


I hacked things to compile by setting NODES_SHIFT to zero and the machine
boots. I'll leave that hack in place for now, so -mm is busted on
ppc64 NUMA. Please sort things out with the ppc64 guys?


diff -puN mm/slab.c~numa-aware-slab-allocator-v2-tidy mm/slab.c
--- 25/mm/slab.c~numa-aware-slab-allocator-v2-tidy 2005-05-14 00:08:02.000000000 -0700
+++ 25-akpm/mm/slab.c 2005-05-14 00:16:41.000000000 -0700
@@ -356,7 +356,7 @@ static inline int index_of(const size_t
(parent)->list_lock = SPIN_LOCK_UNLOCKED; \
(parent)->free_objects = 0; \
(parent)->free_touched = 0; \
- } while(0)
+ } while (0)
#else

#define LIST3_INIT(parent) \
@@ -368,21 +368,21 @@ static inline int index_of(const size_t
(parent)->list_lock = SPIN_LOCK_UNLOCKED; \
(parent)->free_objects = 0; \
(parent)->free_touched = 0; \
- } while(0)
+ } while (0)
#endif

#define MAKE_LIST(cachep, listp, slab, nodeid) \
do { \
INIT_LIST_HEAD(listp); \
list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
- }while(0)
+ } while (0)

#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
do { \
MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
- MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
+ MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
- }while(0)
+ } while (0)

#define list3_data(cachep) \
((cachep->nodelists[numa_node_id()]))
@@ -807,15 +807,15 @@ static inline struct array_cache **alloc
if (limit > 1)
limit = 12;
ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
- if(ac_ptr) {
+ if (ac_ptr) {
for (i = 0; i < MAX_NUMNODES; i++) {
if (i == node) {
ac_ptr[i] = NULL;
continue;
}
ac_ptr[i] = alloc_arraycache(cpu, limit, 0xbaadf00d);
- if(!ac_ptr[i]) {
- for(i--; i <=0; i--)
+ if (!ac_ptr[i]) {
+ for (i--; i <=0; i--)
kfree(ac_ptr[i]);
kfree(ac_ptr);
return NULL;
@@ -829,7 +829,7 @@ static inline void free_alien_cache(stru
{
int i;

- if(!ac_ptr)
+ if (!ac_ptr)
return;
for (i = 0; i < MAX_NUMNODES; i++)
kfree(ac_ptr[i]);
@@ -841,7 +841,7 @@ static inline void __drain_alien_cache(k
{
struct kmem_list3 *rl3 = cachep->nodelists[node];

- if(ac->avail) {
+ if (ac->avail) {
spin_lock(&rl3->list_lock);
free_block(cachep, ac->entry, ac->avail);
ac->avail = 0;
@@ -857,7 +857,7 @@ static void drain_alien_cache(kmem_cache

for (i = 0; i < MAX_NUMNODES; i++) {
ac = l3->alien[i];
- if(ac) {
+ if (ac) {
spin_lock_irqsave(&ac->lock, flags);
__drain_alien_cache(cachep, ac, i);
spin_unlock_irqrestore(&ac->lock, flags);
@@ -891,12 +891,12 @@ static int __devinit cpuup_callback(stru
* node has not already allocated this
*/
if (!cachep->nodelists[node]) {
- if(!(l3 = kmalloc_node(memsize,
+ if (!(l3 = kmalloc_node(memsize,
GFP_KERNEL, node)))
goto bad;
LIST3_INIT(l3);
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;

cachep->nodelists[node] = l3;
}
@@ -919,8 +919,8 @@ static int __devinit cpuup_callback(stru

l3 = cachep->nodelists[node];
BUG_ON(!l3);
- if(!l3->shared) {
- if(!(nc = alloc_arraycache(cpu,
+ if (!l3->shared) {
+ if (!(nc = alloc_arraycache(cpu,
cachep->shared*cachep->batchcount,
0xbaadf00d)))
goto bad;
@@ -952,29 +952,29 @@ static int __devinit cpuup_callback(stru
cachep->array[cpu] = NULL;
l3 = cachep->nodelists[node];

- if(!l3)
+ if (!l3)
goto unlock_cache;

spin_lock(&l3->list_lock);

/* Free limit for this kmem_list3 */
l3->free_limit -= cachep->batchcount;
- if(nc)
+ if (nc)
free_block(cachep, nc->entry, nc->avail);

- if(!cpus_empty(mask)) {
+ if (!cpus_empty(mask)) {
spin_unlock(&l3->list_lock);
goto unlock_cache;
}

- if(l3->shared) {
+ if (l3->shared) {
free_block(cachep, l3->shared->entry,
l3->shared->avail);
kfree(l3->shared);
l3->shared = NULL;
}
#ifdef CONFIG_NUMA
- if(l3->alien) {
+ if (l3->alien) {
drain_alien_cache(cachep, l3);
free_alien_cache(l3->alien);
l3->alien = NULL;
@@ -982,13 +982,13 @@ static int __devinit cpuup_callback(stru
#endif

/* free slabs belonging to this node */
- if(__node_shrink(cachep, node)) {
+ if (__node_shrink(cachep, node)) {
cachep->nodelists[node] = NULL;
spin_unlock(&l3->list_lock);
kfree(l3);
- }
- else
+ } else {
spin_unlock(&l3->list_lock);
+ }
unlock_cache:
spin_unlock_irq(&cachep->spinlock);
kfree(nc);
@@ -1034,7 +1034,7 @@ void __init kmem_cache_init(void)
struct cache_names *names;
int i;

- for(i = 0; i < NUM_INIT_LISTS; i++) {
+ for (i = 0; i < NUM_INIT_LISTS; i++) {
LIST3_INIT(&initkmem_list3[i]);
if (i < MAX_NUMNODES)
cache_cache.nodelists[i] = NULL;
@@ -1101,16 +1101,19 @@ void __init kmem_cache_init(void)
(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

if (INDEX_AC != INDEX_L3)
- sizes[INDEX_L3].cs_cachep = kmem_cache_create(names[INDEX_L3].name,
- sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+ sizes[INDEX_L3].cs_cachep =
+ kmem_cache_create(names[INDEX_L3].name,
+ sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

while (sizes->cs_size != ULONG_MAX) {
- /* For performance, all the general caches are L1 aligned.
+ /*
+ * For performance, all the general caches are L1 aligned.
* This should be particularly beneficial on SMP boxes, as it
* eliminates "false sharing".
* Note for systems short on memory removing the alignment will
- * allow tighter packing of the smaller caches. */
+ * allow tighter packing of the smaller caches.
+ */
if(!sizes->cs_cachep)
sizes->cs_cachep = kmem_cache_create(names->name,
sizes->cs_size, ARCH_KMALLOC_MINALIGN,
@@ -1150,7 +1153,8 @@ void __init kmem_cache_init(void)
!= &initarray_generic.cache);
memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
sizeof(struct arraycache_init));
- malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = ptr;
+ malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
+ ptr;
local_irq_enable();
}
/* 5) Replace the bootstrap kmem_list3's */
@@ -1160,8 +1164,8 @@ void __init kmem_cache_init(void)
init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
numa_node_id());

- for (j=0; j < MAX_NUMNODES; j++) {
- if(is_node_online(j))
+ for (j = 0; j < MAX_NUMNODES; j++) {
+ if (is_node_online(j))
init_list(malloc_sizes[INDEX_L3].cs_cachep,
&initkmem_list3[SIZE_L3+j], j);
}
@@ -1489,8 +1493,9 @@ static void slab_destroy (kmem_cache_t *
static inline void set_up_list3s(kmem_cache_t *cachep)
{
int i;
- for(i = 0; i < MAX_NUMNODES; i++) {
- if(is_node_online(i)) {
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ if (is_node_online(i)) {
cachep->nodelists[i] = &initkmem_list3[SIZE_L3+i];
cachep->nodelists[i]->next_reap = jiffies +
REAPTIMEOUT_LIST3 +
@@ -1939,14 +1944,14 @@ static void drain_cpu_caches(kmem_cache_
smp_call_function_all_cpus(do_drain, cachep);
check_irq_on();
spin_lock_irq(&cachep->spinlock);
- for(i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < MAX_NUMNODES; i++) {
l3 = cachep->nodelists[i];
if (l3) {
spin_lock(&l3->list_lock);
drain_array_locked(cachep, l3->shared, 1, i);
spin_unlock(&l3->list_lock);
#ifdef CONFIG_NUMA
- if(l3->alien)
+ if (l3->alien)
drain_alien_cache(cachep, l3);
#endif
}
@@ -2074,7 +2079,7 @@ int kmem_cache_destroy(kmem_cache_t * ca
kfree(cachep->array[i]);

/* NUMA: free the list3 structures */
- for(i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < MAX_NUMNODES; i++) {
if ((l3 = cachep->nodelists[i])) {
kfree(l3->shared);
#ifdef CONFIG_NUMA
@@ -2092,8 +2097,8 @@ int kmem_cache_destroy(kmem_cache_t * ca
EXPORT_SYMBOL(kmem_cache_destroy);

/* Get the memory for a slab management obj. */
-static struct slab* alloc_slabmgmt(kmem_cache_t *cachep,
- void *objp, int colour_off, unsigned int __nocast local_flags)
+static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
+ int colour_off, unsigned int __nocast local_flags)
{
struct slab *slabp;

@@ -2124,7 +2129,7 @@ static void cache_init_objs(kmem_cache_t
int i;

for (i = 0; i < cachep->num; i++) {
- void* objp = slabp->s_mem+cachep->objsize*i;
+ void *objp = slabp->s_mem+cachep->objsize*i;
#if DEBUG
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON)
@@ -2806,8 +2811,7 @@ static inline void __cache_free(kmem_cac
alien, nodeid);
alien->entry[alien->avail++] = objp;
spin_unlock(&alien->lock);
- }
- else {
+ } else {
spin_lock(&(cachep->nodelists[nodeid])->
list_lock);
free_block(cachep, &objp, 1);
@@ -3196,7 +3200,7 @@ static int alloc_kmemlist(kmem_cache_t *
struct kmem_list3 *l3;
int err = 0;

- for(i=0; i < NR_CPUS; i++) {
+ for (i = 0; i < NR_CPUS; i++) {
if (cpu_online(i)) {
struct array_cache *nc = NULL, *new;
#ifdef CONFIG_NUMA
_

2005-05-14 16:24:47

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Sat, 14 May 2005, Andrew Morton wrote:

> Christoph Lameter <[email protected]> wrote:
> >
> > This patch allows kmalloc_node to be as fast as kmalloc by introducing
> > node specific page lists for partial, free and full slabs.
>
> Oh drat - what happened to all the coding-style fixups? Redone patch
> below. Please merge - slab.c is already not a nice place to visit.

Hmmm.. Strange...

> > +#ifndef CONFIG_NUMA
> > +#if MAX_NUMNODES != 1
> > +#error "Broken Configuration: CONFIG_NUMA not set but MAX_NUMNODES !=1 !!"
> > +#endif
> > +#endif
>
> Well that's going to make it fail to compile at all on ppc64.

That was intended. Better fail to compile than break on boot.

>
> > {
> > #ifdef CONFIG_SMP
> > check_irq_off();
> > - BUG_ON(spin_trylock(&cachep->spinlock));
> > + BUG_ON(spin_trylock(&list3_data(cachep)->list_lock));
> > +#endif
>
> We can use assert_spin_locked() here now btw.

ok.

> I hacked things to compile by setting NODES_SHIFT to zero and the machine
> boots. I'll leave that hack in place for the while, so -mm is busted on
> ppc64 NUMA. Please sort things out with the ppc64 guys?

Ok. However, this is a general issue with CONFIG_DISCONTIG being on while
CONFIG_NUMA is off. ppc64 will be fine with CONFIG_NUMA on, but not with
CONFIG_NUMA off and CONFIG_DISCONTIG on.

Would you put in Dave Hansen's fix that he posted in this thread?
It seems that we will finally evolve into a situation in which all of
this works itself out again.

Another solution would be to s/CONFIG_NUMA/CONFIG_DISCONTIG/ in the slab
allocator until the issues have been worked through.

Here is Dave's patch again:

=====================================================================
I think I found the problem. Could you try the attached patch?

As I said before FLATMEM is really referring to things like the
mem_map[] or max_mapnr.

CONFIG_NEED_MULTIPLE_NODES is what gets turned on for DISCONTIG or for
NUMA. We'll slowly be removing all of the DISCONTIG cases, so
eventually it will merge back to be one with NUMA.

-- Dave

--- clean/include/linux/numa.h.orig	2005-05-13 06:44:56.000000000 -0700
+++ clean/include/linux/numa.h	2005-05-13 06:52:05.000000000 -0700
@@ -3,7 +3,7 @@

#include <linux/config.h>

-#ifndef CONFIG_FLATMEM
+#ifdef CONFIG_NEED_MULTIPLE_NODES
#include <asm/numnodes.h>
#endif

=====================================================================

2005-05-16 05:01:10

by Andrew Morton

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph Lameter <[email protected]> wrote:
>
> Here is Dave's patch again:
>
> =====================================================================
> I think I found the problem. Could you try the attached patch?
>
> As I said before FLATMEM is really referring to things like the
> mem_map[] or max_mapnr.
>
> CONFIG_NEED_MULTIPLE_NODES is what gets turned on for DISCONTIG or for
> NUMA. We'll slowly be removing all of the DISCONTIG cases, so
> eventually it will merge back to be one with NUMA.
>
> -- Dave
>
> --- clean/include/linux/numa.h.orig	2005-05-13 06:44:56.000000000 -0700
> +++ clean/include/linux/numa.h	2005-05-13 06:52:05.000000000 -0700
> @@ -3,7 +3,7 @@
>
> #include <linux/config.h>
>
> -#ifndef CONFIG_FLATMEM
> +#ifdef CONFIG_NEED_MULTIPLE_NODES
> #include <asm/numnodes.h>
> #endif

Nope.

mm/slab.c:117:2: #error "Broken Configuration: CONFIG_NUMA not set but MAX_NUMNODES !=1 !!"

2005-05-16 13:53:21

by Dave Hansen

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Fri, 2005-05-13 at 18:24 -0700, Christoph Lameter wrote:
> /*
> + * Some Linux kernels currently have weird notions of NUMA. Make sure that
> + * there is only a single node if CONFIG_NUMA is not set. Remove this check
> + * after the situation has stabilized.
> + */
> +#ifndef CONFIG_NUMA
> +#if MAX_NUMNODES != 1
> +#error "Broken Configuration: CONFIG_NUMA not set but MAX_NUMNODES !=1 !!"
> +#endif
> +#endif

There are some broken assumptions in the kernel that
CONFIG_DISCONTIG==CONFIG_NUMA. These usually manifest when code assumes
that one pg_data_t means one NUMA node.

However, NUMA node ids are actually distinct from "discontigmem nodes".
A "discontigmem node" is just one physically contiguous area of memory,
thus one pg_data_t. Some (non-NUMA) Mac G5's have a gap in their
address space, so they get two discontigmem nodes.

So, that #error is bogus. It's perfectly valid to have multiple
discontigmem nodes, when the number of NUMA nodes is 1. MAX_NUMNODES
refers to discontigmem nodes, not NUMA nodes.

In current -mm, you can use CONFIG_NEED_MULTIPLE_NODES to mean 'NUMA ||
DISCONTIG'.
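
For reference, this is roughly how include/linux/numa.h ends up looking
with that change (a sketch from memory, not a verbatim copy of the -mm
tree):

#ifdef CONFIG_NEED_MULTIPLE_NODES
#include <asm/numnodes.h>	/* the architecture supplies NODES_SHIFT */
#endif

#ifndef NODES_SHIFT
#define NODES_SHIFT	0	/* single node */
#endif

#define MAX_NUMNODES	(1 << NODES_SHIFT)

So a DISCONTIG=y, NUMA=n build (like the ppc64 one above) can
legitimately end up with MAX_NUMNODES greater than 1.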

-- Dave

2005-05-16 16:47:51

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Mon, 16 May 2005, Dave Hansen wrote:

> There are some broken assumptions in the kernel that
> CONFIG_DISCONTIG==CONFIG_NUMA. These usually manifest when code assumes
> that one pg_data_t means one NUMA node.
>
> However, NUMA node ids are actually distinct from "discontigmem nodes".
> A "discontigmem node" is just one physically contiguous area of memory,
> thus one pg_data_t. Some (non-NUMA) Mac G5's have a gap in their
> address space, so they get two discontigmem nodes.

I thought the discontiguous memory in one node was handled through zones?
I.e. ZONE_HIGHMEM in i386?

> So, that #error is bogus. It's perfectly valid to have multiple
> discontigmem nodes, when the number of NUMA nodes is 1. MAX_NUMNODES
> refers to discontigmem nodes, not NUMA nodes.

Ok. We looked through the code and saw that the check may be removed
without causing problems. However, there is still a feeling of uneasiness
about this.

To what node does numa_node_id() refer? And is it legit to use
numa_node_id() to index cpu maps and stuff? How do the concepts of numa
node id relate to discontig node ids?

2005-05-16 17:22:42

by Dave Hansen

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Mon, 2005-05-16 at 09:47 -0700, Christoph Lameter wrote:
> On Mon, 16 May 2005, Dave Hansen wrote:
> > There are some broken assumptions in the kernel that
> > CONFIG_DISCONTIG==CONFIG_NUMA. These usually manifest when code assumes
> > that one pg_data_t means one NUMA node.
> >
> > However, NUMA node ids are actually distinct from "discontigmem nodes".
> > A "discontigmem node" is just one physically contiguous area of memory,
> > thus one pg_data_t. Some (non-NUMA) Mac G5's have a gap in their
> > address space, so they get two discontigmem nodes.
>
> I thought the discontigous memory in one node was handled through zones?
> I.e. ZONE_HIGHMEM in i386?

You can only have one zone of each type under each pg_data_t. For
instance, you can't properly represent (DMA, NORMAL, HIGHMEM, <GAP>,
HIGHMEM) in a single pg_data_t without wasting node_mem_map[] space.
The "proper" discontig way of representing that is like this:

pg_data_t[0] (DMA, NORMAL, HIGHMEM)
<GAP>
pg_data_t[1] (---, ------, HIGHMEM)

Where pg_data_t[1] has empty DMA and NORMAL zones. Also, remember that
both of these could theoretically be on the same NUMA node. But, I
don't think we ever do that in practice.

> > So, that #error is bogus. It's perfectly valid to have multiple
> > discontigmem nodes, when the number of NUMA nodes is 1. MAX_NUMNODES
> > refers to discontigmem nodes, not NUMA nodes.
>
> Ok. We looked through the code and saw that the check may be removed
> without causing problems. However, there is still a feeling of uneasiness
> about this.

I don't blame you :)

> To what node does numa_node_id() refer?

That refers to the NUMA node that you're thinking of. Close CPUs and
memory and I/O, etc...

> And it is legit to use
> numa_node_id() to index cpu maps and stuff?

Yes, those are all NUMA nodes.

> How do the concepts of numa node id relate to discontig node ids?

I believe there are quite a few assumptions on some architectures that,
when NUMA is on, they are equivalent. It appears to be pretty much
assumed everywhere that CONFIG_NUMA=y means one pg_data_t per NUMA node.

Remember, as you saw, you can't assume that MAX_NUMNODES=1 when NUMA=n
because of the DISCONTIG=y case.

So, in summary, if you want to do it right: use the
CONFIG_NEED_MULTIPLE_NODES that you see in -mm. As plain DISCONTIG=y
gets replaced by sparsemem any code using this is likely to stay
working.

-- Dave

2005-05-16 17:54:59

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Mon, 16 May 2005, Dave Hansen wrote:

> > How do the concepts of numa node id relate to discontig node ids?
>
> I believe there are quite a few assumptions on some architectures that,
> when NUMA is on, they are equivalent. It appears to be pretty much
> assumed everywhere that CONFIG_NUMA=y means one pg_data_t per NUMA node.

Ah. That sounds much better.

> Remember, as you saw, you can't assume that MAX_NUMNODES=1 when NUMA=n
> because of the DISCONTIG=y case.

I have never seen such a machine. An SMP machine with multiple
"nodes"? So essentially one NUMA node has multiple discontig "nodes"?

This means that the concept of a node suddenly changes if there is just
one numa node (CONFIG_NUMA off implies one numa node)?

> So, in summary, if you want to do it right: use the
> CONFIG_NEED_MULTIPLE_NODES that you see in -mm. As plain DISCONTIG=y
> gets replaced by sparsemem any code using this is likely to stay
> working.

s/CONFIG_NUMA/CONFIG_NEED_MULTIPLE_NODES?

That will not work because the idea is to localize the slabs to each
node.

If there are multiple nodes per numa node then invariably one node in the
numa node (sorry for this duplication of what node means but I did not
do it) must be preferred since numa_node_id() does not return a set of
discontig nodes.

Sorry, but this all sounds like a flaw in the design. There is no
consistent notion of node. Are you sure that this is not a ppc64 screwup?

2005-05-16 18:11:30

by Martin Bligh

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

>> > How do the concepts of numa node id relate to discontig node ids?
>>
>> I believe there are quite a few assumptions on some architectures that,
>> when NUMA is on, they are equivalent. It appears to be pretty much
>> assumed everywhere that CONFIG_NUMA=y means one pg_data_t per NUMA node.
>
> Ah. That sounds much better.
>
>> Remember, as you saw, you can't assume that MAX_NUMNODES=1 when NUMA=n
>> because of the DISCONTIG=y case.
>
> I have never seen such a machine. A SMP machine with multiple
> "nodes"? So essentially one NUMA node has multiple discontig "nodes"?

I believe you (SGI) make one ;-) Anywhere where you have large gaps in
the physical address range within a node, this is what you really need.
Except ia64 has this weird virtual mem_map thing that can go away once
we have sparsemem.

> This means that the concept of a node suddenly changes if there is just
> one numa node(CONFIG_NUMA off implies one numa node)?

The end point of where we're getting to is 1 node = 1 pgdat (which we can
then rename to struct node or something). All this confusing mess of
config options is just a migration path, which I'll leave to Andy to
explain ;-)

> s/CONFIG_NUMA/CONFIG_NEED_MULTIPLE_NODES?
>
> That will not work because the idea is the localize the slabs to each
> node.
>
> If there are multiple nodes per numa node then invariable one node in the
> numa node (sorry for this duplication of what node means but I did not
> do it) must be preferred since numa_node_id() does not return a set of
> discontig nodes.
>
> Sorry but this all sounds like an flaw in the design. There is no
> consistent notion of node. Are you sure that this is not a ppc64 screwup?

No, it's a discontigmem screwup. Currently a pgdat represents 2 different
scenarios:

(1) physically discontiguous memory chunk.
(2) a NUMA node.

I don't think we support both at the same time with the old code. So it
seems to me like your numa aware slab code (which I'm still intending to
go read, but haven't yet) is only interested in real nodes. Logically
speaking, that would be CONFIG_NUMA. The current transition config options
are a bit of a mess ... Andy, I presume CONFIG_NEED_MULTIPLE_NODES is
really CONFIG_NEED_MULTIPLE_PGDATS ?

M.

2005-05-16 18:15:17

by Dave Hansen

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Mon, 2005-05-16 at 10:54 -0700, Christoph Lameter wrote:
> > Remember, as you saw, you can't assume that MAX_NUMNODES=1 when NUMA=n
> > because of the DISCONTIG=y case.
>
> I have never seen such a machine. A SMP machine with multiple
> "nodes"?

Yes. "discontigmem nodes"

> So essentially one NUMA node has multiple discontig "nodes"?

Yes, in theory.

A discontig node is just a contiguous area of physical memory.

> This means that the concept of a node suddenly changes if there is just
> one numa node(CONFIG_NUMA off implies one numa node)?

Correct as well.

> > So, in summary, if you want to do it right: use the
> > CONFIG_NEED_MULTIPLE_NODES that you see in -mm. As plain DISCONTIG=y
> > gets replaced by sparsemem any code using this is likely to stay
> > working.
>
> s/CONFIG_NUMA/CONFIG_NEED_MULTIPLE_NODES?
>
> That will not work because the idea is the localize the slabs to each
> node.
>
> If there are multiple nodes per numa node then invariable one node in the
> numa node (sorry for this duplication of what node means but I did not
> do it) must be preferred since numa_node_id() does not return a set of
> discontig nodes.

I know it's confusing. I feel your pain :)

You're right, I think you completely want CONFIG_NUMA, not
NEED_MULTIPLE_NODES. So, toss out that #ifdef, and everything should be
in pretty good shape. Just don't make any assumptions about how many
'struct zone' or 'pg_data_t's a single "node's" pages can come from.

Although it doesn't help your issue, you may want to read the comments
in here, I wrote it when my brain was twisting around the same issues:

http://www.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.12-rc4/2.6.12-rc4-mm2/broken-out/introduce-new-kconfig-option-for-numa-or-discontig.patch

> Sorry but this all sounds like an flaw in the design. There is no
> consistent notion of node.

It's not really a flaw in the design, it's a misinterpretation of the
original design as new architectures implemented things. I hope to
completely ditch DISCONTIGMEM, eventually.

> Are you sure that this is not a ppc64 screwup?

Yeah, ppc64 is not at fault, it just provides the most obvious exposure
of the issue.

-- Dave

2005-05-16 21:13:41

by Jesse Barnes

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Monday, May 16, 2005 11:08 am, Martin J. Bligh wrote:
> > I have never seen such a machine. A SMP machine with multiple
> > "nodes"? So essentially one NUMA node has multiple discontig
> > "nodes"?
>
> I believe you (SGI) make one ;-) Anywhere where you have large gaps
> in the physical address range within a node, this is what you really
> need. Except ia64 has this wierd virtual mem_map thing that can go
> away once we have sparsemem.

Right, the SGI boxes have discontiguous memory within a node, but it's
not represented by pgdats (like you said, one 'virtual memmap' spans
the whole address space of a node). Sparse can help simplify this
across platforms, but has the potential to be more expensive for
systems with dynamically sized holes, due to the additional calculation
and potential cache miss associated with indexing into the correct
memmap (Dave can probably correct me here, it's been awhile). With a
virtual memmap, you only occasionally take a TLB miss on the struct
page access after indexing into the array.

> The end point of where we're getting to is 1 node = 1 pgdat (which we
> can then rename to struct node or something). All this confusing mess
> of config options is just a migration path, which I'll leave it to
> Andy to explain ;-)

Yes!

> No, it's a discontigmem screwup. Currently a pgdat represents 2
> different scenarios:
>
> (1) physically discontiguous memory chunk.
> (2) a NUMA node.
>
> I don't think we support both at the same time with the old code. So
> it seems to me like your numa aware slab code (which I'm still
> intending to go read, but haven't yet) is only interested in real
> nodes. Logically speaking, that would be CONFIG_NUMA. The current
> transition config options are a bit of a mess ... Andy, I presume
> CONFIG_NEED_MULTIPLE_NODES is really CONFIG_NEED_MULTIPLE_PGDATS ?

Yeah, makes sense for the NUMA aware slab allocator to depend on
CONFIG_NUMA.

Jesse

2005-05-16 21:26:02

by Martin Bligh

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

> Right, the SGI boxes have discontiguous memory within a node, but it's
> not represented by pgdats (like you said, one 'virtual memmap' spans
> the whole address space of a node). Sparse can help simplify this
> across platforms, but has the potential to be more expensive for
> systems with dynamically sized holes, due to the additional calculation
> and potential cache miss associated with indexing into the correct
> memmap (Dave can probably correct me here, it's been awhile). With a
> virtual memmap, you only occasionally take a TLB miss on the struct
> page access after indexing into the array.

That's exactly what was brilliant about Andy's code ... it fixed that,
there shouldn't be extra references ...

>> transition config options are a bit of a mess ... Andy, I presume
>> CONFIG_NEED_MULTIPLE_NODES is really CONFIG_NEED_MULTIPLE_PGDATS ?
>
> Yeah, makes sense for the NUMA aware slab allocator to depend on
> CONFIG_NUMA.

Andy confirmed offline that this is really CONFIG_NEED_MULTIPLE_PGDATS,
and is just named wrong.

M.

2005-05-16 22:02:47

by Dave Hansen

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Mon, 2005-05-16 at 14:10 -0700, Jesse Barnes wrote:
> On Monday, May 16, 2005 11:08 am, Martin J. Bligh wrote:
> > > I have never seen such a machine. A SMP machine with multiple
> > > "nodes"? So essentially one NUMA node has multiple discontig
> > > "nodes"?
> >
> > I believe you (SGI) make one ;-) Anywhere where you have large gaps
> > in the physical address range within a node, this is what you really
> > need. Except ia64 has this wierd virtual mem_map thing that can go
> > away once we have sparsemem.
>
> Right, the SGI boxes have discontiguous memory within a node, but it's
> not represented by pgdats (like you said, one 'virtual memmap' spans
> the whole address space of a node). Sparse can help simplify this
> across platforms, but has the potential to be more expensive for
> systems with dynamically sized holes, due to the additional calculation
> and potential cache miss associated with indexing into the correct
> memmap (Dave can probably correct me here, it's been awhile). With a
> virtual memmap, you only occasionally take a TLB miss on the struct
> page access after indexing into the array.

The sparsemem calculation costs are quite low. One of the main costs is
bringing the actual 'struct page' into the cache so you can use the
hints in page->flags. In reality, after almost every pfn_to_page(), you
go ahead and touch the 'struct page' anyway. So, this cost is
effectively zero. In fact, it's kinda like doing a prefetch, so it may
even speed some things up.

After you have the section index from page->flags (which costs just a
shift and a mask), you access into a static array, and do a single
subtraction. Here's the i386 disassembly of this function with
SPARSEMEM=y:

unsigned long page_to_pfn_stub(struct page *page)
{
return page_to_pfn(page);
}

1c30: 8b 54 24 04 mov 0x4(%esp),%edx
1c34: 8b 02 mov (%edx),%eax
1c36: c1 e8 1a shr $0x1a,%eax
1c39: 8b 04 85 00 00 00 00 mov 0x0(,%eax,4),%eax
1c40: 24 fc and $0xfc,%al
1c42: 29 c2 sub %eax,%edx
1c44: c1 fa 05 sar $0x5,%edx
1c47: 89 d0 mov %edx,%eax
1c49: c3 ret

Other than popping the arguments off the stack, I think there are only
two loads in there: the page->flags load, and the mem_section[]
dereference. So, in the end, the only advantage of the vmem_map[]
approach is saving that _one_ load. The worst-case-scenario for this
load in the sparsemem case is a full cache miss. The worst case in the
vmem_map[] case is a TLB miss, which is probably hundreds of times
slower than even a full cache miss.
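
In C terms, the lookup above amounts to something like the toy model
below (a user-space sketch; the names, sizes and the flags encoding are
made up for illustration and are not the -mm implementation):

#include <stdio.h>
#include <stdlib.h>

/*
 * Toy model of the sparsemem page_to_pfn() path: each "section" has its
 * own separately allocated mem_map, and the pointer stored in
 * mem_section[] is pre-biased by the section's first pfn, so a single
 * subtraction yields the global pfn (matching the one sub in the
 * disassembly above).
 */
#define SECTION_SHIFT		26	/* section id kept in top bits of flags */
#define PAGES_PER_SECTION	1024

struct page { unsigned long flags; };

static struct page *mem_section[4];	/* biased per-section mem_map pointers */

static unsigned long page_to_pfn_model(struct page *page)
{
	unsigned long section = page->flags >> SECTION_SHIFT;

	return page - mem_section[section];	/* the one extra load vs. vmem_map */
}

int main(void)
{
	unsigned long s, i;

	for (s = 0; s < 4; s++) {
		struct page *map = calloc(PAGES_PER_SECTION, sizeof(*map));

		if (!map)
			return 1;
		for (i = 0; i < PAGES_PER_SECTION; i++)
			map[i].flags = s << SECTION_SHIFT;
		/* bias by the section's starting pfn */
		mem_section[s] = map - s * PAGES_PER_SECTION;
	}

	/* page 5 of section 2 is global pfn 2*1024 + 5 = 2053 */
	printf("pfn = %lu\n",
	       page_to_pfn_model(&mem_section[2][2 * PAGES_PER_SECTION + 5]));
	return 0;
}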

BTW, the object footprint of sparsemem is lower than discontigmem, too:

SPARSEMEM DISCONTIGMEM
pfn_to_page: 25b 41b
page_to_pfn: 25b 33b

So, that helps out things like icache footprint.

-- Dave

2005-05-17 00:15:57

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Mon, 16 May 2005, Martin J. Bligh wrote:

> > Yeah, makes sense for the NUMA aware slab allocator to depend on
> > CONFIG_NUMA.
>
> Andy confirmed offline that this is really CONFIG_NEED_MULTIPLE_PGDATS,
> and is just named wrong.

Hmmm.. In this case it may be necessary for the slab allocator to
determine what is the proper number of NUMA nodes. I do not really like it
but it seems that we need the following patch to rectify the situation.

Index: linux-2.6.12-rc4/mm/slab.c
===================================================================
--- linux-2.6.12-rc4.orig/mm/slab.c 2005-05-16 16:58:44.000000000 -0700
+++ linux-2.6.12-rc4/mm/slab.c 2005-05-16 17:04:11.000000000 -0700
@@ -112,10 +112,12 @@
* there is only a single node if CONFIG_NUMA is not set. Remove this check
* after the situation has stabilized.
*/
-#ifndef CONFIG_NUMA
-#if MAX_NUMNODES != 1
-#error "Broken Configuration: CONFIG_NUMA not set but MAX_NUMNODES !=1 !!"
-#endif
+#ifdef CONFIG_NUMA
+#define NUMA_NODES MAX_NUMNODES
+#define NUMA_NODE_ID numa_node_id()
+#else
+#define NUMA_NODES 1
+#define NUMA_NODE_ID 0
#endif

/*
@@ -311,7 +313,7 @@
/*
* Need this for bootstrapping a per node allocator.
*/
-#define NUM_INIT_LISTS (2 + MAX_NUMNODES)
+#define NUM_INIT_LISTS (2 + NUMA_NODES)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define CACHE_CACHE 0
#define SIZE_AC 1
@@ -385,7 +387,7 @@
} while (0)

#define list3_data(cachep) \
- ((cachep->nodelists[numa_node_id()]))
+ ((cachep->nodelists[NUMA_NODE_ID]))

/* NUMA: per-node */
#define list3_data_ptr(cachep, ptr) \
@@ -405,7 +407,7 @@
unsigned int shared;
unsigned int objsize;
/* 2) touched by every alloc & free from the backend */
- struct kmem_list3 *nodelists[MAX_NUMNODES];
+ struct kmem_list3 *nodelists[NUMA_NODES];
unsigned int flags; /* constant flags */
unsigned int num; /* # of objs per slab */
spinlock_t spinlock;
@@ -792,7 +794,7 @@
static inline struct array_cache **alloc_alien_cache(int cpu, int limit)
{
struct array_cache **ac_ptr;
- int memsize = sizeof(void*)*MAX_NUMNODES;
+ int memsize = sizeof(void*)*NUMA_NODES;
int node = cpu_to_node(cpu);
int i;

@@ -800,7 +802,7 @@
limit = 12;
ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
if (ac_ptr) {
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < NUMA_NODES; i++) {
if (i == node) {
ac_ptr[i] = NULL;
continue;
@@ -823,7 +825,7 @@

if (!ac_ptr)
return;
- for (i = 0; i < MAX_NUMNODES; i++)
+ for (i = 0; i < NUMA_NODES; i++)
kfree(ac_ptr[i]);

kfree(ac_ptr);
@@ -847,7 +849,7 @@
struct array_cache *ac;
unsigned long flags;

- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < NUMA_NODES; i++) {
ac = l3->alien[i];
if (ac) {
spin_lock_irqsave(&ac->lock, flags);
@@ -1028,7 +1030,7 @@

for (i = 0; i < NUM_INIT_LISTS; i++) {
LIST3_INIT(&initkmem_list3[i]);
- if (i < MAX_NUMNODES)
+ if (i < NUMA_NODES)
cache_cache.nodelists[i] = NULL;
}

@@ -1065,7 +1067,7 @@
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
- cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
+ cache_cache.nodelists[NUMA_NODE_ID] = &initkmem_list3[CACHE_CACHE];

cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());

@@ -1154,7 +1156,7 @@
int node;
/* Replace the static kmem_list3 structures for the boot cpu */
init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
- numa_node_id());
+ NUMA_NODE_ID);

for_each_online_node(node) {
init_list(malloc_sizes[INDEX_L3].cs_cachep,
@@ -1163,7 +1165,7 @@
if (INDEX_AC != INDEX_L3) {
init_list(malloc_sizes[INDEX_AC].cs_cachep,
&initkmem_list3[SIZE_AC],
- numa_node_id());
+ NUMA_NODE_ID);
}
}

@@ -1778,7 +1780,7 @@
set_up_list3s(cachep);
g_cpucache_up = PARTIAL_L3;
} else {
- cachep->nodelists[numa_node_id()] =
+ cachep->nodelists[NUMA_NODE_ID] =
&initkmem_list3[SIZE_AC];
g_cpucache_up = PARTIAL_AC;
}
@@ -1791,18 +1793,18 @@
set_up_list3s(cachep);
g_cpucache_up = PARTIAL_L3;
} else {
- cachep->nodelists[numa_node_id()] =
+ cachep->nodelists[NUMA_NODE_ID] =
kmalloc(sizeof(struct kmem_list3),
GFP_KERNEL);
- LIST3_INIT(cachep->nodelists[numa_node_id()]);
+ LIST3_INIT(cachep->nodelists[NUMA_NODE_ID]);
}
}
- cachep->nodelists[numa_node_id()]->next_reap =
+ cachep->nodelists[NUMA_NODE_ID]->next_reap =
jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep)%REAPTIMEOUT_LIST3;

BUG_ON(!ac_data(cachep));
- BUG_ON(!cachep->nodelists[numa_node_id()]);
+ BUG_ON(!cachep->nodelists[NUMA_NODE_ID]);
ac_data(cachep)->avail = 0;
ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
ac_data(cachep)->batchcount = 1;
@@ -1986,7 +1988,7 @@
drain_cpu_caches(cachep);

check_irq_on();
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < NUMA_NODES; i++) {
l3 = cachep->nodelists[i];
if (l3) {
spin_lock_irq(&l3->list_lock);
@@ -2068,7 +2070,7 @@
kfree(cachep->array[i]);

/* NUMA: free the list3 structures */
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < NUMA_NODES; i++) {
if ((l3 = cachep->nodelists[i])) {
kfree(l3->shared);
#ifdef CONFIG_NUMA
@@ -2482,7 +2484,7 @@

if (unlikely(!ac->avail)) {
int x;
- x = cache_grow(cachep, flags, numa_node_id());
+ x = cache_grow(cachep, flags, NUMA_NODE_ID);

// cache_grow can reenable interrupts, then ac could change.
ac = ac_data(cachep);
@@ -2786,7 +2788,7 @@
{
struct slab *slabp;
slabp = GET_PAGE_SLAB(virt_to_page(objp));
- if (unlikely(slabp->nodeid != numa_node_id())) {
+ if (unlikely(slabp->nodeid != NUMA_NODE_ID)) {
struct array_cache *alien = NULL;
int nodeid = slabp->nodeid;
struct kmem_list3 *l3 = list3_data(cachep);
@@ -2896,7 +2898,7 @@
unsigned long save_flags;
void *ptr;

- if (nodeid == numa_node_id() || nodeid == -1)
+ if (nodeid == NUMA_NODE_ID || nodeid == -1)
return __cache_alloc(cachep, flags);

cache_alloc_debugcheck_before(cachep, flags);
@@ -3437,7 +3439,7 @@
spin_lock_irq(&l3->list_lock);

drain_array_locked(searchp, ac_data(searchp), 0,
- numa_node_id());
+ NUMA_NODE_ID);

#if DEBUG
if (time_before(searchp->redzonetest, jiffies)) {
@@ -3452,7 +3454,7 @@

if (l3->shared)
drain_array_locked(searchp, l3->shared, 0,
- numa_node_id());
+ NUMA_NODE_ID);

if (l3->free_touched) {
l3->free_touched = 0;

2005-05-17 00:27:43

by Dave Hansen

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

> +#ifdef CONFIG_NUMA
> +#define NUMA_NODES MAX_NUMNODES
> +#define NUMA_NODE_ID numa_node_id()
> +#else
> +#define NUMA_NODES 1
> +#define NUMA_NODE_ID 0
> #endif

I think numa_node_id() should always do what you want. It is never
related to discontig nodes, and #defines down to the same thing you have
in the end, anyway:

#define numa_node_id() (cpu_to_node(_smp_processor_id()))

asm-i386/topology.h
#ifdef CONFIG_NUMA
...
static inline int cpu_to_node(int cpu)
{
return cpu_2_node[cpu];
}

asm-generic/topology.h:
#ifndef cpu_to_node
#define cpu_to_node(cpu) (0)
#endif

As for the MAX_NUMNODES, I'd just continue to use it, instead of a new
#define. There is no case where there can be more NUMA nodes than
DISCONTIG nodes, and this assumption appears in plenty of other code.

I'm cc'ing Matt Dobson, who's touched this MAX_NUMNODES business a lot
more recently than I.

-- Dave

2005-05-17 23:30:52

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

Excuse the week-late response...

Andrew Morton wrote:
> Christoph Lameter <[email protected]> wrote:
>
>> Could we boot the box without quiet so that we can get better debug
>> messages?
>
>
> It didn't produce anything interesting. For some reason the console output
> stops when start_kernel() runs console_init() (I guess it all comes out
> later) so the machine is running blind when we run kmem_cache_init().
> Irritating. I just moved the console_init() call to happen later on.
>
> It's going BUG() in kmem_cache_init()->set_up_list3s->is_node_online
> because for some reason the !CONFIG_NUMA ppc build has MAX_NUMNODES=16,
> even though there's only one node.
>
> Doing
>
> #define is_node_online(node) node_online(node)

As Dave Hansen mentioned elsewhere in this thread, there is no need to
define this is_node_online() macro, as node_online() does EXACTLY the same
thing (minus the BUG() which is probably overkill).


> unconditionally fixes that up (your patch shuld be using
> for_each_online_node() everywhere?) but it oopses later - I think it's the
> first time kmem_cache_alloc() is called.

Christoph should replace all the for (i = 0; i < MAX_NUMNODES; i++) loops
with for_each_node(i), and the one loop that does this:

	for (i = 0; i < MAX_NUMNODES; i++) {
		if (!node_online(i))
			continue;

(or something similar) with for_each_online_node(i).

Also, there is a similar loop for CPUs which should be replaced with
for_each_online_cpu(i).

These for_each_FOO macros are cleaner and less likely to break in the
future, since we can simply modify the one definition if the way to
iterate over nodes/cpus changes, rather than auditing 100 open-coded
implementations and trying to determine the intent of the loop's author.
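
As a concrete example (illustrative, not a hunk from any posted patch),
the kmem_cache_init() loop quoted earlier in the thread would go from:

	for (j = 0; j < MAX_NUMNODES; j++) {
		if (is_node_online(j))
			init_list(malloc_sizes[INDEX_L3].cs_cachep,
					&initkmem_list3[SIZE_L3 + j], j);
	}

to the equivalent, self-documenting:

	for_each_online_node(j)
		init_list(malloc_sizes[INDEX_L3].cs_cachep,
				&initkmem_list3[SIZE_L3 + j], j);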

-Matt

2005-05-17 23:43:26

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Dave Hansen wrote:
>>+#ifdef CONFIG_NUMA
>>+#define NUMA_NODES MAX_NUMNODES
>>+#define NUMA_NODE_ID numa_node_id()
>>+#else
>>+#define NUMA_NODES 1
>>+#define NUMA_NODE_ID 0
>> #endif
>
>
> I think numa_node_id() should always do what you want. It is never
> related to discontig nodes, and #defines down to the same thing you have
> in the end, anyway:
>
> #define numa_node_id() (cpu_to_node(_smp_processor_id()))
>
> asm-i386/topology.h
> #ifdef CONFIG_NUMA
> ...
> static inline int cpu_to_node(int cpu)
> {
> return cpu_2_node[cpu];
> }
>
> asm-generic/topology.h:
> #ifndef cpu_to_node
> #define cpu_to_node(cpu) (0)
> #endif
>
> As for the MAX_NUMNODES, I'd just continue to use it, instead of a new
> #define. There is no case where there can be more NUMA nodes than
> DISCONTIG nodes, and this assumption appears in plenty of other code.
>
> I'm cc'ing Matt Dobson, who's touched this MAX_NUMNODES business a lot
> more recently than I.
>
> -- Dave


You're right, Dave. The series of #defines at the top resolves to the same
thing as numa_node_id(). Adding the above #defines will serve only to
obfuscate the code.

Another thing that will really help, Christoph, would be replacing all your
open-coded for (i = 0; i < MAX_NUMNODES/NR_CPUS; i++) loops. We have
macros that make that all nice and clean and (should?) do the right thing
for various combinations of SMP/DISCONTIG/NUMA/etc. Use those and if they
DON'T do the right thing, please let me know and we'll fix them ASAP.

for_each_cpu(i)
for_each_online_cpu(i)
for_each_node(i)
for_each_online_node(i)

Those 4 macros should replace all your open-coded loops, Christoph.

-Matt

2005-05-17 23:52:46

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Tue, 17 May 2005, Matthew Dobson wrote:

> You're right, Dave. The series of #defines at the top resolve to the same
> thing as numa_node_id(). Adding the above #defines will serve only to
> obfuscate the code.

Ok.

> Another thing that will really help, Christoph, would be replacing all your
> open-coded for (i = 0; i < MAX_NUMNODES/NR_CPUS; i++) loops. We have
> macros that make that all nice and clean and (should?) do the right thing
> for various combinations of SMP/DISCONTIG/NUMA/etc. Use those and if they
> DON'T do the right thing, please let me know and we'll fix them ASAP.

Some of that was already done but I can check again.

2005-05-18 01:08:27

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V2

On Tue, 17 May 2005, Matthew Dobson wrote:

> Also, there is a similar loop for CPUs which should be replaced with
> for_each_online_cpu(i).
>
> These for_each_FOO macros are cleaner and less likely to break in the
> future, since we can simply modify the one definition if the way to
> itterate over nodes/cpus changes, rather than auditing 100 open coded
> implementations and trying to determine the intent of the loop's author.

Ok. Done.

2005-05-18 17:28:04

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph Lameter wrote:
> On Tue, 17 May 2005, Matthew Dobson wrote:
>
>
>>You're right, Dave. The series of #defines at the top resolve to the same
>>thing as numa_node_id(). Adding the above #defines will serve only to
>>obfuscate the code.
>
>
> Ok.
>
>
>>Another thing that will really help, Christoph, would be replacing all your
>>open-coded for (i = 0; i < MAX_NUMNODES/NR_CPUS; i++) loops. We have
>>macros that make that all nice and clean and (should?) do the right thing
>>for various combinations of SMP/DISCONTIG/NUMA/etc. Use those and if they
>>DON'T do the right thing, please let me know and we'll fix them ASAP.
>
>
> Some of that was already done but I can check again.

Thanks! I just looked at V2 & V3 of the patch and saw some open-coded
loops. I may have missed a later version of the patch which has fixes.
Feel free to CC me on future versions of the patch...

-Matt

2005-05-18 17:50:24

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Wed, 18 May 2005, Matthew Dobson wrote:

> Thanks! I just looked at V2 & V3 of the patch and saw some open-coded
> loops. I may have missed a later version of the patch which has fixes.
> Feel free to CC me on future versions of the patch...

I will when I get everything together. The hold up at the moment is that
Martin has found a boot failure with the new slab allocator on ppc64 that
I am unable to explain.

Strangely, the panic is in the page allocator. I have no means of
testing since I do not have a ppc64 system available. Could you help me
figure out what is going on?

2005-05-18 21:15:37

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph Lameter wrote:
> On Wed, 18 May 2005, Matthew Dobson wrote:
>
>
>>Thanks! I just looked at V2 & V3 of the patch and saw some open-coded
>>loops. I may have missed a later version of the patch which has fixes.
>>Feel free to CC me on future versions of the patch...
>
>
> I will when I get everything together. The hold up at the moment is that
> Martin has found a boot failure with the new slab allocator on ppc64 that
> I am unable to explain.
>
> Strangely, the panic is in the page allocator. I have no means of
> testing since I do not have a ppc64 system available. Could you help me
> figure out what is going on?

I can't promise anything, but if you send me the latest version of your
patch (preferably with the loops fixed to eliminate the possibility of it
accessing an unavailable/unusable node), I can build & boot it on a PPC64
box and see what happens.

-Matt

2005-05-18 21:41:25

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Wed, 18 May 2005, Matthew Dobson wrote:

> I can't promise anything, but if you send me the latest version of your
> patch (preferably with the loops fixed to eliminate the possibility of it
> accessing an unavailable/unusable node), I can build & boot it on a PPC64
> box and see what happens.

Ok. Maybe fixing one of the other issues addressed here will also fix this one.

------------------

Fixes to the slab allocator in 2.6.12-rc4-mm2

- Remove MAX_NUMNODES check
- use for_each_node/cpu
- Fix determination of INDEX_AC

Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Alok N Kataria <[email protected]>

Index: linux-2.6.12-rc4/mm/slab.c
===================================================================
--- linux-2.6.12-rc4.orig/mm/slab.c 2005-05-17 02:20:02.000000000 +0000
+++ linux-2.6.12-rc4/mm/slab.c 2005-05-18 21:36:51.000000000 +0000
@@ -108,17 +108,6 @@
#include <asm/page.h>

/*
- * Some Linux kernels currently have weird notions of NUMA. Make sure that
- * there is only a single node if CONFIG_NUMA is not set. Remove this check
- * after the situation has stabilized.
- */
-#ifndef CONFIG_NUMA
-#if MAX_NUMNODES != 1
-#error "Broken Configuration: CONFIG_NUMA not set but MAX_NUMNODES !=1 !!"
-#endif
-#endif
-
-/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
* SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -341,7 +330,7 @@
}
}

-#define INDEX_AC index_of(sizeof(struct array_cache))
+#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))

#ifdef CONFIG_NUMA
@@ -800,7 +789,7 @@
limit = 12;
ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
if (ac_ptr) {
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for_each_node(i) {
if (i == node) {
ac_ptr[i] = NULL;
continue;
@@ -823,7 +812,7 @@

if (!ac_ptr)
return;
- for (i = 0; i < MAX_NUMNODES; i++)
+ for_each_node(i)
kfree(ac_ptr[i]);

kfree(ac_ptr);
@@ -847,7 +836,7 @@
struct array_cache *ac;
unsigned long flags;

- for (i = 0; i < MAX_NUMNODES; i++) {
+ for_each_node(i) {
ac = l3->alien[i];
if (ac) {
spin_lock_irqsave(&ac->lock, flags);
@@ -1197,7 +1186,7 @@
* Register the timers that return unneeded
* pages to gfp.
*/
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ for_each_cpu(cpu) {
if (cpu_online(cpu))
start_cpu_timer(cpu);
}
@@ -1986,7 +1975,7 @@
drain_cpu_caches(cachep);

check_irq_on();
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for_each_node(i) {
l3 = cachep->nodelists[i];
if (l3) {
spin_lock_irq(&l3->list_lock);
@@ -2064,11 +2053,11 @@
/* no cpu_online check required here since we clear the percpu
* array on cpu offline and set this to NULL.
*/
- for (i = 0; i < NR_CPUS; i++)
+ for_each_cpu(i)
kfree(cachep->array[i]);

/* NUMA: free the list3 structures */
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for_each_node(i) {
if ((l3 = cachep->nodelists[i])) {
kfree(l3->shared);
#ifdef CONFIG_NUMA
@@ -2975,7 +2964,7 @@
if (!pdata)
return NULL;

- for (i = 0; i < NR_CPUS; i++) {
+ for_each_cpu(i) {
if (!cpu_possible(i))
continue;
pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL,
@@ -3075,7 +3064,7 @@
int i;
struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);

- for (i = 0; i < NR_CPUS; i++) {
+ for_each_cpu(i) {
if (!cpu_possible(i))
continue;
kfree(p->ptrs[i]);
@@ -3189,7 +3178,7 @@
struct kmem_list3 *l3;
int err = 0;

- for (i = 0; i < NR_CPUS; i++) {
+ for_each_cpu(i) {
if (cpu_online(i)) {
struct array_cache *nc = NULL, *new;
#ifdef CONFIG_NUMA
@@ -3280,7 +3269,7 @@
int i, err;

memset(&new.new,0,sizeof(new.new));
- for (i = 0; i < NR_CPUS; i++) {
+ for_each_cpu(i) {
if (cpu_online(i)) {
new.new[i] = alloc_arraycache(i, limit, batchcount);
if (!new.new[i]) {
@@ -3302,7 +3291,7 @@
cachep->shared = shared;
spin_unlock_irq(&cachep->spinlock);

- for (i = 0; i < NR_CPUS; i++) {
+ for_each_cpu(i) {
struct array_cache *ccold = new.new[i];
if (!ccold)
continue;

2005-05-19 05:08:28

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Wed, 18 May 2005, Christoph Lameter wrote:

> Fixes to the slab allocator in 2.6.12-rc4-mm2
> - Remove MAX_NUMNODES check
> - use for_each_node/cpu
> - Fix determination of INDEX_AC

Rats! The whole thing with cpu online and node online is not as easy as I
thought. There may be bugs in V3 of the numa slab allocator
because offline cpus and offline nodes are not properly handled. Maybe
that also contributed to the ppc64 issues.

The earlier patch fails if I boot an x86_64 NUMA kernel on an x86_64 single
processor system.

Here is a revised patch. It would be good if someone could review my use
of online_cpu / online_node etc. Is there some way to bring cpus
online and offline to test if this really works? It seems that the code in
alloc_percpu is suspect even in the old allocator because it may have
to allocate memory for non-present cpus.

-----
Fixes to the slab allocator in 2.6.12-rc4-mm2

- Remove MAX_NUMNODES check
- use for_each_node/cpu
- Fix determination of INDEX_AC

Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Alok N Kataria <[email protected]>

Index: linux-2.6.12-rc4/mm/slab.c
===================================================================
--- linux-2.6.12-rc4.orig/mm/slab.c 2005-05-18 21:20:49.000000000 -0700
+++ linux-2.6.12-rc4/mm/slab.c 2005-05-18 21:57:11.000000000 -0700
@@ -108,17 +108,6 @@
#include <asm/page.h>

/*
- * Some Linux kernels currently have weird notions of NUMA. Make sure that
- * there is only a single node if CONFIG_NUMA is not set. Remove this check
- * after the situation has stabilized.
- */
-#ifndef CONFIG_NUMA
-#if MAX_NUMNODES != 1
-#error "Broken Configuration: CONFIG_NUMA not set but MAX_NUMNODES !=1 !!"
-#endif
-#endif
-
-/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
* SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -341,7 +330,7 @@ static inline int index_of(const size_t
}
}

-#define INDEX_AC index_of(sizeof(struct array_cache))
+#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))

#ifdef CONFIG_NUMA
@@ -800,7 +789,7 @@ static inline struct array_cache **alloc
limit = 12;
ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
if (ac_ptr) {
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for_each_online_node(i) {
if (i == node) {
ac_ptr[i] = NULL;
continue;
@@ -823,7 +812,8 @@ static inline void free_alien_cache(stru

if (!ac_ptr)
return;
- for (i = 0; i < MAX_NUMNODES; i++)
+
+ for_each_online_node(i)
kfree(ac_ptr[i]);

kfree(ac_ptr);
@@ -847,7 +837,7 @@ static void drain_alien_cache(kmem_cache
struct array_cache *ac;
unsigned long flags;

- for (i = 0; i < MAX_NUMNODES; i++) {
+ for_each_online_node(i) {
ac = l3->alien[i];
if (ac) {
spin_lock_irqsave(&ac->lock, flags);
@@ -1197,10 +1187,8 @@ static int __init cpucache_init(void)
* Register the timers that return unneeded
* pages to gfp.
*/
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- if (cpu_online(cpu))
- start_cpu_timer(cpu);
- }
+ for_each_online_cpu(cpu)
+ start_cpu_timer(cpu);

return 0;
}
@@ -1986,7 +1974,7 @@ static int __cache_shrink(kmem_cache_t *
drain_cpu_caches(cachep);

check_irq_on();
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for_each_online_node(i) {
l3 = cachep->nodelists[i];
if (l3) {
spin_lock_irq(&l3->list_lock);
@@ -2061,14 +2049,11 @@ int kmem_cache_destroy(kmem_cache_t * ca
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
synchronize_rcu();

- /* no cpu_online check required here since we clear the percpu
- * array on cpu offline and set this to NULL.
- */
- for (i = 0; i < NR_CPUS; i++)
+ for_each_online_cpu(i)
kfree(cachep->array[i]);

/* NUMA: free the list3 structures */
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for_each_online_node(i) {
if ((l3 = cachep->nodelists[i])) {
kfree(l3->shared);
#ifdef CONFIG_NUMA
@@ -2975,9 +2960,12 @@ void *__alloc_percpu(size_t size, size_t
if (!pdata)
return NULL;

- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
+ /*
+ * Cannot use for_each_online cpus since a cpu may come online
+ * and we have no way of figuring out how to fix the array
+ * that we have allocated then....
+ */
+ for_each_cpu (i) {
pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL,
cpu_to_node(i));

@@ -3075,11 +3063,11 @@ free_percpu(const void *objp)
int i;
struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);

- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
+ /*
+ * We allocate for all cpus so we cannot use for online cpu here.
+ */
+ for_each_cpu(i)
kfree(p->ptrs[i]);
- }
kfree(p);
}
EXPORT_SYMBOL(free_percpu);
@@ -3189,65 +3177,63 @@ static int alloc_kmemlist(kmem_cache_t *
struct kmem_list3 *l3;
int err = 0;

- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_online(i)) {
- struct array_cache *nc = NULL, *new;
+ for_each_online_cpu(i) {
+ struct array_cache *nc = NULL, *new;
#ifdef CONFIG_NUMA
- struct array_cache **new_alien = NULL;
+ struct array_cache **new_alien = NULL;
#endif
- node = cpu_to_node(i);
+ node = cpu_to_node(i);
#ifdef CONFIG_NUMA
- if (!(new_alien = alloc_alien_cache(i, cachep->limit)))
- goto fail;
+ if (!(new_alien = alloc_alien_cache(i, cachep->limit)))
+ goto fail;
#endif
- if (!(new = alloc_arraycache(i, (cachep->shared*
- cachep->batchcount), 0xbaadf00d)))
- goto fail;
- if ((l3 = cachep->nodelists[node])) {
+ if (!(new = alloc_arraycache(i, (cachep->shared*
+ cachep->batchcount), 0xbaadf00d)))
+ goto fail;
+ if ((l3 = cachep->nodelists[node])) {

- spin_lock_irq(&l3->list_lock);
+ spin_lock_irq(&l3->list_lock);

- if ((nc = cachep->nodelists[node]->shared))
- free_block(cachep, nc->entry,
+ if ((nc = cachep->nodelists[node]->shared))
+ free_block(cachep, nc->entry,
nc->avail);

- l3->shared = new;
-#ifdef CONFIG_NUMA
- if (!cachep->nodelists[node]->alien) {
- l3->alien = new_alien;
- new_alien = NULL;
- }
- l3->free_limit = (1 + nr_cpus_node(node))*
- cachep->batchcount + cachep->num;
-#else
- l3->free_limit = (1 + num_online_cpus())*
- cachep->batchcount + cachep->num;
-#endif
- spin_unlock_irq(&l3->list_lock);
- kfree(nc);
-#ifdef CONFIG_NUMA
- free_alien_cache(new_alien);
-#endif
- continue;
- }
- if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
- GFP_KERNEL, node)))
- goto fail;
-
- LIST3_INIT(l3);
- l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
l3->shared = new;
#ifdef CONFIG_NUMA
- l3->alien = new_alien;
+ if (!cachep->nodelists[node]->alien) {
+ l3->alien = new_alien;
+ new_alien = NULL;
+ }
l3->free_limit = (1 + nr_cpus_node(node))*
cachep->batchcount + cachep->num;
#else
l3->free_limit = (1 + num_online_cpus())*
cachep->batchcount + cachep->num;
#endif
- cachep->nodelists[node] = l3;
+ spin_unlock_irq(&l3->list_lock);
+ kfree(nc);
+#ifdef CONFIG_NUMA
+ free_alien_cache(new_alien);
+#endif
+ continue;
}
+ if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
+ GFP_KERNEL, node)))
+ goto fail;
+
+ LIST3_INIT(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ l3->shared = new;
+#ifdef CONFIG_NUMA
+ l3->alien = new_alien;
+ l3->free_limit = (1 + nr_cpus_node(node))*
+ cachep->batchcount + cachep->num;
+#else
+ l3->free_limit = (1 + num_online_cpus())*
+ cachep->batchcount + cachep->num;
+#endif
+ cachep->nodelists[node] = l3;
}
return err;
fail:
@@ -3280,15 +3266,11 @@ static int do_tune_cpucache(kmem_cache_t
int i, err;

memset(&new.new,0,sizeof(new.new));
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_online(i)) {
- new.new[i] = alloc_arraycache(i, limit, batchcount);
- if (!new.new[i]) {
- for (i--; i >= 0; i--) kfree(new.new[i]);
- return -ENOMEM;
- }
- } else {
- new.new[i] = NULL;
+ for_each_online_cpu(i) {
+ new.new[i] = alloc_arraycache(i, limit, batchcount);
+ if (!new.new[i]) {
+ for (i--; i >= 0; i--) kfree(new.new[i]);
+ return -ENOMEM;
}
}
new.cachep = cachep;
@@ -3302,7 +3284,7 @@ static int do_tune_cpucache(kmem_cache_t
cachep->shared = shared;
spin_unlock_irq(&cachep->spinlock);

- for (i = 0; i < NR_CPUS; i++) {
+ for_each_online_cpu(i) {
struct array_cache *ccold = new.new[i];
if (!ccold)
continue;

2005-05-19 16:14:19

by Jesse Barnes

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Wednesday, May 18, 2005 10:07 pm, Christoph Lameter wrote:
> Here is a revised patch. Would be good if someone could review my use
> of online_cpu / online_node etc. Is there some way to bring cpus
> online and offline to test if this really works? Seems that the code
> in alloc_percpu is suspect even in the old allocator because it may
> have to allocate memory for non present cpus.

If you have hotplug enabled, I think you'll see an 'online' file that
you can echo 1 or 0 into, somewhere in /sys/devices/system/cpu/cpu0 for
example. It should work even on machines where it doesn't actually
power down the slot (it'll just remove it from the online map, and it
won't get scheduled on, etc.); at least it did last time I tested it.
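
For instance, a quick toggle along those lines could look like this (the
cpu1 path is an assumption following the pattern above; cpu0 often
cannot be taken offline):

#include <stdio.h>

/* write "0" to take the cpu offline, "1" to bring it back online */
static int set_cpu1_online(int online)
{
	FILE *f = fopen("/sys/devices/system/cpu/cpu1/online", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", online);
	return fclose(f);
}

int main(void)
{
	if (set_cpu1_online(0) || set_cpu1_online(1))
		perror("cpu1 online toggle");
	return 0;
}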

Jesse

2005-05-19 19:03:26

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph Lameter wrote:
> On Wed, 18 May 2005, Christoph Lameter wrote:
>
>>Fixes to the slab allocator in 2.6.12-rc4-mm2
>>- Remove MAX_NUMNODES check
>>- use for_each_node/cpu
>>- Fix determination of INDEX_AC
>
> Rats! The whole thing with cpu online and node online is not as easy as I
> thought. There may be bugs in V3 of the numa slab allocator
> because offline cpus and offline are not properly handled. Maybe
> that also contributed to the ppc64 issues.

Running this test through the "wringer" (aka building/booting on one of our
PPC64 boxen). I'll let you know if this fixes any problems.


> The earlier patch fails if I boot an x86_64 NUMA kernel on a x86_64 single
> processor system.
>
> Here is a revised patch. Would be good if someone could review my use
> of online_cpu / online_node etc. Is there some way to bring cpus
> online and offline to test if this really works? Seems that the code in
> alloc_percpu is suspect even in the old allocator because it may have
> to allocate memory for non present cpus.

I'll look through and see what I can tell you, but I gotta run to a meeting
now. :(

-Matt

2005-05-19 21:49:15

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

[IBM firmware splash screen, SMS boot menu, and boot spinner elided]
Elapsed time since release of system processors: 174425 mins 56 secs

Config file read, 1336 bytes
Welcome to yaboot version 1.3.11.SuSE
Enter "help" to get some basic usage information
boot: -- 0:conmux-control -- time-stamp -- May/19/05 13:39:50 --
autobench
Please wait, loading kernel...
Elf32 kernel loaded...

zImage starting: loaded at 0x400000
Allocating 0x8aa000 bytes for kernel ...
gunzipping (0x1c00000 <- 0x407000:0x67125a)...done 0x6ea568 bytes
0xd7e8 bytes of heap consumed, max in use 0xa248
initrd head: 0x0
OF stdout device is: /vdevice/vty@30000000
Hypertas detected, assuming LPAR !
command line: root=/dev/sda3 selinux=0 elevator=cfq autobench_args:
memory layout at init:
memory_limit : 0000000000000000 (16 MB aligned)
alloc_bottom : 00000000023be000
alloc_top : 0000000008000000
alloc_top_hi : 0000000100000000
rmo_top : 0000000008000000
ram_top : 0000000100000000
Looking for displays
instantiating rtas at 0x00000000077d9000... done
0000000000000000 : boot cpu 0000000000000000
0000000000000002 : starting cpu hw idx 0000000000000002... done
copying OF device tree ...
Building dt strings...
Building dt structure...
Device tree strings 0x00000000024bf000 -> 0x00000000024c01a9
Device tree struct 0x00000000024c1000 -> 0x00000000024c9000
Calling quiesce ...
returning from prom_init
firmware_features = 0x1ffd5f
Partition configured for 4 cpus.
Starting Linux PPC64 2.6.12-rc4-mm2-autokern1
-----------------------------------------------------
ppc64_pft_size = 0x1a
ppc64_debug_switch = 0x0
ppc64_interrupt_controller = 0x2
systemcfg = 0xc000000000520000
systemcfg->platform = 0x101
systemcfg->processorCount = 0x4
systemcfg->physicalMemorySize = 0x100000000
ppc64_caches.dcache_line_size = 0x80
ppc64_caches.icache_line_size = 0x80
htab_address = 0x0000000000000000
htab_hash_mask = 0x7ffff
-----------------------------------------------------
[boot]0100 MM Init
[boot]0100 MM Init Done
Linux version 2.6.12-rc4-mm2-autokern1 (root@gekko-lp1) (gcc version 3.3.3-hammer) #1 SMP Thu May 19 17:39:58 CDT 2005
[boot]0012 Setup Arch
WARNING: memory at 48000000 maps to invalid NUMA node 1
WARNING: memory at 50000000 maps to invalid NUMA node 1
WARNING: memory at 58000000 maps to invalid NUMA node 1
WARNING: memory at 60000000 maps to invalid NUMA node 1
WARNING: memory at 68000000 maps to invalid NUMA node 1
WARNING: memory at 70000000 maps to invalid NUMA node 1
WARNING: memory at 78000000 maps to invalid NUMA node 1
WARNING: memory at 80000000 maps to invalid NUMA node 1
WARNING: memory at 88000000 maps to invalid NUMA node 2
WARNING: memory at 90000000 maps to invalid NUMA node 2
WARNING: memory at 98000000 maps to invalid NUMA node 2
WARNING: memory at a0000000 maps to invalid NUMA node 2
WARNING: memory at a8000000 maps to invalid NUMA node 2
WARNING: memory at b0000000 maps to invalid NUMA node 2
WARNING: memory at b8000000 maps to invalid NUMA node 2
WARNING: memory at c0000000 maps to invalid NUMA node 2
WARNING: memory at c8000000 maps to invalid NUMA node 3
WARNING: memory at d0000000 maps to invalid NUMA node 3
WARNING: memory at d8000000 maps to invalid NUMA node 3
WARNING: memory at e0000000 maps to invalid NUMA node 3
WARNING: memory at e8000000 maps to invalid NUMA node 3
WARNING: memory at f0000000 maps to invalid NUMA node 3
WARNING: memory at f8000000 maps to invalid NUMA node 3
Node 0 Memory: 0x0-0x100000000
Syscall map setup, 234 32 bits and 210 64 bits syscalls
No ramdisk, default root is /dev/sda2
EEH: PCI Enhanced I/O Error Handling Enabled
PPC64 nvram contains 7168 bytes
Using dedicated idle loop
[boot]0015 Setup Done
Built 1 zonelists
Kernel command line: root=/dev/sda3 selinux=0 elevator=cfq autobench_args:
[boot]0020 XICS Init
xics: no ISA interrupt controller
[boot]0021 XICS Done
PID hash table entries: 4096 (order: 12, 131072 bytes)
time_init: decrementer frequency = 238.058000 MHz
time_init: processor frequency = 1904.464000 MHz
firmware_features = 0x1ffd5f
Partition configured for 4 cpus.
Starting Linux PPC64 2.6.12-rc4-mm2-autokern1
-----------------------------------------------------
ppc64_pft_size = 0x1a
ppc64_debug_switch = 0x0
ppc64_interrupt_controller = 0x2
systemcfg = 0xc000000000520000
systemcfg->platform = 0x101
systemcfg->processorCount = 0x4
systemcfg->physicalMemorySize = 0x100000000
ppc64_caches.dcache_line_size = 0x80
ppc64_caches.icache_line_size = 0x80
htab_address = 0x0000000000000000
htab_hash_mask = 0x7ffff
-----------------------------------------------------
[boot]0100 MM Init
[boot]0100 MM Init Done
Linux version 2.6.12-rc4-mm2-autokern1 (root@gekko-lp1) (gcc version 3.3.3-hammer) #1 SMP Thu May 19 17:39:58 CDT 2005
[boot]0012 Setup Arch
WARNING: memory at 48000000 maps to invalid NUMA node 1
WARNING: memory at 50000000 maps to invalid NUMA node 1
WARNING: memory at 58000000 maps to invalid NUMA node 1
WARNING: memory at 60000000 maps to invalid NUMA node 1
WARNING: memory at 68000000 maps to invalid NUMA node 1
WARNING: memory at 70000000 maps to invalid NUMA node 1
WARNING: memory at 78000000 maps to invalid NUMA node 1
WARNING: memory at 80000000 maps to invalid NUMA node 1
WARNING: memory at 88000000 maps to invalid NUMA node 2
WARNING: memory at 90000000 maps to invalid NUMA node 2
WARNING: memory at 98000000 maps to invalid NUMA node 2
WARNING: memory at a0000000 maps to invalid NUMA node 2
WARNING: memory at a8000000 maps to invalid NUMA node 2
WARNING: memory at b0000000 maps to invalid NUMA node 2
WARNING: memory at b8000000 maps to invalid NUMA node 2
WARNING: memory at c0000000 maps to invalid NUMA node 2
WARNING: memory at c8000000 maps to invalid NUMA node 3
WARNING: memory at d0000000 maps to invalid NUMA node 3
WARNING: memory at d8000000 maps to invalid NUMA node 3
WARNING: memory at e0000000 maps to invalid NUMA node 3
WARNING: memory at e8000000 maps to invalid NUMA node 3
WARNING: memory at f0000000 maps to invalid NUMA node 3
WARNING: memory at f8000000 maps to invalid NUMA node 3
Node 0 Memory: 0x0-0x100000000
Syscall map setup, 234 32 bits and 210 64 bits syscalls
No ramdisk, default root is /dev/sda2
EEH: PCI Enhanced I/O Error Handling Enabled
PPC64 nvram contains 7168 bytes
Using dedicated idle loop
[boot]0015 Setup Done
Built 1 zonelists
Kernel command line: root=/dev/sda3 selinux=0 elevator=cfq autobench_args:
[boot]0020 XICS Init
xics: no ISA interrupt controller
[boot]0021 XICS Done
PID hash table entries: 4096 (order: 12, 131072 bytes)
time_init: decrementer frequency = 238.058000 MHz
time_init: processor frequency = 1904.464000 MHz
Console: colour dummy device 80x25
Dentry cache hash table entries: 1048576 (order: 11, 8388608 bytes)
Inode-cache hash table entries: 524288 (order: 10, 4194304 bytes)
freeing bootmem node 0
Memory: 1088636k/4194304k available (4352k kernel code, 3104944k reserved, 1764k data, 849k bss, 280k init)
kernel BUG in interleave_nodes at mm/mempolicy.c:701!
Oops: Exception in kernel mode, sig: 5 [#1]
SMP NR_CPUS=128 NUMA PSERIES LPAR
Modules linked in:
NIP: C0000000000AC6DC XER: 0000000D LR: C0000000000AD0CC CTR: C000000000267760
REGS: c00000000051fa10 TRAP: 0700 Not tainted (2.6.12-rc4-mm2-autokern1)
MSR: 8000000000029032 EE: 1 PR: 0 FP: 0 ME: 1 IR/DR: 11 CR: 24004022
DAR: ffffffffffffffff DSISR: c0000000006e38e7
TASK: c0000000005bedb0[0] 'swapper' THREAD: c00000000051c000 CPU: 0
GPR00: 0000000000000001 C00000000051FC90 C0000000006D7380 C00000000FFDF590
GPR04: 0000000000000000 FFFFFFFFFFFFFFFF C0000000006E39F8 C0000000005D8B24
GPR08: C0000000005D8B18 0000000000000000 C0000000006E39F0 C0000000006E3910
GPR12: 000000000000000A C000000000579C00 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000001C10000
GPR20: 0000000002111830 0000000002111830 BFFFFFFFFE3F0000 000000000199F9C0
GPR24: C000000000579C00 C0000000006D5208 C000000000518E78 0000000000008000
GPR28: 0000000000000000 00000000000080D0 C0000000005BEDB0 0000000000000001
NIP [c0000000000ac6dc] .interleave_nodes+0x38/0xd0
LR [c0000000000ad0cc] .alloc_pages_current+0x100/0x134
Call Trace:
[c00000000051fc90] [000000000000001d] 0x1d (unreliable)
[c00000000051fd20] [c0000000000ad0cc] .alloc_pages_current+0x100/0x134
[c00000000051fdc0] [c00000000008c6c8] .get_zeroed_page+0x28/0x90
[c00000000051fe40] [c0000000004ec2e8] .pidmap_init+0x24/0xa0
[c00000000051fed0] [c0000000004d6734] .start_kernel+0x21c/0x314
[c00000000051ff90] [c00000000000bfb4] .__setup_cpu_power3+0x0/0x4
Instruction dump:
fba1ffe8 fbc1fff0 f8010010 f821ff71 60000000 ebcd0170 a93e07a0 793f0020
7fe9fe70 7d20fa78 7c004850 54000ffe <0b000000> 3ba30010 38bf0001 38800001
<0>Kernel panic - not syncing: Attempted to kill the idle task!

[The console then shows the IBM SMS firmware splash screen again and a second
boot attempt (conmux time-stamp May/19/05 13:43:05), which produced an
essentially identical boot log, the same "invalid NUMA node" warnings, and
the same kernel BUG in interleave_nodes panic.]
-- 0:conmux-control -- time-stamp -- May/19/05 13:45:33 --


Attachments:
console.log.new (25.87 kB)

2005-05-20 19:04:03

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph, I'm getting the following errors building rc4-mm2 w/ GCC 2.95.4:

mm/slab.c:281: field `entry' has incomplete type
mm/slab.c: In function `cache_alloc_refill':
mm/slab.c:2497: warning: control reaches end of non-void function
mm/slab.c: In function `kmem_cache_alloc':
mm/slab.c:2567: warning: `objp' might be used uninitialized in this function
mm/slab.c: In function `kmem_cache_alloc_node':
mm/slab.c:2567: warning: `objp' might be used uninitialized in this function
mm/slab.c: In function `__kmalloc':
mm/slab.c:2567: warning: `objp' might be used uninitialized in this function
make[1]: *** [mm/slab.o] Error 1
make[1]: *** Waiting for unfinished jobs....

-Matt

2005-05-20 19:24:14

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Fri, 20 May 2005, Matthew Dobson wrote:

> Christoph, I'm getting the following errors building rc4-mm2 w/ GCC 2.95.4:

Works fine here with gcc 2.95.4.ds15-22 but that is a debian gcc
2.95.4 patched up to work correctly. If you need to address the pathology in pristine
gcc 2.95.4 by changing the source then declare the entry field with 0
members.

Index: linux-2.6.12-rc4/mm/slab.c
===================================================================
--- linux-2.6.12-rc4.orig/mm/slab.c 2005-05-19 21:29:45.000000000 +0000
+++ linux-2.6.12-rc4/mm/slab.c 2005-05-20 19:18:22.000000000 +0000
@@ -267,7 +267,7 @@
#ifdef CONFIG_NUMA
spinlock_t lock;
#endif
- void *entry[];
+ void *entry[0];
};

/* bootstrap: The caches do not work without cpuarrays anymore,
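
For what it's worth, here is a stand-alone sketch of why the change above
helps (ordinary userspace C with made-up names, not kernel code): gcc 2.95
predates C99 flexible array members and rejects "void *entry[];" as an
incomplete type, while the GNU zero-length array "void *entry[0];" is
accepted and gives the same trailing-storage layout.

#include <stdlib.h>

/* Hypothetical illustration only -- not the kernel's array_cache. */
struct percpu_cache {
	unsigned int avail;
	unsigned int limit;
	void *entry[0];	/* [0] instead of [] so gcc 2.95 accepts it */
};

static struct percpu_cache *percpu_cache_alloc(unsigned int limit)
{
	/* The trailing array is allocated together with the header,
	 * the same way alloc_arraycache() sizes its allocation. */
	struct percpu_cache *c = malloc(sizeof(*c) + limit * sizeof(void *));

	if (c) {
		c->avail = 0;
		c->limit = limit;
	}
	return c;
}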



gcc 2.95 can produce proper code for ppc64?



> mm/slab.c:281: field `entry' has incomplete type
> mm/slab.c: In function `cache_alloc_refill':

See patch above?

> mm/slab.c:2497: warning: control reaches end of non-void function

That is the end of cache_alloc_debug_check, right? This is a void
function in my source.

> mm/slab.c: In function `kmem_cache_alloc':
> mm/slab.c:2567: warning: `objp' might be used uninitialized in this function
> mm/slab.c: In function `kmem_cache_alloc_node':
> mm/slab.c:2567: warning: `objp' might be used uninitialized in this function
> mm/slab.c: In function `__kmalloc':
> mm/slab.c:2567: warning: `objp' might be used uninitialized in this function

There is a branch there and the object is initialized in either branch.

2005-05-20 20:20:41

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph Lameter wrote:
> On Fri, 20 May 2005, Matthew Dobson wrote:
>
>
>>Christoph, I'm getting the following errors building rc4-mm2 w/ GCC 2.95.4:
>
>
> Works fine here with gcc 2.95.4.ds15-22 but that is a debian gcc
> 2.95.4 patched up to work correctly. If you need to address the pathology in pristine
> gcc 2.95.4 by changing the source then declare the entry field with 0
> members.
>
> Index: linux-2.6.12-rc4/mm/slab.c
> ===================================================================
> --- linux-2.6.12-rc4.orig/mm/slab.c 2005-05-19 21:29:45.000000000 +0000
> +++ linux-2.6.12-rc4/mm/slab.c 2005-05-20 19:18:22.000000000 +0000
> @@ -267,7 +267,7 @@
> #ifdef CONFIG_NUMA
> spinlock_t lock;
> #endif
> - void *entry[];
> + void *entry[0];
> };
>
> /* bootstrap: The caches do not work without cpuarrays anymore,
>
>
>
> gcc 2.95 can produce proper code for ppc64?

Apparently...?


>>mm/slab.c:281: field `entry' has incomplete type
>>mm/slab.c: In function `cache_alloc_refill':
>
>
> See patch above?

Will do.


>>mm/slab.c:2497: warning: control reaches end of non-void function
>
>
> That is the end of cache_alloc_debug_check, right? This is a void
> function in my source.

Nope. It's the end of this function:
static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags)

Though I'm not sure why I'm getting this warning, since the function ends
like this:
ac->touched = 1;
return ac->entry[--ac->avail];
} <<-- Line 2497


>>mm/slab.c: In function `kmem_cache_alloc':
>>mm/slab.c:2567: warning: `objp' might be used uninitialized in this function
>>mm/slab.c: In function `kmem_cache_alloc_node':
>>mm/slab.c:2567: warning: `objp' might be used uninitialized in this function
>>mm/slab.c: In function `__kmalloc':
>>mm/slab.c:2567: warning: `objp' might be used uninitialized in this function
>
>
> There is a branch there and the object is initialized in either branch.

I agree. Not sure why this warning is occurring, either.

I tried to build this twice on this particular box, to no avail. 3x == charm?

-Matt

2005-05-20 21:31:42

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph Lameter wrote:
> On Fri, 20 May 2005, Matthew Dobson wrote:
>
>
>>Christoph, I'm getting the following errors building rc4-mm2 w/ GCC 2.95.4:
>
>
> Works fine here with gcc 2.95.4.ds15-22 but that is a debian gcc
> 2.95.4 patched up to work correctly. If you need to address the pathology in pristine
> gcc 2.95.4 by changing the source then declare the entry field with 0
> members.
>
> Index: linux-2.6.12-rc4/mm/slab.c
> ===================================================================
> --- linux-2.6.12-rc4.orig/mm/slab.c 2005-05-19 21:29:45.000000000 +0000
> +++ linux-2.6.12-rc4/mm/slab.c 2005-05-20 19:18:22.000000000 +0000
> @@ -267,7 +267,7 @@
> #ifdef CONFIG_NUMA
> spinlock_t lock;
> #endif
> - void *entry[];
> + void *entry[0];
> };
>
> /* bootstrap: The caches do not work without cpuarrays anymore,
>
>
>
> gcc 2.95 can produce proper code for ppc64?
>
>
>
>
>>mm/slab.c:281: field `entry' has incomplete type
>>mm/slab.c: In function `cache_alloc_refill':
>
>
> See patch above?

I can't for the life of me explain why, but the above patch makes ALL the
warnings go away, despite the fact that they seem unrelated. I dunno...
Maybe we should upgrade the compiler on that box?

-Matt

2005-05-20 23:42:55

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Fri, 20 May 2005, Matthew Dobson wrote:

> > See patch above?
>
> I can't for the life of me explain why, but the above patch makes ALL the
> warnings go away, despite the fact that they seem unrelated. I dunno...
> Maybe we should upgrade the compiler on that box?

I better not comment on gcc 2.95 since I may say something that would
not be so helpful...

2005-05-24 21:38:06

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Fri, 20 May 2005, Matthew Dobson wrote:

> I can't for the life of me explain why, but the above patch makes ALL the
> warnings go away, despite the fact that they seem unrelated. I dunno...
> Maybe we should upgrade the compiler on that box?

Is the NUMA slab patch now working on ppc64?

2005-05-24 23:03:17

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph Lameter wrote:
> On Fri, 20 May 2005, Matthew Dobson wrote:
>
>
>>I can't for the life of me explain why, but the above patch makes ALL the
>>warnings go away, despite the fact that they seem unrelated. I dunno...
>>Maybe we should upgrade the compiler on that box?
>
>
> Is the NUMA slab patch now working on ppc64?

No... It does compile with that trivial patch, though! :)

-mm2 isn't booting on my 32-way x86 box, nor does it boot on my PPC64 box.
I figured -mm3 would be out shortly and I'd give the boxes another kick in
the pants then...

-Matt

2005-05-25 05:22:12

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Tue, 24 May 2005, Matthew Dobson wrote:

> No... It does compile with that trivial patch, though! :)
>
> -mm2 isn't booting on my 32-way x86 box, nor does it boot on my PPC64 box.
> I figured -mm3 would be out shortly and I'd give the boxes another kick in
> the pants then...

Umm.. How does it fail? Any relationship to the slab allocator?

2005-05-25 18:38:47

by Matthew Dobson

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

Christoph Lameter wrote:
> On Tue, 24 May 2005, Matthew Dobson wrote:
>
>
>>No... It does compile with that trivial patch, though! :)
>>
>>-mm2 isn't booting on my 32-way x86 box, nor does it boot on my PPC64 box.
>> I figured -mm3 would be out shortly and I'd give the boxes another kick in
>>the pants then...
>
>
> Umm.. How does it fail? Any relationship to the slab allocator?

It dies really early on my x86 box. I'm not 100% sure that it is b/c of
your patches, since it dies so early I get nothing on the console. Grub
tells me it's loading the kernel image then.... nothing.

-Matt

2005-05-25 21:03:19

by Christoph Lameter

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3

On Wed, 25 May 2005, Matthew Dobson wrote:

> > Umm.. How does it fail? Any relationship to the slab allocator?
>
> It dies really early on my x86 box. I'm not 100% sure that it is b/c of
> your patches, since it dies so early I get nothing on the console. Grub
> tells me it's loading the kernel image then.... nothing.

Hmmm. Do you have an emulator? For IA32 and IA64 we have something that
simulates a boot up sequence and can tell us what is going on.

2005-05-26 06:48:49

by Martin Bligh

[permalink] [raw]
Subject: Re: NUMA aware slab allocator V3



--Christoph Lameter <[email protected]> wrote (on Wednesday, May 25, 2005 14:03:06 -0700):

> On Wed, 25 May 2005, Matthew Dobson wrote:
>
>> > Umm.. How does it fail? Any relationship to the slab allocator?
>>
>> It dies really early on my x86 box. I'm not 100% sure that it is b/c of
>> your patches, since it dies so early I get nothing on the console. Grub
>> tells me it's loading the kernel image then.... nothing.
>
> Hmmm. Do you have an emulator? For IA32 and IA64 we have something that
> simulates a boot up sequence and can tell us what is going on.

Turning on early printk is probably easier. Not that it seems to work nearly
as early as some of the other implementations we had, but still.

M.

2005-05-28 02:03:38

by Christoph Lameter

[permalink] [raw]
Subject: NUMA aware slab allocator V4

The NUMA API change that introduced kmalloc_node was accepted for 2.6.12-rc3.
Now it is possible to do slab allocations on a node to localize
memory structures. This API was used by the pageset localization patch and
the block layer localization patch now in mm. The existing kmalloc_node is
slow since it simply searches through all pages of the slab to find a page
that is on the node requested. The two patches do a one-time allocation of
slab structures at initialization and therefore the speed of kmalloc_node
does not matter.

This patch allows kmalloc_node to be as fast as kmalloc by introducing node
specific page lists for partial, free and full slabs. Slab allocation
improves in a NUMA system so that we are seeing a performance gain in AIM7
of about 5% with this patch alone.

More NUMA localizations are possible if kmalloc_node operates as fast as
kmalloc.
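
As a rough illustration (hypothetical structure and function names; only
kmalloc_node, cpu_to_node and spin_lock_init are the real interfaces), this
is the kind of node-local allocation that a fast kmalloc_node makes
attractive:

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/topology.h>	/* cpu_to_node() */

/* Hypothetical per-node bookkeeping structure, for illustration only. */
struct my_node_stats {
	spinlock_t lock;
	unsigned long events;
};

static struct my_node_stats *my_node_stats_alloc(int cpu)
{
	/* Place the structure in memory local to the cpu's node; with the
	 * node-aware slab this is about as cheap as a plain kmalloc(). */
	struct my_node_stats *s =
		kmalloc_node(sizeof(*s), GFP_KERNEL, cpu_to_node(cpu));

	if (s) {
		spin_lock_init(&s->lock);
		s->events = 0;
	}
	return s;
}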

Test run on a 32p system with 32G RAM.

w/o patch
Tasks jobs/min jti jobs/min/task real cpu
1 485.36 100 485.3640 11.99 1.91 Sat Apr 30 14:01:51 2005
100 26582.63 88 265.8263 21.89 144.96 Sat Apr 30 14:02:14 2005
200 29866.83 81 149.3342 38.97 286.08 Sat Apr 30 14:02:53 2005
300 33127.16 78 110.4239 52.71 426.54 Sat Apr 30 14:03:46 2005
400 34889.47 80 87.2237 66.72 568.90 Sat Apr 30 14:04:53 2005
500 35654.34 76 71.3087 81.62 714.55 Sat Apr 30 14:06:15 2005
600 36460.83 75 60.7681 95.77 853.42 Sat Apr 30 14:07:51 2005
700 35957.00 75 51.3671 113.30 990.67 Sat Apr 30 14:09:45 2005
800 33380.65 73 41.7258 139.48 1140.86 Sat Apr 30 14:12:05 2005
900 35095.01 76 38.9945 149.25 1281.30 Sat Apr 30 14:14:35 2005
1000 36094.37 74 36.0944 161.24 1419.66 Sat Apr 30 14:17:17 2005

w/patch
Tasks jobs/min jti jobs/min/task real cpu
1 484.27 100 484.2736 12.02 1.93 Sat Apr 30 15:59:45 2005
100 28262.03 90 282.6203 20.59 143.57 Sat Apr 30 16:00:06 2005
200 32246.45 82 161.2322 36.10 282.89 Sat Apr 30 16:00:42 2005
300 37945.80 83 126.4860 46.01 418.75 Sat Apr 30 16:01:28 2005
400 40000.69 81 100.0017 58.20 561.48 Sat Apr 30 16:02:27 2005
500 40976.10 78 81.9522 71.02 696.95 Sat Apr 30 16:03:38 2005
600 41121.54 78 68.5359 84.92 834.86 Sat Apr 30 16:05:04 2005
700 44052.77 78 62.9325 92.48 971.53 Sat Apr 30 16:06:37 2005
800 41066.89 79 51.3336 113.38 1111.15 Sat Apr 30 16:08:31 2005
900 38918.77 79 43.2431 134.59 1252.57 Sat Apr 30 16:10:46 2005
1000 41842.21 76 41.8422 139.09 1392.33 Sat Apr 30 16:13:05 2005

These are measurements taken directly after boot and show an improvement greater than 5%.
However, the performance improvement becomes smaller as the AIM7 runs are repeated
and settles down at around 5%.

Links to earlier discussions:
http://marc.theaimsgroup.com/?t=111094594500003&r=1&w=2
http://marc.theaimsgroup.com/?t=111603406600002&r=1&w=2

Changelog V3-V4:
- Patch against 2.6.12-rc5-mm1
- Cleanup patch integrated
- More and better use of for_each_node and for_each_cpu (see the sketch after the changelogs)
- GCC 2.95 fix (do not use [] use [0])
- Correct determination of INDEX_AC
- Remove hack to cause an error on platforms that have nodes but no CONFIG_NUMA.
- Remove list3_data and list3_data_ptr macros for better readability

Changelog V2-V3:
- Made to patch against 2.6.12-rc4-mm1
- Revised bootstrap mechanism so that larger size kmem_list3 structs can be
supported. Use a generic solution so that the right slab can be found
for the internal structs.
- use for_each_online_node

Changelog V1-V2:
- Batching for freeing of wrong-node objects (alien caches)
- Locking changes and NUMA #ifdefs as requested by Manfred
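
To illustrate the for_each_node/for_each_cpu item in the V3-V4 changelog,
here is a sketch of the iterator style (hypothetical walker, not code from
the patch):

#include <linux/cpumask.h>
#include <linux/nodemask.h>

/* Hypothetical walker: visit only online nodes and cpus instead of
 * looping over 0..MAX_NUMNODES / 0..NR_CPUS. */
static void walk_online(void)
{
	int node, cpu;

	for_each_online_node(node) {
		/* per-node work, e.g. touching cachep->nodelists[node] */
	}

	for_each_online_cpu(cpu) {
		/* per-cpu work, e.g. touching cachep->array[cpu] */
	}
}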

Signed-off-by: Alok N Kataria <[email protected]>
Signed-off-by: Shobhit Dayal <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>

Index: linux-2.6.12-rc5/mm/slab.c
===================================================================
--- linux-2.6.12-rc5.orig/mm/slab.c 2005-05-27 19:51:43.000000000 +0000
+++ linux-2.6.12-rc5/mm/slab.c 2005-05-27 20:02:09.000000000 +0000
@@ -75,6 +75,14 @@
*
* At present, each engine can be growing a cache. This should be blocked.
*
+ * 15 March 2005. NUMA slab allocator.
+ * Shobhit Dayal <[email protected]>
+ * Alok N Kataria <[email protected]>
+ * Christoph Lameter <[email protected]>
+ *
+ * Modified the slab allocator to be node aware on NUMA systems.
+ * Each node has its own list of partial, free and full slabs.
+ * All object allocations for a node occur from node specific slab lists.
*/

#include <linux/config.h>
@@ -93,7 +101,7 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
-
+#include <linux/nodemask.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -211,6 +219,9 @@
void *s_mem; /* including colour offset */
unsigned int inuse; /* num of objs active in slab */
kmem_bufctl_t free;
+#ifdef CONFIG_NUMA
+ unsigned short nodeid;
+#endif
};

/*
@@ -253,6 +264,15 @@
unsigned int limit;
unsigned int batchcount;
unsigned int touched;
+#ifdef CONFIG_NUMA
+ spinlock_t lock;
+#endif
+ void *entry[0]; /*
+ * Must have this definition in here for the proper
+ * alignment of array_cache. Also simplifies accessing
+ * the entries.
+ * [0] is for gcc 2.95. It should really be [].
+ */
};

/* bootstrap: The caches do not work without cpuarrays anymore,
@@ -265,34 +285,98 @@
};

/*
- * The slab lists of all objects.
- * Hopefully reduce the internal fragmentation
- * NUMA: The spinlock could be moved from the kmem_cache_t
- * into this structure, too. Figure out what causes
- * fewer cross-node spinlock operations.
+ * The slab lists for all objects.
*/
struct kmem_list3 {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long free_objects;
- int free_touched;
unsigned long next_reap;
+ int free_touched;
+ unsigned int free_limit;
+ spinlock_t list_lock;
struct array_cache *shared;
+#ifdef CONFIG_NUMA
+ struct array_cache **alien;
+#endif
};

+/*
+ * Need this for bootstrapping a per node allocator.
+ */
+#define NUM_INIT_LISTS (2 + MAX_NUMNODES)
+struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+#define CACHE_CACHE 0
+#define SIZE_AC 1
+#define SIZE_L3 2
+
+/*
+ * This function may be completely optimized away if
+ * a constant is passed to it. Mostly the same as
+ * what is in linux/slab.h except it returns an
+ * index.
+ */
+static inline int index_of(const size_t size)
+{
+ int i = 0;
+
+#define CACHE(x) \
+ if (size <=x) \
+ return i; \
+ else \
+ i++;
+#include "linux/kmalloc_sizes.h"
+#undef CACHE
+ {
+ extern void __bad_size(void);
+ __bad_size();
+ return 0;
+ }
+}
+
+#define INDEX_AC index_of(sizeof(struct arraycache_init))
+#define INDEX_L3 index_of(sizeof(struct kmem_list3))
+
+#ifdef CONFIG_NUMA
+
#define LIST3_INIT(parent) \
- { \
- .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \
- .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \
- .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \
- }
-#define list3_data(cachep) \
- (&(cachep)->lists)
-
-/* NUMA: per-node */
-#define list3_data_ptr(cachep, ptr) \
- list3_data(cachep)
+ do { \
+ INIT_LIST_HEAD(&(parent)->slabs_full); \
+ INIT_LIST_HEAD(&(parent)->slabs_partial); \
+ INIT_LIST_HEAD(&(parent)->slabs_free); \
+ (parent)->shared = NULL; \
+ (parent)->alien = NULL; \
+ (parent)->list_lock = SPIN_LOCK_UNLOCKED; \
+ (parent)->free_objects = 0; \
+ (parent)->free_touched = 0; \
+ } while (0)
+#else
+
+#define LIST3_INIT(parent) \
+ do { \
+ INIT_LIST_HEAD(&(parent)->slabs_full); \
+ INIT_LIST_HEAD(&(parent)->slabs_partial); \
+ INIT_LIST_HEAD(&(parent)->slabs_free); \
+ (parent)->shared = NULL; \
+ (parent)->list_lock = SPIN_LOCK_UNLOCKED; \
+ (parent)->free_objects = 0; \
+ (parent)->free_touched = 0; \
+ } while (0)
+#endif
+
+#define MAKE_LIST(cachep, listp, slab, nodeid) \
+ do { \
+ INIT_LIST_HEAD(listp); \
+ list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
+ } while (0)
+
+#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
+ do { \
+ MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
+ MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
+ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
+ } while (0)

/*
* kmem_cache_t
@@ -305,13 +389,12 @@
struct array_cache *array[NR_CPUS];
unsigned int batchcount;
unsigned int limit;
-/* 2) touched by every alloc & free from the backend */
- struct kmem_list3 lists;
- /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */
+ unsigned int shared;
unsigned int objsize;
+/* 2) touched by every alloc & free from the backend */
+ struct kmem_list3 *nodelists[MAX_NUMNODES];
unsigned int flags; /* constant flags */
unsigned int num; /* # of objs per slab */
- unsigned int free_limit; /* upper limit of objects in the lists */
spinlock_t spinlock;

/* 3) cache_grow/shrink */
@@ -348,6 +431,7 @@
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
+ unsigned long node_frees;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
@@ -385,6 +469,7 @@
} while (0)
#define STATS_INC_ERR(x) ((x)->errors++)
#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
+#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
#define STATS_SET_FREEABLE(x, i) \
do { if ((x)->max_freeable < i) \
(x)->max_freeable = i; \
@@ -403,6 +488,7 @@
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
+#define STATS_INC_NODEFREES(x) do { } while (0)
#define STATS_SET_FREEABLE(x, i) \
do { } while (0)

@@ -535,9 +621,9 @@

/* internal cache of cache description objs */
static kmem_cache_t cache_cache = {
- .lists = LIST3_INIT(cache_cache.lists),
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
+ .shared = 1,
.objsize = sizeof(kmem_cache_t),
.flags = SLAB_NO_REAP,
.spinlock = SPIN_LOCK_UNLOCKED,
@@ -565,7 +651,8 @@
*/
static enum {
NONE,
- PARTIAL,
+ PARTIAL_AC,
+ PARTIAL_L3,
FULL
} g_cpucache_up;

@@ -574,11 +661,7 @@
static void free_block(kmem_cache_t* cachep, void** objpp, int len);
static void enable_cpucache (kmem_cache_t *cachep);
static void cache_reap (void *unused);
-
-static inline void **ac_entry(struct array_cache *ac)
-{
- return (void**)(ac+1);
-}
+static int __node_shrink(kmem_cache_t *cachep, int node);

static inline struct array_cache *ac_data(kmem_cache_t *cachep)
{
@@ -680,42 +763,152 @@
int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
struct array_cache *nc = NULL;

- if (cpu == -1)
- nc = kmalloc(memsize, GFP_KERNEL);
- else
- nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
-
+ nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
if (nc) {
nc->avail = 0;
nc->limit = entries;
nc->batchcount = batchcount;
nc->touched = 0;
+#ifdef CONFIG_NUMA
+ spin_lock_init(&nc->lock);
+#endif
}
return nc;
}
+#ifdef CONFIG_NUMA
+static inline struct array_cache **alloc_alien_cache(int cpu, int limit)
+{
+ struct array_cache **ac_ptr;
+ int memsize = sizeof(void*)*MAX_NUMNODES;
+ int node = cpu_to_node(cpu);
+ int i;
+
+ if (limit > 1)
+ limit = 12;
+ ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+ if (ac_ptr) {
+ for_each_node(i) {
+ if (i == node || !node_online(i)) {
+ ac_ptr[i] = NULL;
+ continue;
+ }
+ ac_ptr[i] = alloc_arraycache(cpu, limit, 0xbaadf00d);
+ if (!ac_ptr[i]) {
+ for (i--; i <=0; i--)
+ kfree(ac_ptr[i]);
+ kfree(ac_ptr);
+ return NULL;
+ }
+ }
+ }
+ return ac_ptr;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+ int i;
+
+ if (!ac_ptr)
+ return;
+
+ for_each_node(i)
+ kfree(ac_ptr[i]);
+
+ kfree(ac_ptr);
+}
+
+static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
+{
+ struct kmem_list3 *rl3 = cachep->nodelists[node];
+
+ if (ac->avail) {
+ spin_lock(&rl3->list_lock);
+ free_block(cachep, ac->entry, ac->avail);
+ ac->avail = 0;
+ spin_unlock(&rl3->list_lock);
+ }
+}
+
+static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
+{
+ int i=0;
+ struct array_cache *ac;
+ unsigned long flags;
+
+ for_each_online_node(i) {
+ ac = l3->alien[i];
+ if (ac) {
+ spin_lock_irqsave(&ac->lock, flags);
+ __drain_alien_cache(cachep, ac, i);
+ spin_unlock_irqrestore(&ac->lock, flags);
+ }
+ }
+}
+#endif

static int __devinit cpuup_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
kmem_cache_t* cachep;
+ struct kmem_list3 *l3 = NULL;
+ int node = cpu_to_node(cpu);
+ int memsize = sizeof(struct kmem_list3);
+ struct array_cache *nc = NULL;

switch (action) {
case CPU_UP_PREPARE:
down(&cache_chain_sem);
+ /* we need to do this right in the begining since
+ * alloc_arraycache's are going to use this list.
+ * kmalloc_node allows us to add the slab to the right
+ * kmem_list3 and not this cpu's kmem_list3
+ */
+
list_for_each_entry(cachep, &cache_chain, next) {
- struct array_cache *nc;
+ /* setup the size64 kmemlist for hcpu before we can
+ * begin anything. Make sure some other cpu on this
+ * node has not already allocated this
+ */
+ if (!cachep->nodelists[node]) {
+ if (!(l3 = kmalloc_node(memsize,
+ GFP_KERNEL, node)))
+ goto bad;
+ LIST3_INIT(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+
+ cachep->nodelists[node] = l3;
+ }

- nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
+ spin_lock_irq(&cachep->nodelists[node]->list_lock);
+ cachep->nodelists[node]->free_limit =
+ (1 + nr_cpus_node(node)) *
+ cachep->batchcount + cachep->num;
+ spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+ }
+
+ /* Now we can go ahead with allocating the shared array's
+ & array cache's */
+ list_for_each_entry(cachep, &cache_chain, next) {
+ nc = alloc_arraycache(cpu, cachep->limit,
+ cachep->batchcount);
if (!nc)
goto bad;
-
- spin_lock_irq(&cachep->spinlock);
cachep->array[cpu] = nc;
- cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
- + cachep->num;
- spin_unlock_irq(&cachep->spinlock);

+ l3 = cachep->nodelists[node];
+ BUG_ON(!l3);
+ if (!l3->shared) {
+ if (!(nc = alloc_arraycache(cpu,
+ cachep->shared*cachep->batchcount,
+ 0xbaadf00d)))
+ goto bad;
+
+ /* we are serialised from CPU_DEAD or
+ CPU_UP_CANCELLED by the cpucontrol lock */
+ l3->shared = nc;
+ }
}
up(&cache_chain_sem);
break;
@@ -730,13 +923,53 @@

list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
+ cpumask_t mask;

+ mask = node_to_cpumask(node);
spin_lock_irq(&cachep->spinlock);
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
- cachep->free_limit -= cachep->batchcount;
- free_block(cachep, ac_entry(nc), nc->avail);
+ l3 = cachep->nodelists[node];
+
+ if (!l3)
+ goto unlock_cache;
+
+ spin_lock(&l3->list_lock);
+
+ /* Free limit for this kmem_list3 */
+ l3->free_limit -= cachep->batchcount;
+ if (nc)
+ free_block(cachep, nc->entry, nc->avail);
+
+ if (!cpus_empty(mask)) {
+ spin_unlock(&l3->list_lock);
+ goto unlock_cache;
+ }
+
+ if (l3->shared) {
+ free_block(cachep, l3->shared->entry,
+ l3->shared->avail);
+ kfree(l3->shared);
+ l3->shared = NULL;
+ }
+#ifdef CONFIG_NUMA
+ if (l3->alien) {
+ drain_alien_cache(cachep, l3);
+ free_alien_cache(l3->alien);
+ l3->alien = NULL;
+ }
+#endif
+
+ /* free slabs belonging to this node */
+ if (__node_shrink(cachep, node)) {
+ cachep->nodelists[node] = NULL;
+ spin_unlock(&l3->list_lock);
+ kfree(l3);
+ } else {
+ spin_unlock(&l3->list_lock);
+ }
+unlock_cache:
spin_unlock_irq(&cachep->spinlock);
kfree(nc);
}
@@ -752,6 +985,25 @@

static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };

+/*
+ * swap the static kmem_list3 with kmalloced memory
+ */
+static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
+ int nodeid)
+{
+ struct kmem_list3 *ptr;
+
+ BUG_ON(cachep->nodelists[nodeid] != list);
+ ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+ BUG_ON(!ptr);
+
+ local_irq_disable();
+ memcpy(ptr, list, sizeof(struct kmem_list3));
+ MAKE_ALL_LISTS(cachep, ptr, nodeid);
+ cachep->nodelists[nodeid] = ptr;
+ local_irq_enable();
+}
+
/* Initialisation.
* Called after the gfp() functions have been enabled, and before smp_init().
*/
@@ -760,6 +1012,13 @@
size_t left_over;
struct cache_sizes *sizes;
struct cache_names *names;
+ int i;
+
+ for (i = 0; i < NUM_INIT_LISTS; i++) {
+ LIST3_INIT(&initkmem_list3[i]);
+ if (i < MAX_NUMNODES)
+ cache_cache.nodelists[i] = NULL;
+ }

/*
* Fragmentation resistance on low memory - only use bigger
@@ -768,21 +1027,24 @@
if (num_physpages > (32 << 20) >> PAGE_SHIFT)
slab_break_gfp_order = BREAK_GFP_ORDER_HI;

-
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
* 1) initialize the cache_cache cache: it contains the kmem_cache_t
* structures of all caches, except cache_cache itself: cache_cache
* is statically allocated.
- * Initially an __init data area is used for the head array, it's
- * replaced with a kmalloc allocated array at the end of the bootstrap.
+ * Initially an __init data area is used for the head array and the
+ * kmem_list3 structures, it's replaced with a kmalloc allocated
+ * array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
- * The kmem_cache_t for the new cache is allocated normally. An __init
- * data area is used for the head array.
- * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
+ * The kmem_cache_t for the new cache is allocated normally.
+ * An __init data area is used for the head array.
+ * 3) Create the remaining kmalloc caches, with minimally sized
+ * head arrays.
* 4) Replace the __init data head arrays for cache_cache and the first
* kmalloc cache with kmalloc allocated arrays.
- * 5) Resize the head arrays of the kmalloc caches to their final sizes.
+ * 5) Replace the __init data for kmem_list3 for cache_cache and
+ * the other cache's with kmalloc allocated memory.
+ * 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/

/* 1) create the cache_cache */
@@ -791,6 +1053,7 @@
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
+ cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];

cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());

@@ -808,15 +1071,33 @@
sizes = malloc_sizes;
names = cache_names;

+ /* Initialize the caches that provide memory for the array cache
+ * and the kmem_list3 structures first.
+ * Without this, further allocations will bug
+ */
+
+ sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
+ sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+
+ if (INDEX_AC != INDEX_L3)
+ sizes[INDEX_L3].cs_cachep =
+ kmem_cache_create(names[INDEX_L3].name,
+ sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+
while (sizes->cs_size != ULONG_MAX) {
- /* For performance, all the general caches are L1 aligned.
+ /*
+ * For performance, all the general caches are L1 aligned.
* This should be particularly beneficial on SMP boxes, as it
* eliminates "false sharing".
* Note for systems short on memory removing the alignment will
- * allow tighter packing of the smaller caches. */
- sizes->cs_cachep = kmem_cache_create(names->name,
- sizes->cs_size, ARCH_KMALLOC_MINALIGN,
- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+ * allow tighter packing of the smaller caches.
+ */
+ if(!sizes->cs_cachep)
+ sizes->cs_cachep = kmem_cache_create(names->name,
+ sizes->cs_size, ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

/* Inc off-slab bufctl limit until the ceiling is hit. */
if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -835,24 +1116,46 @@
/* 4) Replace the bootstrap head arrays */
{
void * ptr;
-
+
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
local_irq_disable();
BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
- memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
+ memcpy(ptr, ac_data(&cache_cache),
+ sizeof(struct arraycache_init));
cache_cache.array[smp_processor_id()] = ptr;
local_irq_enable();
-
+
ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
local_irq_disable();
- BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
- memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
+ BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
+ != &initarray_generic.cache);
+ memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
sizeof(struct arraycache_init));
- malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
+ malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
+ ptr;
local_irq_enable();
}
+ /* 5) Replace the bootstrap kmem_list3's */
+ {
+ int node;
+ /* Replace the static kmem_list3 structures for the boot cpu */
+ init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
+ numa_node_id());

- /* 5) resize the head arrays to their final sizes */
+ for_each_online_node(node) {
+ init_list(malloc_sizes[INDEX_L3].cs_cachep,
+ &initkmem_list3[SIZE_L3+node], node);
+ }
+ if (INDEX_AC != INDEX_L3) {
+ init_list(malloc_sizes[INDEX_AC].cs_cachep,
+ &initkmem_list3[SIZE_AC],
+ numa_node_id());
+ }
+ }
+
+ /* 6) resize the head arrays to their final sizes */
{
kmem_cache_t *cachep;
down(&cache_chain_sem);
@@ -868,7 +1171,6 @@
* that initializes ac_data for all new cpus
*/
register_cpu_notifier(&cpucache_notifier);
-

/* The reap timers are started later, with a module init call:
* That part of the kernel is not yet operational.
@@ -883,10 +1185,8 @@
* Register the timers that return unneeded
* pages to gfp.
*/
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- if (cpu_online(cpu))
- start_cpu_timer(cpu);
- }
+ for_each_online_cpu(cpu)
+ start_cpu_timer(cpu);

return 0;
}
@@ -1165,6 +1465,20 @@
}
}

+/* For setting up all the kmem_list3s for cache whose objsize is same
+ as size of kmem_list3. */
+static inline void set_up_list3s(kmem_cache_t *cachep)
+{
+ int node;
+
+ for_each_online_node(node) {
+ cachep->nodelists[node] = &initkmem_list3[SIZE_L3+node];
+ cachep->nodelists[node]->next_reap = jiffies +
+ REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ }
+}
+
/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1420,10 +1734,6 @@
cachep->gfpflags |= GFP_DMA;
spin_lock_init(&cachep->spinlock);
cachep->objsize = size;
- /* NUMA */
- INIT_LIST_HEAD(&cachep->lists.slabs_full);
- INIT_LIST_HEAD(&cachep->lists.slabs_partial);
- INIT_LIST_HEAD(&cachep->lists.slabs_free);

if (flags & CFLGS_OFF_SLAB)
cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
@@ -1442,24 +1752,51 @@
* the cache that's used by kmalloc(24), otherwise
* the creation of further caches will BUG().
*/
- cachep->array[smp_processor_id()] = &initarray_generic.cache;
- g_cpucache_up = PARTIAL;
+ cachep->array[smp_processor_id()] =
+ &initarray_generic.cache;
+
+ /* If the cache that's used by
+ * kmalloc(sizeof(kmem_list3)) is the first cache,
+ * then we need to set up all its list3s, otherwise
+ * the creation of further caches will BUG().
+ */
+ if (INDEX_AC == INDEX_L3) {
+ set_up_list3s(cachep);
+ g_cpucache_up = PARTIAL_L3;
+ } else {
+ cachep->nodelists[numa_node_id()] =
+ &initkmem_list3[SIZE_AC];
+ g_cpucache_up = PARTIAL_AC;
+ }
} else {
- cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
+ cachep->array[smp_processor_id()] =
+ kmalloc(sizeof(struct arraycache_init),
+ GFP_KERNEL);
+
+ if (g_cpucache_up == PARTIAL_AC) {
+ set_up_list3s(cachep);
+ g_cpucache_up = PARTIAL_L3;
+ } else {
+ cachep->nodelists[numa_node_id()] =
+ kmalloc(sizeof(struct kmem_list3),
+ GFP_KERNEL);
+ LIST3_INIT(cachep->nodelists[numa_node_id()]);
+ }
}
+ cachep->nodelists[numa_node_id()]->next_reap =
+ jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+
BUG_ON(!ac_data(cachep));
+ BUG_ON(!cachep->nodelists[numa_node_id()]);
ac_data(cachep)->avail = 0;
ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
ac_data(cachep)->batchcount = 1;
ac_data(cachep)->touched = 0;
cachep->batchcount = 1;
cachep->limit = BOOT_CPUCACHE_ENTRIES;
- cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
- + cachep->num;
}

- cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep/L1_CACHE_BYTES)%REAPTIMEOUT_LIST3;
#if DEBUG
cachep->redzonetest = jiffies + REDZONETIMEOUT +
((unsigned long)cachep/L1_CACHE_BYTES)%REDZONETIMEOUT;
@@ -1521,13 +1858,23 @@
{
#ifdef CONFIG_SMP
check_irq_off();
- BUG_ON(spin_trylock(&cachep->spinlock));
+ assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
+#endif
+}
+
+static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
+{
+#ifdef CONFIG_SMP
+ check_irq_off();
+ assert_spin_locked(&cachep->nodelists[node]->list_lock);
#endif
}
+
#else
#define check_irq_off() do { } while(0)
#define check_irq_on() do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
+#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif

/*
@@ -1549,7 +1896,7 @@
}

static void drain_array_locked(kmem_cache_t* cachep,
- struct array_cache *ac, int force);
+ struct array_cache *ac, int force, int node);

static void do_drain(void *arg)
{
@@ -1558,59 +1905,84 @@

check_irq_off();
ac = ac_data(cachep);
- spin_lock(&cachep->spinlock);
- free_block(cachep, &ac_entry(ac)[0], ac->avail);
- spin_unlock(&cachep->spinlock);
+ spin_lock(&cachep->nodelists[numa_node_id()]->list_lock);
+ free_block(cachep, ac->entry, ac->avail);
+ spin_unlock(&cachep->nodelists[numa_node_id()]->list_lock);
ac->avail = 0;
}

static void drain_cpu_caches(kmem_cache_t *cachep)
{
+ struct kmem_list3 *l3;
+ int node;
+
smp_call_function_all_cpus(do_drain, cachep);
check_irq_on();
spin_lock_irq(&cachep->spinlock);
- if (cachep->lists.shared)
- drain_array_locked(cachep, cachep->lists.shared, 1);
+ for_each_online_node(node) {
+ l3 = cachep->nodelists[node];
+ if (l3) {
+ spin_lock(&l3->list_lock);
+ drain_array_locked(cachep, l3->shared, 1, node);
+ spin_unlock(&l3->list_lock);
+#ifdef CONFIG_NUMA
+ if (l3->alien)
+ drain_alien_cache(cachep, l3);
+#endif
+ }
+ }
spin_unlock_irq(&cachep->spinlock);
}

-
-/* NUMA shrink all list3s */
-static int __cache_shrink(kmem_cache_t *cachep)
+static int __node_shrink(kmem_cache_t *cachep, int node)
{
struct slab *slabp;
+ struct kmem_list3 *l3 = cachep->nodelists[node];
int ret;

- drain_cpu_caches(cachep);
-
- check_irq_on();
- spin_lock_irq(&cachep->spinlock);
-
- for(;;) {
+ for (;;) {
struct list_head *p;

- p = cachep->lists.slabs_free.prev;
- if (p == &cachep->lists.slabs_free)
+ p = l3->slabs_free.prev;
+ if (p == &l3->slabs_free)
break;

- slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
+ slabp = list_entry(l3->slabs_free.prev, struct slab, list);
#if DEBUG
if (slabp->inuse)
BUG();
#endif
list_del(&slabp->list);

- cachep->lists.free_objects -= cachep->num;
- spin_unlock_irq(&cachep->spinlock);
+ l3->free_objects -= cachep->num;
+ spin_unlock_irq(&l3->list_lock);
slab_destroy(cachep, slabp);
- spin_lock_irq(&cachep->spinlock);
+ spin_lock_irq(&l3->list_lock);
}
- ret = !list_empty(&cachep->lists.slabs_full) ||
- !list_empty(&cachep->lists.slabs_partial);
- spin_unlock_irq(&cachep->spinlock);
+ ret = !list_empty(&l3->slabs_full) ||
+ !list_empty(&l3->slabs_partial);
return ret;
}

+static int __cache_shrink(kmem_cache_t *cachep)
+{
+ int ret = 0, i = 0;
+ struct kmem_list3 *l3;
+
+ drain_cpu_caches(cachep);
+
+ check_irq_on();
+ for_each_online_node(i) {
+ l3 = cachep->nodelists[i];
+ if (l3) {
+ spin_lock_irq(&l3->list_lock);
+ ret += __node_shrink(cachep, i);
+ spin_unlock_irq(&l3->list_lock);
+ }
+ }
+ return (ret ? 1 : 0);
+}
+
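Shrinking now happens node by node: __cache_shrink() takes each node's list_lock in turn and lets __node_shrink() destroy that node's completely free slabs. A hedged, module-style usage sketch of the caller-visible interface; the cache name and the my_obj type are invented for the example:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>

struct my_obj {
        int a, b;
};

static kmem_cache_t *my_cache;

static int __init my_init(void)
{
        /* 2.6-era six-argument form: name, size, align, flags, ctor, dtor */
        my_cache = kmem_cache_create("my_obj_cache", sizeof(struct my_obj),
                                     0, 0, NULL, NULL);
        if (!my_cache)
                return -ENOMEM;
        return 0;
}

static void __exit my_exit(void)
{
        /* explicitly return free per-node slabs; destroy would also do this */
        kmem_cache_shrink(my_cache);
        kmem_cache_destroy(my_cache);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");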
/**
* kmem_cache_shrink - Shrink a cache.
* @cachep: The cache to shrink.
@@ -1647,6 +2019,7 @@
int kmem_cache_destroy(kmem_cache_t * cachep)
{
int i;
+ struct kmem_list3 *l3;

if (!cachep || in_interrupt())
BUG();
@@ -1674,15 +2047,19 @@
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
synchronize_rcu();

- /* no cpu_online check required here since we clear the percpu
- * array on cpu offline and set this to NULL.
- */
- for (i = 0; i < NR_CPUS; i++)
+ for_each_online_cpu(i)
kfree(cachep->array[i]);

/* NUMA: free the list3 structures */
- kfree(cachep->lists.shared);
- cachep->lists.shared = NULL;
+ for_each_online_node(i) {
+ if ((l3 = cachep->nodelists[i])) {
+ kfree(l3->shared);
+#ifdef CONFIG_NUMA
+ free_alien_cache(l3->alien);
+#endif
+ kfree(l3);
+ }
+ }
kmem_cache_free(&cache_cache, cachep);

unlock_cpu_hotplug();
@@ -1692,8 +2069,8 @@
EXPORT_SYMBOL(kmem_cache_destroy);

/* Get the memory for a slab management obj. */
-static struct slab* alloc_slabmgmt(kmem_cache_t *cachep,
- void *objp, int colour_off, unsigned int __nocast local_flags)
+static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
+ int colour_off, unsigned int __nocast local_flags)
{
struct slab *slabp;

@@ -1724,7 +2101,7 @@
int i;

for (i = 0; i < cachep->num; i++) {
- void* objp = slabp->s_mem+cachep->objsize*i;
+ void *objp = slabp->s_mem+cachep->objsize*i;
#if DEBUG
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON)
@@ -1801,6 +2178,7 @@
size_t offset;
unsigned int local_flags;
unsigned long ctor_flags;
+ struct kmem_list3 *l3;

/* Be lazy and only check for valid flags here,
* keeping it out of the critical path in kmem_cache_alloc().
@@ -1832,6 +2210,7 @@

spin_unlock(&cachep->spinlock);

+ check_irq_off();
if (local_flags & __GFP_WAIT)
local_irq_enable();

@@ -1843,8 +2222,9 @@
*/
kmem_flagcheck(cachep, flags);

-
- /* Get mem for the objs. */
+ /* Get mem for the objs.
+ * Attempt to allocate a physical page from 'nodeid'.
+ */
if (!(objp = kmem_getpages(cachep, flags, nodeid)))
goto failed;

@@ -1852,6 +2232,9 @@
if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
goto opps1;

+#ifdef CONFIG_NUMA
+ slabp->nodeid = nodeid;
+#endif
set_slab_attr(cachep, slabp, objp);

cache_init_objs(cachep, slabp, ctor_flags);
@@ -1859,13 +2242,14 @@
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
- spin_lock(&cachep->spinlock);
+ l3 = cachep->nodelists[nodeid];
+ spin_lock(&l3->list_lock);

/* Make slab active. */
- list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
+ list_add_tail(&slabp->list, &(l3->slabs_free));
STATS_INC_GROWN(cachep);
- list3_data(cachep)->free_objects += cachep->num;
- spin_unlock(&cachep->spinlock);
+ l3->free_objects += cachep->num;
+ spin_unlock(&l3->list_lock);
return 1;
opps1:
kmem_freepages(cachep, objp);
@@ -1971,7 +2355,6 @@
kmem_bufctl_t i;
int entries = 0;

- check_spinlock_acquired(cachep);
/* Check slab's freelist to see if this obj is there. */
for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
entries++;
@@ -2014,10 +2397,11 @@
*/
batchcount = BATCHREFILL_LIMIT;
}
- l3 = list3_data(cachep);
+ l3 = cachep->nodelists[numa_node_id()];
+
+ BUG_ON(ac->avail > 0 || !l3);
+ spin_lock(&l3->list_lock);

- BUG_ON(ac->avail > 0);
- spin_lock(&cachep->spinlock);
if (l3->shared) {
struct array_cache *shared_array = l3->shared;
if (shared_array->avail) {
@@ -2025,8 +2409,9 @@
batchcount = shared_array->avail;
shared_array->avail -= batchcount;
ac->avail = batchcount;
- memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
- sizeof(void*)*batchcount);
+ memcpy(ac->entry,
+ &(shared_array->entry[shared_array->avail]),
+ sizeof(void*)*batchcount);
shared_array->touched = 1;
goto alloc_done;
}
@@ -2053,7 +2438,8 @@
STATS_SET_HIGH(cachep);

/* get obj pointer */
- ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
+ ac->entry[ac->avail++] = slabp->s_mem +
+ slabp->free*cachep->objsize;

slabp->inuse++;
next = slab_bufctl(slabp)[slabp->free];
@@ -2075,12 +2461,12 @@
must_grow:
l3->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&cachep->spinlock);
+ spin_unlock(&l3->list_lock);

if (unlikely(!ac->avail)) {
int x;
- x = cache_grow(cachep, flags, -1);
-
+ x = cache_grow(cachep, flags, numa_node_id());
+
// cache_grow can reenable interrupts, then ac could change.
ac = ac_data(cachep);
if (!x && ac->avail == 0) // no objects in sight? abort
@@ -2090,7 +2476,7 @@
goto retry;
}
ac->touched = 1;
- return ac_entry(ac)[--ac->avail];
+ return ac->entry[--ac->avail];
}
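The per-cpu array cache behaves as a simple LIFO stack of object pointers: frees push with ac->entry[ac->avail++], allocations pop with ac->entry[--ac->avail], so the most recently freed (and likely still cache-warm) object is handed out first. A toy user-space model of that stack; the toy_ac type and LIMIT are invented:

#include <assert.h>
#include <stdio.h>

#define LIMIT 8                         /* stand-in for ac->limit */

struct toy_ac {
        int avail;
        void *entry[LIMIT];
};

/* free path: push the just-released pointer on top of the stack */
static void toy_push(struct toy_ac *ac, void *obj)
{
        assert(ac->avail < LIMIT);
        ac->entry[ac->avail++] = obj;
}

/* alloc path: pop the most recently freed pointer */
static void *toy_pop(struct toy_ac *ac)
{
        assert(ac->avail > 0);
        return ac->entry[--ac->avail];
}

int main(void)
{
        struct toy_ac ac = { .avail = 0 };
        int x, y;

        toy_push(&ac, &x);
        toy_push(&ac, &y);
        printf("pop %p, expected %p\n", toy_pop(&ac), (void *)&y);
        return 0;
}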

static inline void
@@ -2171,7 +2557,7 @@
if (likely(ac->avail)) {
STATS_INC_ALLOCHIT(cachep);
ac->touched = 1;
- objp = ac_entry(ac)[--ac->avail];
+ objp = ac->entry[--ac->avail];
} else {
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags);
@@ -2181,29 +2567,102 @@
return objp;
}

-/*
- * NUMA: different approach needed if the spinlock is moved into
- * the l3 structure
+#ifdef CONFIG_NUMA
+/*
+ * An interface to enable slab creation on nodeid
*/
+static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
+{
+ struct list_head *entry;
+ struct slab *slabp;
+ struct kmem_list3 *l3;
+ void *obj;
+ kmem_bufctl_t next;
+ int x;

+ l3 = cachep->nodelists[nodeid];
+ BUG_ON(!l3);
+
+retry:
+ spin_lock(&l3->list_lock);
+ entry = l3->slabs_partial.next;
+ if (entry == &l3->slabs_partial) {
+ l3->free_touched = 1;
+ entry = l3->slabs_free.next;
+ if (entry == &l3->slabs_free)
+ goto must_grow;
+ }
+
+ slabp = list_entry(entry, struct slab, list);
+ check_spinlock_acquired_node(cachep, nodeid);
+ check_slabp(cachep, slabp);
+
+ STATS_INC_NODEALLOCS(cachep);
+ STATS_INC_ACTIVE(cachep);
+ STATS_SET_HIGH(cachep);
+
+ BUG_ON(slabp->inuse == cachep->num);
+
+ /* get obj pointer */
+ obj = slabp->s_mem + slabp->free*cachep->objsize;
+ slabp->inuse++;
+ next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_ALLOC;
+#endif
+ slabp->free = next;
+ check_slabp(cachep, slabp);
+ l3->free_objects--;
+ /* move slabp to correct slabp list: */
+ list_del(&slabp->list);
+
+ if (slabp->free == BUFCTL_END) {
+ list_add(&slabp->list, &l3->slabs_full);
+ } else {
+ list_add(&slabp->list, &l3->slabs_partial);
+ }
+
+ spin_unlock(&l3->list_lock);
+ goto done;
+
+must_grow:
+ spin_unlock(&l3->list_lock);
+ x = cache_grow(cachep, flags, nodeid);
+
+ if (!x)
+ return NULL;
+
+ goto retry;
+done:
+ return obj;
+}
+#endif
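Inside __cache_alloc_node() the object is taken off the slab's embedded free list: slabp->free holds the index of the next free object, and slab_bufctl(slabp)[i] links index i to the free object after it, with BUFCTL_END terminating the chain. A toy user-space model of that index chain; the sizes and the BUFCTL_END value are stand-ins:

#include <stdio.h>

#define NUM         4                   /* objects per toy slab */
#define BUFCTL_END  ((unsigned int)~0U) /* stand-in sentinel */

struct toy_slab {
        unsigned int free;              /* index of first free object */
        unsigned int bufctl[NUM];       /* bufctl[i] = next free after i */
};

static int toy_alloc_index(struct toy_slab *s)
{
        unsigned int i = s->free;

        if (i == BUFCTL_END)
                return -1;              /* slab is full */
        s->free = s->bufctl[i];         /* unlink: advance to the next free */
        return (int)i;
}

int main(void)
{
        /* freshly grown slab: 0 -> 1 -> 2 -> 3 -> END */
        struct toy_slab s = { .free = 0, .bufctl = { 1, 2, 3, BUFCTL_END } };
        int i;

        for (i = 0; i < NUM + 1; i++)
                printf("got index %d\n", toy_alloc_index(&s));
        return 0;
}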
+
+/*
+ * Caller needs to acquire the correct kmem_list3's list_lock
+ */
static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
{
int i;
-
- check_spinlock_acquired(cachep);
-
- /* NUMA: move add into loop */
- cachep->lists.free_objects += nr_objects;
+ struct kmem_list3 *l3;

for (i = 0; i < nr_objects; i++) {
void *objp = objpp[i];
struct slab *slabp;
unsigned int objnr;
+ int nodeid = 0;

slabp = GET_PAGE_SLAB(virt_to_page(objp));
+#ifdef CONFIG_NUMA
+ nodeid = slabp->nodeid;
+#endif
+ l3 = cachep->nodelists[nodeid];
list_del(&slabp->list);
objnr = (objp - slabp->s_mem) / cachep->objsize;
+ check_spinlock_acquired_node(cachep, nodeid);
check_slabp(cachep, slabp);
+
+
#if 0 /* disabled, not compatible with leak detection */
if (slab_bufctl(slabp)[objnr] != BUFCTL_ALLOC) {
printk(KERN_ERR "slab: double free detected in cache "
@@ -2215,24 +2674,23 @@
slabp->free = objnr;
STATS_DEC_ACTIVE(cachep);
slabp->inuse--;
+ l3->free_objects++;
check_slabp(cachep, slabp);

/* fixup slab chains */
if (slabp->inuse == 0) {
- if (cachep->lists.free_objects > cachep->free_limit) {
- cachep->lists.free_objects -= cachep->num;
+ if (l3->free_objects > l3->free_limit) {
+ l3->free_objects -= cachep->num;
slab_destroy(cachep, slabp);
} else {
- list_add(&slabp->list,
- &list3_data_ptr(cachep, objp)->slabs_free);
+ list_add(&slabp->list, &l3->slabs_free);
}
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&slabp->list,
- &list3_data_ptr(cachep, objp)->slabs_partial);
+ list_add_tail(&slabp->list, &l3->slabs_partial);
}
}
}
@@ -2240,36 +2698,38 @@
static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
{
int batchcount;
+ struct kmem_list3 *l3;

batchcount = ac->batchcount;
#if DEBUG
BUG_ON(!batchcount || batchcount > ac->avail);
#endif
check_irq_off();
- spin_lock(&cachep->spinlock);
- if (cachep->lists.shared) {
- struct array_cache *shared_array = cachep->lists.shared;
+ l3 = cachep->nodelists[numa_node_id()];
+ spin_lock(&l3->list_lock);
+ if (l3->shared) {
+ struct array_cache *shared_array = l3->shared;
int max = shared_array->limit-shared_array->avail;
if (max) {
if (batchcount > max)
batchcount = max;
- memcpy(&ac_entry(shared_array)[shared_array->avail],
- &ac_entry(ac)[0],
+ memcpy(&(shared_array->entry[shared_array->avail]),
+ ac->entry,
sizeof(void*)*batchcount);
shared_array->avail += batchcount;
goto free_done;
}
}

- free_block(cachep, &ac_entry(ac)[0], batchcount);
+ free_block(cachep, ac->entry, batchcount);
free_done:
#if STATS
{
int i = 0;
struct list_head *p;

- p = list3_data(cachep)->slabs_free.next;
- while (p != &(list3_data(cachep)->slabs_free)) {
+ p = l3->slabs_free.next;
+ while (p != &(l3->slabs_free)) {
struct slab *slabp;

slabp = list_entry(p, struct slab, list);
@@ -2281,12 +2741,13 @@
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&cachep->spinlock);
+ spin_unlock(&l3->list_lock);
ac->avail -= batchcount;
- memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
+ memmove(ac->entry, &(ac->entry[batchcount]),
sizeof(void*)*ac->avail);
}

+
/*
* __cache_free
* Release an obj back to its cache. If the obj has a constructed
@@ -2301,14 +2762,46 @@
check_irq_off();
objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));

+ /* Make sure we are not freeing an object from another
+ * node to the array cache on this cpu.
+ */
+#ifdef CONFIG_NUMA
+ {
+ struct slab *slabp;
+ slabp = GET_PAGE_SLAB(virt_to_page(objp));
+ if (unlikely(slabp->nodeid != numa_node_id())) {
+ struct array_cache *alien = NULL;
+ int nodeid = slabp->nodeid;
+ struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];
+
+ STATS_INC_NODEFREES(cachep);
+ if (l3->alien && l3->alien[nodeid]) {
+ alien = l3->alien[nodeid];
+ spin_lock(&alien->lock);
+ if (unlikely(alien->avail == alien->limit))
+ __drain_alien_cache(cachep,
+ alien, nodeid);
+ alien->entry[alien->avail++] = objp;
+ spin_unlock(&alien->lock);
+ } else {
+ spin_lock(&(cachep->nodelists[nodeid])->
+ list_lock);
+ free_block(cachep, &objp, 1);
+ spin_unlock(&(cachep->nodelists[nodeid])->
+ list_lock);
+ }
+ return;
+ }
+ }
+#endif
if (likely(ac->avail < ac->limit)) {
STATS_INC_FREEHIT(cachep);
- ac_entry(ac)[ac->avail++] = objp;
+ ac->entry[ac->avail++] = objp;
return;
} else {
STATS_INC_FREEMISS(cachep);
cache_flusharray(cachep, ac);
- ac_entry(ac)[ac->avail++] = objp;
+ ac->entry[ac->avail++] = objp;
}
}
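The CONFIG_NUMA branch above batches wrong-node frees: an object that belongs to another node is parked in this node's alien cache for that node, and only when that small array is full is the whole batch drained back under the owning node's list_lock. A toy user-space model of the batching decision; ALIEN_LIMIT and the toy types are invented:

#include <stdio.h>

#define ALIEN_LIMIT 4                   /* stand-in for alien->limit */

struct toy_alien {
        int avail;
        void *entry[ALIEN_LIMIT];
};

static void toy_drain(struct toy_alien *a, int node)
{
        /* stands in for __drain_alien_cache(): give the batch back */
        printf("draining %d objects back to node %d\n", a->avail, node);
        a->avail = 0;
}

static void toy_free_remote(struct toy_alien *a, int node, void *obj)
{
        if (a->avail == ALIEN_LIMIT)    /* batch is full: flush to home node */
                toy_drain(a, node);
        a->entry[a->avail++] = obj;     /* otherwise just queue the pointer */
}

int main(void)
{
        struct toy_alien alien = { .avail = 0 };
        char objs[10];
        int i;

        for (i = 0; i < 10; i++)
                toy_free_remote(&alien, 1, &objs[i]);
        return 0;
}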

@@ -2378,78 +2871,24 @@
* Identical to kmem_cache_alloc, except that this function is slow
* and can sleep. And it will allocate memory on the given node, which
* can improve the performance for cpu bound structures.
+ * New and improved: it will now make sure that the object gets
+ * put on the correct node list so that there is no false sharing.
*/
void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
{
- int loop;
- void *objp;
- struct slab *slabp;
- kmem_bufctl_t next;
-
- for (loop = 0;;loop++) {
- struct list_head *q;
-
- objp = NULL;
- check_irq_on();
- spin_lock_irq(&cachep->spinlock);
- /* walk through all partial and empty slab and find one
- * from the right node */
- list_for_each(q,&cachep->lists.slabs_partial) {
- slabp = list_entry(q, struct slab, list);
-
- if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
- loop > 2)
- goto got_slabp;
- }
- list_for_each(q, &cachep->lists.slabs_free) {
- slabp = list_entry(q, struct slab, list);
-
- if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
- loop > 2)
- goto got_slabp;
- }
- spin_unlock_irq(&cachep->spinlock);
-
- local_irq_disable();
- if (!cache_grow(cachep, flags, nodeid)) {
- local_irq_enable();
- return NULL;
- }
- local_irq_enable();
- }
-got_slabp:
- /* found one: allocate object */
- check_slabp(cachep, slabp);
- check_spinlock_acquired(cachep);
-
- STATS_INC_ALLOCED(cachep);
- STATS_INC_ACTIVE(cachep);
- STATS_SET_HIGH(cachep);
- STATS_INC_NODEALLOCS(cachep);
-
- objp = slabp->s_mem + slabp->free*cachep->objsize;
-
- slabp->inuse++;
- next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
- slab_bufctl(slabp)[slabp->free] = BUFCTL_ALLOC;
-#endif
- slabp->free = next;
- check_slabp(cachep, slabp);
+ unsigned long save_flags;
+ void *ptr;

- /* move slabp to correct slabp list: */
- list_del(&slabp->list);
- if (slabp->free == BUFCTL_END)
- list_add(&slabp->list, &cachep->lists.slabs_full);
- else
- list_add(&slabp->list, &cachep->lists.slabs_partial);
+ if (nodeid == numa_node_id() || nodeid == -1)
+ return __cache_alloc(cachep, flags);

- list3_data(cachep)->free_objects--;
- spin_unlock_irq(&cachep->spinlock);
+ cache_alloc_debugcheck_before(cachep, flags);
+ local_irq_save(save_flags);
+ ptr = __cache_alloc_node(cachep, flags, nodeid);
+ local_irq_restore(save_flags);
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));

- objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
- __builtin_return_address(0));
- return objp;
+ return ptr;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
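With the per-node slab lists in place, kmem_cache_alloc_node() becomes a cheap way to place per-node control structures on their home node. A hedged usage sketch; the my_node_data type, the my_data array and the function name are invented, while the allocator call is the one exported here (error unwinding omitted for brevity):

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/errno.h>

struct my_node_data {
        spinlock_t lock;
        unsigned long count;
};

static struct my_node_data *my_data[MAX_NUMNODES];

static int my_alloc_per_node(kmem_cache_t *cachep)
{
        int node;

        for_each_online_node(node) {
                /* object memory comes from 'node', not from the local node */
                my_data[node] = kmem_cache_alloc_node(cachep,
                                                      GFP_KERNEL, node);
                if (!my_data[node])
                        return -ENOMEM;
                spin_lock_init(&my_data[node]->lock);
                my_data[node]->count = 0;
        }
        return 0;
}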

@@ -2519,11 +2958,18 @@
if (!pdata)
return NULL;

- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
- pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL,
- cpu_to_node(i));
+ /*
+ * Cannot use for_each_online_cpu since a cpu may come online
+ * later and we would have no way of fixing up the array we
+ * have already allocated by then.
+ */
+ for_each_cpu(i) {
+ int node = cpu_to_node(i);
+
+ if (node_online(node))
+ pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
+ else
+ pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);

if (!pdata->ptrs[i])
goto unwind_oom;
@@ -2619,11 +3065,11 @@
int i;
struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);

- for (i = 0; i < NR_CPUS; i++) {
- if (!cpu_possible(i))
- continue;
+ /*
+ * We allocate for all possible cpus, so we cannot iterate over online cpus only here.
+ */
+ for_each_cpu(i)
kfree(p->ptrs[i]);
- }
kfree(p);
}
EXPORT_SYMBOL(free_percpu);
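Both loops above intentionally walk all possible cpus, because the per-cpu pointers are populated for every possible cpu at allocation time. For context, a hedged sketch of typical use of this interface; the my_stats names are invented, while alloc_percpu(), per_cpu_ptr() and free_percpu() are the existing helpers:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>

struct my_stats {
        unsigned long hits;
};

static struct my_stats *my_stats_pcpu;

static int __init my_stats_init(void)
{
        int cpu;

        my_stats_pcpu = alloc_percpu(struct my_stats);
        if (!my_stats_pcpu)
                return -ENOMEM;
        for_each_online_cpu(cpu)
                per_cpu_ptr(my_stats_pcpu, cpu)->hits = 0;
        return 0;
}

static void __exit my_stats_exit(void)
{
        free_percpu(my_stats_pcpu);
}

module_init(my_stats_init);
module_exit(my_stats_exit);
MODULE_LICENSE("GPL");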
@@ -2679,43 +3125,124 @@
{
struct list_head *q;
struct slab *slabp;
+ int node;
+ struct kmem_list3 *l3;

check_spinlock_acquired(cachep);

- list_for_each(q,&cachep->lists.slabs_full) {
- slabp = list_entry(q, struct slab, list);
+ for_each_online_node(node) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ list_for_each(q,&l3->slabs_full) {
+ slabp = list_entry(q, struct slab, list);

- if (slabp->inuse != cachep->num) {
- printk(KERN_INFO "slab %s: wrong slabp found in full slab chain at %p (%d/%d).\n",
- cachep->name, slabp, slabp->inuse, cachep->num);
+ if (slabp->inuse != cachep->num) {
+ printk(KERN_INFO "slab %s: wrong slabp found in full slab chain at %p (%d/%d).\n",
+ cachep->name, slabp, slabp->inuse, cachep->num);
+ }
+ check_slabp(cachep, slabp);
+ check_slabuse(cachep, slabp);
}
- check_slabp(cachep, slabp);
- check_slabuse(cachep, slabp);
- }
- list_for_each(q,&cachep->lists.slabs_partial) {
- slabp = list_entry(q, struct slab, list);
+ list_for_each(q,&l3->slabs_partial) {
+ slabp = list_entry(q, struct slab, list);

- if (slabp->inuse == cachep->num || slabp->inuse == 0) {
- printk(KERN_INFO "slab %s: wrong slab found in partial chain at %p (%d/%d).\n",
- cachep->name, slabp, slabp->inuse, cachep->num);
+ if (slabp->inuse == cachep->num || slabp->inuse == 0) {
+ printk(KERN_INFO "slab %s: wrong slab found in partial chain at %p (%d/%d).\n",
+ cachep->name, slabp, slabp->inuse, cachep->num);
+ }
+ check_slabp(cachep, slabp);
+ check_slabuse(cachep, slabp);
}
- check_slabp(cachep, slabp);
- check_slabuse(cachep, slabp);
- }
- list_for_each(q,&cachep->lists.slabs_free) {
- slabp = list_entry(q, struct slab, list);
+ list_for_each(q,&l3->slabs_free) {
+ slabp = list_entry(q, struct slab, list);

- if (slabp->inuse != 0) {
- printk(KERN_INFO "slab %s: wrong slab found in free chain at %p (%d/%d).\n",
- cachep->name, slabp, slabp->inuse, cachep->num);
+ if (slabp->inuse != 0) {
+ printk(KERN_INFO "slab %s: wrong slab found in free chain at %p (%d/%d).\n",
+ cachep->name, slabp, slabp->inuse, cachep->num);
+ }
+ check_slabp(cachep, slabp);
+ check_slabuse(cachep, slabp);
}
- check_slabp(cachep, slabp);
- check_slabuse(cachep, slabp);
}
}

#endif

+/*
+ * This initializes the kmem_list3 structures for all nodes that have online cpus.
+ */
+static int alloc_kmemlist(kmem_cache_t *cachep)
+{
+ int node, i;
+ struct kmem_list3 *l3;
+ int err = 0;
+
+ for_each_online_cpu(i) {
+ struct array_cache *nc = NULL, *new;
+#ifdef CONFIG_NUMA
+ struct array_cache **new_alien = NULL;
+#endif
+ node = cpu_to_node(i);
+#ifdef CONFIG_NUMA
+ if (!(new_alien = alloc_alien_cache(i, cachep->limit)))
+ goto fail;
+#endif
+ if (!(new = alloc_arraycache(i, (cachep->shared*
+ cachep->batchcount), 0xbaadf00d)))
+ goto fail;
+ if ((l3 = cachep->nodelists[node])) {
+
+ spin_lock_irq(&l3->list_lock);
+
+ if ((nc = cachep->nodelists[node]->shared))
+ free_block(cachep, nc->entry,
+ nc->avail);
+
+ l3->shared = new;
+#ifdef CONFIG_NUMA
+ if (!cachep->nodelists[node]->alien) {
+ l3->alien = new_alien;
+ new_alien = NULL;
+ }
+ l3->free_limit = (1 + nr_cpus_node(node))*
+ cachep->batchcount + cachep->num;
+#else
+ l3->free_limit = (1 + num_online_cpus())*
+ cachep->batchcount + cachep->num;
+#endif
+ spin_unlock_irq(&l3->list_lock);
+ kfree(nc);
+#ifdef CONFIG_NUMA
+ free_alien_cache(new_alien);
+#endif
+ continue;
+ }
+ if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
+ GFP_KERNEL, node)))
+ goto fail;
+
+ LIST3_INIT(l3);
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+ ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+ l3->shared = new;
+#ifdef CONFIG_NUMA
+ l3->alien = new_alien;
+ l3->free_limit = (1 + nr_cpus_node(node))*
+ cachep->batchcount + cachep->num;
+#else
+ l3->free_limit = (1 + num_online_cpus())*
+ cachep->batchcount + cachep->num;
+#endif
+ cachep->nodelists[node] = l3;
+ }
+ return err;
+fail:
+ err = -ENOMEM;
+ return err;
+}
+
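The per-node free_limit set above bounds how many free objects a node may keep before free_block() starts destroying empty slabs. A quick worked example of the formula with invented numbers (4 cpus on the node, batchcount 60, 30 objects per slab):

#include <stdio.h>

int main(void)
{
        int cpus_on_node = 4;           /* nr_cpus_node(node), example value */
        int batchcount = 60;            /* cachep->batchcount, example value */
        int objs_per_slab = 30;         /* cachep->num, example value */
        int free_limit;

        free_limit = (1 + cpus_on_node) * batchcount + objs_per_slab;
        printf("free_limit = %d objects kept per node\n", free_limit);
        return 0;
}

With these example numbers the node keeps up to 330 free objects before empty slabs start being destroyed.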
struct ccupdate_struct {
kmem_cache_t *cachep;
struct array_cache *new[NR_CPUS];
@@ -2738,54 +3265,43 @@
int shared)
{
struct ccupdate_struct new;
- struct array_cache *new_shared;
- int i;
+ int i, err;

memset(&new.new,0,sizeof(new.new));
- for (i = 0; i < NR_CPUS; i++) {
- if (cpu_online(i)) {
- new.new[i] = alloc_arraycache(i, limit, batchcount);
- if (!new.new[i]) {
- for (i--; i >= 0; i--) kfree(new.new[i]);
- return -ENOMEM;
- }
- } else {
- new.new[i] = NULL;
+ for_each_online_cpu(i) {
+ new.new[i] = alloc_arraycache(i, limit, batchcount);
+ if (!new.new[i]) {
+ for (i--; i >= 0; i--) kfree(new.new[i]);
+ return -ENOMEM;
}
}
new.cachep = cachep;

smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
-
+
check_irq_on();
spin_lock_irq(&cachep->spinlock);
cachep->batchcount = batchcount;
cachep->limit = limit;
- cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
+ cachep->shared = shared;
spin_unlock_irq(&cachep->spinlock);

- for (i = 0; i < NR_CPUS; i++) {
+ for_each_online_cpu(i) {
struct array_cache *ccold = new.new[i];
if (!ccold)
continue;
- spin_lock_irq(&cachep->spinlock);
- free_block(cachep, ac_entry(ccold), ccold->avail);
- spin_unlock_irq(&cachep->spinlock);
+ spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+ free_block(cachep, ccold->entry, ccold->avail);
+ spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
kfree(ccold);
}
- new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
- if (new_shared) {
- struct array_cache *old;
-
- spin_lock_irq(&cachep->spinlock);
- old = cachep->lists.shared;
- cachep->lists.shared = new_shared;
- if (old)
- free_block(cachep, ac_entry(old), old->avail);
- spin_unlock_irq(&cachep->spinlock);
- kfree(old);
- }

+ err = alloc_kmemlist(cachep);
+ if (err) {
+ printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
+ cachep->name, -err);
+ BUG();
+ }
return 0;
}

@@ -2843,11 +3359,11 @@
}

static void drain_array_locked(kmem_cache_t *cachep,
- struct array_cache *ac, int force)
+ struct array_cache *ac, int force, int node)
{
int tofree;

- check_spinlock_acquired(cachep);
+ check_spinlock_acquired_node(cachep, node);
if (ac->touched && !force) {
ac->touched = 0;
} else if (ac->avail) {
@@ -2855,9 +3371,9 @@
if (tofree > ac->avail) {
tofree = (ac->avail+1)/2;
}
- free_block(cachep, ac_entry(ac), tofree);
+ free_block(cachep, ac->entry, tofree);
ac->avail -= tofree;
- memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
+ memmove(ac->entry, &(ac->entry[tofree]),
sizeof(void*)*ac->avail);
}
}
@@ -2876,6 +3392,7 @@
static void cache_reap(void *unused)
{
struct list_head *walk;
+ struct kmem_list3 *l3;

if (down_trylock(&cache_chain_sem)) {
/* Give up. Setup the next iteration. */
@@ -2896,33 +3413,40 @@

check_irq_on();

- spin_lock_irq(&searchp->spinlock);
+ l3 = searchp->nodelists[numa_node_id()];
+#ifdef CONFIG_NUMA
+ if (l3->alien)
+ drain_alien_cache(searchp, l3);
+#endif
+ spin_lock_irq(&l3->list_lock);

- drain_array_locked(searchp, ac_data(searchp), 0);
+ drain_array_locked(searchp, ac_data(searchp), 0,
+ numa_node_id());

#if DEBUG
- if(time_before(searchp->redzonetest, jiffies)) {
+ if (time_before(searchp->redzonetest, jiffies)) {
searchp->redzonetest = jiffies + REDZONETIMEOUT;
check_redzone(searchp);
}
#endif
- if(time_after(searchp->lists.next_reap, jiffies))
+ if (time_after(l3->next_reap, jiffies))
goto next_unlock;

- searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
+ l3->next_reap = jiffies + REAPTIMEOUT_LIST3;

- if (searchp->lists.shared)
- drain_array_locked(searchp, searchp->lists.shared, 0);
+ if (l3->shared)
+ drain_array_locked(searchp, l3->shared, 0,
+ numa_node_id());

- if (searchp->lists.free_touched) {
- searchp->lists.free_touched = 0;
+ if (l3->free_touched) {
+ l3->free_touched = 0;
goto next_unlock;
}

- tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
+ tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
do {
- p = list3_data(searchp)->slabs_free.next;
- if (p == &(list3_data(searchp)->slabs_free))
+ p = l3->slabs_free.next;
+ if (p == &(l3->slabs_free))
break;

slabp = list_entry(p, struct slab, list);
@@ -2935,13 +3459,13 @@
* searchp cannot disappear, we hold
* cache_chain_lock
*/
- searchp->lists.free_objects -= searchp->num;
- spin_unlock_irq(&searchp->spinlock);
+ l3->free_objects -= searchp->num;
+ spin_unlock_irq(&l3->list_lock);
slab_destroy(searchp, slabp);
- spin_lock_irq(&searchp->spinlock);
+ spin_lock_irq(&l3->list_lock);
} while(--tofree > 0);
next_unlock:
- spin_unlock_irq(&searchp->spinlock);
+ spin_unlock_irq(&l3->list_lock);
next:
cond_resched();
}
@@ -2974,7 +3498,7 @@
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#if STATS
seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
- " <error> <maxfreeable> <freelimit> <nodeallocs>");
+ " <error> <maxfreeable> <nodeallocs> <remotefrees>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
@@ -3009,39 +3533,53 @@
unsigned long active_objs;
unsigned long num_objs;
unsigned long active_slabs = 0;
- unsigned long num_slabs;
- const char *name;
+ unsigned long num_slabs, free_objects = 0, shared_avail = 0;
+ const char *name;
char *error = NULL;
+ int node;
+ struct kmem_list3 *l3;

check_irq_on();
spin_lock_irq(&cachep->spinlock);
active_objs = 0;
num_slabs = 0;
- list_for_each(q,&cachep->lists.slabs_full) {
- slabp = list_entry(q, struct slab, list);
- if (slabp->inuse != cachep->num && !error)
- error = "slabs_full accounting error";
- active_objs += cachep->num;
- active_slabs++;
- }
- list_for_each(q,&cachep->lists.slabs_partial) {
- slabp = list_entry(q, struct slab, list);
- if (slabp->inuse == cachep->num && !error)
- error = "slabs_partial inuse accounting error";
- if (!slabp->inuse && !error)
- error = "slabs_partial/inuse accounting error";
- active_objs += slabp->inuse;
- active_slabs++;
- }
- list_for_each(q,&cachep->lists.slabs_free) {
- slabp = list_entry(q, struct slab, list);
- if (slabp->inuse && !error)
- error = "slabs_free/inuse accounting error";
- num_slabs++;
+ for_each_online_node(node) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ spin_lock(&l3->list_lock);
+
+ list_for_each(q,&l3->slabs_full) {
+ slabp = list_entry(q, struct slab, list);
+ if (slabp->inuse != cachep->num && !error)
+ error = "slabs_full accounting error";
+ active_objs += cachep->num;
+ active_slabs++;
+ }
+ list_for_each(q,&l3->slabs_partial) {
+ slabp = list_entry(q, struct slab, list);
+ if (slabp->inuse == cachep->num && !error)
+ error = "slabs_partial inuse accounting error";
+ if (!slabp->inuse && !error)
+ error = "slabs_partial/inuse accounting error";
+ active_objs += slabp->inuse;
+ active_slabs++;
+ }
+ list_for_each(q,&l3->slabs_free) {
+ slabp = list_entry(q, struct slab, list);
+ if (slabp->inuse && !error)
+ error = "slabs_free/inuse accounting error";
+ num_slabs++;
+ }
+ free_objects += l3->free_objects;
+ shared_avail += l3->shared->avail;
+
+ spin_unlock(&l3->list_lock);
}
num_slabs+=active_slabs;
num_objs = num_slabs*cachep->num;
- if (num_objs - active_objs != cachep->lists.free_objects && !error)
+ if (num_objs - active_objs != free_objects && !error)
error = "free_objects accounting error";

name = cachep->name;
@@ -3053,9 +3591,9 @@
cachep->num, (1<<cachep->gfporder));
seq_printf(m, " : tunables %4u %4u %4u",
cachep->limit, cachep->batchcount,
- cachep->lists.shared->limit/cachep->batchcount);
- seq_printf(m, " : slabdata %6lu %6lu %6u",
- active_slabs, num_slabs, cachep->lists.shared->avail);
+ cachep->shared);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu",
+ active_slabs, num_slabs, shared_avail);
#if STATS
{ /* list3 stats */
unsigned long high = cachep->high_mark;
@@ -3064,12 +3602,13 @@
unsigned long reaped = cachep->reaped;
unsigned long errors = cachep->errors;
unsigned long max_freeable = cachep->max_freeable;
- unsigned long free_limit = cachep->free_limit;
unsigned long node_allocs = cachep->node_allocs;
+ unsigned long node_frees = cachep->node_frees;

- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
- allocs, high, grown, reaped, errors,
- max_freeable, free_limit, node_allocs);
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
+ %4lu %4lu %4lu %4lu",
+ allocs, high, grown, reaped, errors,
+ max_freeable, node_allocs, node_frees);
}
/* cpu stats */
{
@@ -3112,19 +3651,27 @@
{
#if DEBUG
struct list_head *q;
+ int node;
+ struct kmem_list3 *l3;

check_irq_on();
spin_lock_irq(&cachep->spinlock);
- list_for_each(q,&cachep->lists.slabs_full) {
- struct slab *slabp;
- int i;
- slabp = list_entry(q, struct slab, list);
- for (i = 0; i < cachep->num; i++) {
- unsigned long sym = slab_bufctl(slabp)[i];
+ for_each_online_node(node) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;

- printk("obj %p/%d: %p", slabp, i, (void *)sym);
- print_symbol(" <%s>", sym);
- printk("\n");
+ list_for_each(q,&l3->slabs_full) {
+ struct slab *slabp;
+ int i;
+ slabp = list_entry(q, struct slab, list);
+ for (i = 0; i < cachep->num; i++) {
+ unsigned long sym = slab_bufctl(slabp)[i];
+
+ printk("obj %p/%d: %p", slabp, i, (void *)sym);
+ print_symbol(" <%s>", sym);
+ printk("\n");
+ }
}
}
spin_unlock_irq(&cachep->spinlock);