2010-08-04 02:53:48

by Christoph Lameter

Subject: [S+Q3 20/23] slub: Shared cache to exploit cross cpu caching abilities.

Strictly a performance enhancement through better tracking of objects
that are likely still in the lowest-level cpu caches of the processors.

SLAB uses one shared cache per NUMA node, or a single global one. That
is not satisfactory for contemporary cpus, which may have multiple
independent cpu caches per node. In that situation SLAB treats
cache-cold objects like cache-hot objects.

The shared caches of slub are instead per physical cpu cache: one shared
cache serves all cpus attached to that cache, and shared cache content
will not cross physical cache boundaries.
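
In condensed form, the queue handling added here looks as follows (a
sketch built from the helpers introduced in the patch below, not the
literal slub fast paths):

	struct kmem_cache_queue *q = &c->q;		/* this cpu's queue */
	struct kmem_cache_queue *shared = q->shared;	/* queue of the cache domain */

	/*
	 * Allocation: when the cpu queue is empty, refill it with the
	 * hottest objects from the shared queue before falling back to
	 * partial slabs or the page allocator.
	 */
	if (queue_empty(q) && shared && !queue_empty(shared))
		q->objects = get_shared_objects(shared, q->object, s->batch);

	/*
	 * Free: when the cpu queue is full, push a batch of the coldest
	 * objects into the shared queue where other cpus behind the same
	 * physical cache can pick them up. (The objects remaining in the
	 * cpu queue are shifted down afterwards; see slab_free below.)
	 */
	if (queue_full(q) && shared && !queue_full(shared))
		q->objects -= put_shared_objects(shared, q->object, s->batch);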

The shared cache size can be configured dynamically via
/sys/kernel/slab/<cache>/shared_queue_size
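
Writing that file resizes and reallocates the shared caches on the fly,
e.g. (value illustrative):

	echo 100 > /sys/kernel/slab/kmalloc-64/shared_queue_size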

The current shared cache state is available via
cat /sys/kernel/slab/<cache>/shared_caches

Shared caches are always allocated in the sizes available in the kmalloc
array; the requested size is rounded up to the next available size.

F.e. on my Dell with 8 cpus in 2 packages, in which each pair of cpus
shares an L2 cache, I get:

christoph@:/sys/kernel/slab$ cat kmalloc-64/shared_caches
384 C0,2=66/126 C1,3=126/126 C4,6=126/126 C5,7=66/126
christoph@:/sys/kernel/slab$ cat kmalloc-64/per_cpu_caches
617 C0=54/125 C1=37/125 C2=102/125 C3=76/125 C4=81/125 C5=108/125 C6=72/125 C7=87/125

Signed-off-by: Christoph Lameter <[email protected]>

---
include/linux/slub_def.h | 9 +
mm/slub.c | 423 ++++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 405 insertions(+), 27 deletions(-)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h 2010-08-03 13:04:49.000000000 -0500
+++ linux-2.6/include/linux/slub_def.h 2010-08-03 15:52:01.000000000 -0500
@@ -24,6 +24,8 @@ enum stat_item {
ALLOC_FROM_PARTIAL, /* slab with objects acquired from partial */
ALLOC_SLAB, /* New slab acquired from page allocator */
ALLOC_REMOTE, /* Allocation from remote slab */
+ ALLOC_SHARED, /* Allocation caused a shared cache transaction */
+ FREE_SHARED, /* Free caused a shared cache transaction */
FREE_ALIEN, /* Free to alien node */
FREE_SLAB, /* Slab freed to the page allocator */
QUEUE_FLUSH, /* Flushing of the per cpu queue */
@@ -34,6 +36,10 @@ enum stat_item {
struct kmem_cache_queue {
int objects; /* Available objects */
int max; /* Queue capacity */
+ union {
+ struct kmem_cache_queue *shared; /* cpu q -> shared q */
+ spinlock_t lock; /* shared queue: lock */
+ };
void *object[];
};

@@ -87,12 +93,15 @@ struct kmem_cache {
int align; /* Alignment */
int queue; /* specified queue size */
int cpu_queue; /* cpu queue size */
+ int shared_queue; /* Actual shared queue size */
+ int nr_shared; /* Total # of shared caches */
unsigned long min_partial;
const char *name; /* Name (only for display!) */
struct list_head list; /* List of slab caches */
#ifdef CONFIG_SLUB_DEBUG
struct kobject kobj; /* For sysfs */
#endif
+ int shared_queue_sysfs; /* Desired shared queue size */

#ifdef CONFIG_NUMA
/*
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2010-08-03 13:04:49.000000000 -0500
+++ linux-2.6/mm/slub.c 2010-08-03 15:52:01.000000000 -0500
@@ -1556,7 +1556,8 @@ void drain_objects(struct kmem_cache *s,
}
}

-static inline void drain_queue(struct kmem_cache *s, struct kmem_cache_queue *q, int nr)
+static inline int drain_queue(struct kmem_cache *s,
+ struct kmem_cache_queue *q, int nr)
{
int t = min(nr, q->objects);

@@ -1566,13 +1567,35 @@ static inline void drain_queue(struct km
if (q->objects)
memcpy(q->object, q->object + t,
q->objects * sizeof(void *));
+ return t;
}
+
+static inline int drain_shared_cache(struct kmem_cache *s,
+ struct kmem_cache_queue *q)
+{
+ int n = 0;
+
+ if (!q)
+ return n;
+
+ if (!queue_empty(q)) {
+ spin_lock(&q->lock);
+ if (q->objects)
+ n = drain_queue(s, q, q->objects);
+ spin_unlock(&q->lock);
+ }
+ return n;
+}
+
/*
* Drain all objects from a per cpu queue
*/
static void flush_cpu_objects(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
- drain_queue(s, &c->q, c->q.objects);
+ struct kmem_cache_queue *q = &c->q;
+
+ drain_queue(s, q, q->objects);
+ drain_shared_cache(s, q->shared);
stat(s, QUEUE_FLUSH);
}

@@ -1629,6 +1652,207 @@ struct kmem_cache_cpu *alloc_kmem_cache_
return k;
}

+/* Shared cache management */
+
+static inline int get_shared_objects(struct kmem_cache_queue *q,
+ void **l, int nr)
+{
+ int d;
+
+ spin_lock(&q->lock);
+ d = min(nr, q->objects);
+ q->objects -= d;
+ memcpy(l, q->object + q->objects, d * sizeof(void *));
+ spin_unlock(&q->lock);
+
+ return d;
+}
+
+static inline int put_shared_objects(struct kmem_cache_queue *q,
+ void **l, int nr)
+{
+ int d;
+
+ spin_lock(&q->lock);
+ d = min(nr, q->max - q->objects);
+ memcpy(q->object + q->objects, l, d * sizeof(void *));
+ q->objects += d;
+ spin_unlock(&q->lock);
+
+ return d;
+}
+
+static struct kmem_cache *get_slab(size_t size, gfp_t flags);
+
+static inline unsigned long shared_cache_size(int n)
+{
+ return n * sizeof(void *) + sizeof(struct kmem_cache_queue);
+}
+
+static inline unsigned long shared_cache_capacity(unsigned long size)
+{
+ return (size - sizeof(struct kmem_cache_queue)) / sizeof(void *);
+}
+
+static inline void init_shared_cache(struct kmem_cache_queue *q, int max)
+{
+ q->max = max;
+ spin_lock_init(&q->lock);
+}
+
+
+/* Determine a list of the active shared caches */
+static struct kmem_cache_queue **shared_caches(struct kmem_cache *s)
+{
+ int cpu;
+ struct kmem_cache_queue **caches;
+ int nr;
+ int n;
+
+ caches = kmalloc(sizeof(struct kmem_cache_queue *)
+ * (s->nr_shared + 1), GFP_KERNEL);
+ if (!caches)
+ return NULL;
+
+ nr = 0;
+
+ /* Build list of shared caches */
+ for_each_online_cpu(cpu) {
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu, cpu);
+ struct kmem_cache_queue *q = c->q.shared;
+
+ if (!q)
+ continue;
+
+ for (n = 0; n < nr; n++)
+ if (caches[n] == q)
+ break;
+
+ if (n < nr)
+ continue;
+
+ caches[nr++] = q;
+ }
+ caches[nr] = NULL;
+ BUG_ON(nr != s->nr_shared);
+ return caches;
+}
+
+/*
+ * Allocate shared cpu caches.
+ * A shared cache is allocated for each group of cpus sharing a single physical cache.
+ */
+static void alloc_shared_caches(struct kmem_cache *s)
+{
+ int cpu;
+ int max;
+ int size;
+ void *p;
+
+ if (slab_state < SYSFS || s->shared_queue_sysfs == 0)
+ return;
+
+ /*
+ * Determine the size. Round it up to the size that a kmalloc cache
+ * supporting that size has. This will often align the size to a
+ * power of 2 especially on machines that have large kmalloc
+ * alignment requirements.
+ */
+ size = shared_cache_size(s->shared_queue_sysfs);
+ if (size < PAGE_SIZE / 2)
+ size = get_slab(size, GFP_KERNEL)->objsize;
+ else
+ size = PAGE_SHIFT << get_order(size);
+
+ max = shared_cache_capacity(size);
+
+ for_each_online_cpu(cpu) {
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu, cpu);
+ struct kmem_cache_queue *l;
+ int x;
+ const struct cpumask *map =
+ per_cpu(cpu_info.llc_shared_map, cpu);
+
+ /* Skip cpus that already have assigned shared caches */
+ if (c->q.shared)
+ continue;
+
+ /* Allocate shared cache */
+ p = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, c->node);
+ if (!p) {
+ printk(KERN_WARNING "SLUB: Out of memory allocating"
+ " shared cache for %s cpu %d node %d\n",
+ s->name, cpu, c->node);
+ continue;
+ }
+
+ l = p;
+ init_shared_cache(l, max);
+
+ if (cpumask_weight(map) < 2) {
+
+ /*
+ * No information available on how to setup the shared
+ * caches. Cpu will not have shared or alien caches.
+ */
+ printk_once(KERN_WARNING "SLUB: Cache topology"
+ " information unusable. No shared caches\n");
+
+ kfree(p);
+ continue;
+ }
+
+ /* Link all cpus in this group to the shared cache */
+ for_each_cpu(x, map) {
+ struct kmem_cache_cpu *z = per_cpu_ptr(s->cpu, x);
+
+ if (z->node == c->node)
+ z->q.shared = l;
+ }
+ s->nr_shared++;
+ }
+ s->shared_queue = max;
+}
+
+/*
+ * Flush shared caches.
+ *
+ * Called from IPI handler with interrupts disabled.
+ */
+static void __remove_shared_cache(void *d)
+{
+ struct kmem_cache *s = d;
+ struct kmem_cache_cpu *c = __this_cpu_ptr(s->cpu);
+ struct kmem_cache_queue *q = c->q.shared;
+
+ c->q.shared = NULL;
+ drain_shared_cache(s, q);
+}
+
+
+static int remove_shared_caches(struct kmem_cache *s)
+{
+ struct kmem_cache_queue **caches;
+ int i;
+
+ caches = shared_caches(s);
+ if (!caches)
+ return -ENOMEM;
+
+ /* Go through a transaction on each cpu removing the pointers to the shared caches */
+ on_each_cpu(__remove_shared_cache, s, 1);
+
+ for (i = 0; i < s->nr_shared; i++) {
+ void *p = caches[i];
+
+ kfree(p);
+ }
+
+ kfree(caches);
+ s->nr_shared = 0;
+ s->shared_queue = 0;
+ return 0;
+}

static void resize_cpu_queue(struct kmem_cache *s, int queue)
{
@@ -1792,8 +2016,9 @@ static inline void refill_queue(struct k
struct kmem_cache_queue *q, struct page *page, int nr)
{
int d;
+ int batch = min_t(int, q->max, s->batch);

- d = min(s->batch - q->objects, nr);
+ d = min(batch - q->objects, nr);
retrieve_objects(s, page, q->object + q->objects, d);
q->objects += d;
}
@@ -1886,6 +2111,20 @@ redo:
q = &c->q;
if (unlikely(queue_empty(q))) {

+ struct kmem_cache_queue *l = q->shared;
+
+ if (l && !queue_empty(l)) {
+
+ /*
+ * Refill the cpu queue with the hottest objects
+ * from the shared cache queue
+ */
+ q->objects = get_shared_objects(l,
+ q->object, s->batch);
+ stat(s, ALLOC_SHARED);
+
+ }
+ else
while (q->objects < s->batch) {
struct page *new;

@@ -2022,9 +2261,22 @@ static void slab_free(struct kmem_cache

if (unlikely(queue_full(q))) {

- drain_queue(s, q, s->batch);
- stat(s, FREE_SLOWPATH);
+ struct kmem_cache_queue *l = q->shared;

+ /* Shared queue available and has space ? */
+ if (l && !queue_full(l)) {
+ /* Push coldest objects into the shared queue */
+ int d = put_shared_objects(l, q->object, s->batch);
+
+ q->objects -= d;
+ memcpy(q->object, q->object + d,
+ q->objects * sizeof(void *));
+ stat(s, FREE_SHARED);
+ }
+ if (queue_full(q))
+ drain_queue(s, q, s->batch);
+
+ stat(s, FREE_SLOWPATH);
} else
stat(s, FREE_FASTPATH);

@@ -2498,8 +2750,11 @@ static int kmem_cache_open(struct kmem_c
s->queue = initial_queue_size(s->size);
s->batch = (s->queue + 1) / 2;

- if (alloc_kmem_cache_cpus(s))
+ if (alloc_kmem_cache_cpus(s)) {
+ s->shared_queue_sysfs = s->queue;
+ alloc_shared_caches(s);
return 1;
+ }

free_kmem_cache_nodes(s);
error:
@@ -3270,12 +3525,21 @@ void __init kmem_cache_init(void)
/* Now the kmalloc array is fully functional (*not* the dma array) */
slab_state = UP;

- /* Provide the correct kmalloc names now that the caches are up */
- for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
- char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+ /*
+ * Provide the correct kmalloc names and enable the shared caches
+ * now that the kmalloc array is functional
+ */
+ for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
+ struct kmem_cache *s = kmalloc_caches[i];

- BUG_ON(!s);
- kmalloc_caches[i]->name = s;
+ if (!s)
+ continue;
+
+ if (strcmp(s->name, "kmalloc") == 0)
+ s->name = kasprintf(GFP_NOWAIT,
+ "kmalloc-%d", s->objsize);
+
+ BUG_ON(!s->name);
}

#ifdef CONFIG_SMP
@@ -3298,6 +3562,9 @@ void __init kmem_cache_init_late(void)

create_kmalloc_cache(&kmalloc_dma_caches[i],
name, s->objsize, SLAB_CACHE_DMA);
+
+ /* DMA caches are rarely used. Reduce memory consumption */
+ kmalloc_dma_caches[i]->shared_queue_sysfs = 0;
}
}
#endif
@@ -4047,10 +4314,40 @@ static ssize_t min_partial_store(struct
}
SLAB_ATTR(min_partial);

-static ssize_t cpu_queue_size_show(struct kmem_cache *s, char *buf)
+static ssize_t queue_size_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%u\n", s->queue);
}
+SLAB_ATTR_RO(queue_size);
+
+
+static ssize_t batch_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->batch);
+}
+
+static ssize_t batch_size_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ unsigned long batch;
+ int err;
+
+ err = strict_strtoul(buf, 10, &batch);
+ if (err)
+ return err;
+
+ if (batch < s->queue || batch < 4)
+ return -EINVAL;
+
+ s->batch = batch;
+ return length;
+}
+SLAB_ATTR(batch_size);
+
+static ssize_t cpu_queue_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->cpu_queue);
+}

static ssize_t cpu_queue_size_store(struct kmem_cache *s,
const char *buf, size_t length)
@@ -4075,28 +4372,82 @@ static ssize_t cpu_queue_size_store(stru
}
SLAB_ATTR(cpu_queue_size);

-static ssize_t cpu_batch_size_show(struct kmem_cache *s, char *buf)
+static ssize_t shared_queue_size_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->batch);
+ return sprintf(buf, "%u %u\n", s->shared_queue, s->shared_queue_sysfs);
}

-static ssize_t cpu_batch_size_store(struct kmem_cache *s,
+static ssize_t shared_queue_size_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- unsigned long batch;
+ unsigned long queue;
int err;

- err = strict_strtoul(buf, 10, &batch);
+ err = strict_strtoul(buf, 10, &queue);
if (err)
return err;

- if (batch < s->queue || batch < 4)
+ if (queue > 10000 || queue < 4)
return -EINVAL;

- s->batch = batch;
- return length;
+ if (s->batch > queue)
+ s->batch = queue;
+
+ down_write(&slub_lock);
+ s->shared_queue_sysfs = queue;
+ err = remove_shared_caches(s);
+ if (!err)
+ alloc_shared_caches(s);
+ up_write(&slub_lock);
+ return err ? err : length;
+}
+SLAB_ATTR(shared_queue_size);
+
+static ssize_t shared_caches_show(struct kmem_cache *s, char *buf)
+{
+ unsigned long total = 0;
+ int x, n;
+ int cpu;
+ struct kmem_cache_queue **caches;
+
+ down_read(&slub_lock);
+ caches = shared_caches(s);
+ if (!caches) {
+ up_read(&slub_lock);
+ return -ENOMEM;
+ }
+
+ for (n = 0; n < s->nr_shared; n++)
+ total += caches[n]->objects;
+
+ x = sprintf(buf, "%lu", total);
+
+ for (n = 0; n < s->nr_shared; n++) {
+ int first = 1;
+ struct kmem_cache_queue *q = caches[n];
+
+ x += sprintf(buf + x, " C");
+
+ /* Find cpus using the shared cache */
+ for_each_online_cpu(cpu) {
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu, cpu);
+
+ if (q != c->q.shared)
+ continue;
+
+ if (first)
+ first = 0;
+ else
+ x += sprintf(buf + x, ",");
+ x += sprintf(buf + x, "%d", cpu);
+ }
+ x += sprintf(buf + x, "=%d/%d", q->objects, q->max);
+ }
+ up_read(&slub_lock);
+ kfree(caches);
+ return x + sprintf(buf + x, "\n");
}
-SLAB_ATTR(cpu_batch_size);
+SLAB_ATTR_RO(shared_caches);

static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
@@ -4127,7 +4478,7 @@ static ssize_t partial_show(struct kmem_
}
SLAB_ATTR_RO(partial);

-static ssize_t cpu_queues_show(struct kmem_cache *s, char *buf)
+static ssize_t per_cpu_caches_show(struct kmem_cache *s, char *buf)
{
unsigned long total = 0;
int x;
@@ -4159,7 +4510,7 @@ static ssize_t cpu_queues_show(struct km
kfree(cpus);
return x + sprintf(buf + x, "\n");
}
-SLAB_ATTR_RO(cpu_queues);
+SLAB_ATTR_RO(per_cpu_caches);

static ssize_t objects_show(struct kmem_cache *s, char *buf)
{
@@ -4472,14 +4823,17 @@ static struct attribute *slab_attrs[] =
&objs_per_slab_attr.attr,
&order_attr.attr,
&min_partial_attr.attr,
+ &queue_size_attr.attr,
+ &batch_size_attr.attr,
&cpu_queue_size_attr.attr,
- &cpu_batch_size_attr.attr,
+ &shared_queue_size_attr.attr,
&objects_attr.attr,
&objects_partial_attr.attr,
&total_objects_attr.attr,
&slabs_attr.attr,
&partial_attr.attr,
- &cpu_queues_attr.attr,
+ &per_cpu_caches_attr.attr,
+ &shared_caches_attr.attr,
&ctor_attr.attr,
&aliases_attr.attr,
&align_attr.attr,
@@ -4750,6 +5104,7 @@ static int __init slab_sysfs_init(void)
if (err)
printk(KERN_ERR "SLUB: Unable to add boot slab %s"
" to sysfs\n", s->name);
+ alloc_shared_caches(s);
}

while (alias_list) {
@@ -4806,6 +5161,19 @@ static void s_stop(struct seq_file *m, v
up_read(&slub_lock);
}

+static unsigned long shared_objects(struct kmem_cache *s)
+{
+ unsigned long shared = 0;
+ int n;
+ struct kmem_cache_queue **caches;
+
+ caches = shared_caches(s);
+ for (n = 0; n < s->nr_shared; n++)
+ shared += caches[n]->objects;
+
+ kfree(caches);
+ return shared;
+}
static int s_show(struct seq_file *m, void *p)
{
unsigned long nr_partials = 0;
@@ -4835,9 +5203,10 @@ static int s_show(struct seq_file *m, vo
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
nr_objs, s->size, oo_objects(s->oo),
(1 << oo_order(s->oo)));
- seq_printf(m, " : tunables %4u %4u %4u", s->queue, s->batch, 0);
+ seq_printf(m, " : tunables %4u %4u %4u", s->cpu_queue, s->batch, s->shared_queue);
+
seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
- 0UL);
+ shared_objects(s));
seq_putc(m, '\n');
return 0;
}


2010-08-17 05:52:49

by David Rientjes

Subject: Re: [S+Q3 20/23] slub: Shared cache to exploit cross cpu caching abilities.

On Tue, 3 Aug 2010, Christoph Lameter wrote:

> Strictly a performance enhancement through better tracking of objects
> that are likely still in the lowest-level cpu caches of the processors.
>
> SLAB uses one shared cache per NUMA node, or a single global one. That
> is not satisfactory for contemporary cpus, which may have multiple
> independent cpu caches per node. In that situation SLAB treats
> cache-cold objects like cache-hot objects.
>
> The shared caches of slub are instead per physical cpu cache: one shared
> cache serves all cpus attached to that cache, and shared cache content
> will not cross physical cache boundaries.
>
> The shared cache size can be configured dynamically via
> /sys/kernel/slab/<cache>/shared_queue_size
>
> The current shared cache state is available via
> cat /sys/kernel/slab/<cache>/shared_caches
>
> Shared caches are always allocated in the sizes available in the kmalloc
> array; the requested size is rounded up to the next available size.
>
> F.e. on my Dell with 8 cpus in 2 packages, in which each pair of cpus
> shares an L2 cache, I get:
>
> christoph@:/sys/kernel/slab$ cat kmalloc-64/shared_caches
> 384 C0,2=66/126 C1,3=126/126 C4,6=126/126 C5,7=66/126
> christoph@:/sys/kernel/slab$ cat kmalloc-64/per_cpu_caches
> 617 C0=54/125 C1=37/125 C2=102/125 C3=76/125 C4=81/125 C5=108/125 C6=72/125 C7=87/125
>

This explodes on the memset() in slab_alloc() because of __GFP_ZERO on my
system:

[ 1.922641] BUG: unable to handle kernel paging request at 0000007e7e581f70
[ 1.923625] IP: [<ffffffff811053ee>] slab_alloc+0x549/0x590
[ 1.923625] PGD 0
[ 1.923625] Oops: 0002 [#1] SMP
[ 1.923625] last sysfs file:
[ 1.923625] CPU 12
[ 1.923625] Modules linked in:
[ 1.923625]
[ 1.923625] Pid: 1, comm: swapper Not tainted 2.6.35-slubq #1
[ 1.923625] RIP: 0010:[<ffffffff811053ee>] [<ffffffff811053ee>] slab_alloc+0x549/0x590
[ 1.923625] RSP: 0000:ffff88047e09dd30 EFLAGS: 00010246
[ 1.923625] RAX: 0000000000000000 RBX: ffff88047fc04500 RCX: 0000000000000010
[ 1.923625] RDX: 0000000000000003 RSI: 0000000000000348 RDI: 0000007e7e581f70
[ 1.923625] RBP: ffff88047e09dde0 R08: ffff88048e200000 R09: ffffffff81ad2c70
[ 1.923625] R10: ffff88107e51fd20 R11: 0000000000000000 R12: 0000007e7e581f70
[ 1.923625] R13: 0000000000000001 R14: ffff880c7e54eb28 R15: 00000000000080d0
[ 1.923625] FS: 0000000000000000(0000) GS:ffff880c8e200000(0000) knlGS:0000000000000000
[ 1.923625] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1.923625] CR2: 0000007e7e581f70 CR3: 0000000001a04000 CR4: 00000000000006e0
[ 1.923625] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1.923625] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 1.923625] Process swapper (pid: 1, threadinfo ffff88047e09c000, task ffff88107e468000)
[ 1.923625] Stack:
[ 1.923625] ffff88047e09dd60 ffffffff81162c4d 0000000000000008 ffff88087dd5f870
[ 1.923625] <0> ffff88047e09dfd8 ffffffff81106e14 ffff88047e09dd80 ffff88107e468670
[ 1.923625] <0> ffff88107e468670 ffff88107e468000 ffff88047e09ddd0 ffff88107e468000
[ 1.923625] Call Trace:
[ 1.923625] [<ffffffff81162c4d>] ? sysfs_find_dirent+0x3f/0x58
[ 1.923625] [<ffffffff81106e14>] ? alloc_shared_caches+0x10f/0x277
[ 1.923625] [<ffffffff811060f8>] __kmalloc_node+0x78/0xa3
[ 1.923625] [<ffffffff81106e14>] alloc_shared_caches+0x10f/0x277
[ 1.923625] [<ffffffff811065e8>] ? kfree+0x85/0x8d
[ 1.923625] [<ffffffff81b09661>] slab_sysfs_init+0x96/0x10a
[ 1.923625] [<ffffffff81b095cb>] ? slab_sysfs_init+0x0/0x10a
[ 1.923625] [<ffffffff810001f9>] do_one_initcall+0x5e/0x14e
[ 1.923625] [<ffffffff81aec6bb>] kernel_init+0x178/0x202
[ 1.923625] [<ffffffff81030954>] kernel_thread_helper+0x4/0x10
[ 1.923625] [<ffffffff81aec543>] ? kernel_init+0x0/0x202
[ 1.923625] [<ffffffff81030950>] ? kernel_thread_helper+0x0/0x10
[ 1.923625] Code: 95 78 ff ff ff 4c 89 e6 48 89 df e8 13 f4 ff ff 85 c0 0f 84 44 fb ff ff ff 75 b0 9d 66 45 85 ff 79 3b 48 63 4b 14 31 c0 4c 89 e7 <f3> aa eb 2e ff 75 b0 9d 41 f7 c7 00 02 00 00 75 1e 48 c7 c7 10
[ 1.923625] RIP [<ffffffff811053ee>] slab_alloc+0x549/0x590
[ 1.923625] RSP <ffff88047e09dd30>
[ 1.923625] CR2: 0000007e7e581f70

2010-08-17 17:51:17

by Christoph Lameter

Subject: Re: [S+Q3 20/23] slub: Shared cache to exploit cross cpu caching abilities.

On Mon, 16 Aug 2010, David Rientjes wrote:

> This explodes on the memset() in slab_alloc() because of __GFP_ZERO on my
> system:

Well that seems to be because __kmalloc_node returned an invalid address.
Run with full debugging please?

2010-08-17 18:42:40

by David Rientjes

Subject: Re: [S+Q3 20/23] slub: Shared cache to exploit cross cpu caching abilities.

On Tue, 17 Aug 2010, Christoph Lameter wrote:

> > This explodes on the memset() in slab_alloc() because of __GFP_ZERO on my
> > system:
>
> Well that seems to be because __kmalloc_node returned an invalid address.
> Run with full debugging please?
>

Lots of data, so I trimmed it down to something reasonable by eliminating
reports that were very similar. (It also looks like some metadata is
getting displayed incorrectly, such as negative pids and 10-digit cpu
numbers.)

[ 14.152177] =============================================================================
[ 14.153172] BUG kmalloc-16: Object padding overwritten
[ 14.153172] -----------------------------------------------------------------------------
[ 14.153172]
[ 14.153172] INFO: 0xffff88107e595ea8-0xffff88107e595eab. First byte 0x0 instead of 0x5a
[ 14.153172] INFO: Freed in 0x7e00000000 age=18446743536838353798 cpu=0 pid=0
[ 14.153172] INFO: Slab 0xffffea0039ba3898 objects=51 new=4 fp=0x0007800000000000 flags=0xe00000000000080
[ 14.153172] INFO: Object 0xffff88107e595e60 @offset=3680
[ 14.153172]
[ 14.153172] Bytes b4 0xffff88107e595e50: 00 00 00 00 7e 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ....~...ZZZZZZZZ
[ 14.153172] Object 0xffff88107e595e60: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkkkkkkkkkk
[ 14.153172] Redzone 0xffff88107e595e70: bb bb bb bb bb bb bb bb
[ 14.153172] Padding 0xffff88107e595ea8: 00 00 00 00 5a 5a 5a 5a ....ZZZZ
[ 14.153172] Pid: 1, comm: swapper Not tainted 2.6.35-slubq #1
[ 14.153172] Call Trace:
[ 14.153172] [<ffffffff81104333>] print_trailer+0x134/0x13f
[ 14.153172] [<ffffffff811043f5>] check_bytes_and_report+0xb7/0xe8
[ 14.153172] [<ffffffff8110455e>] check_object+0x138/0x150
[ 14.153172] [<ffffffff81106f18>] ? alloc_shared_caches+0x10f/0x277
[ 14.153172] [<ffffffff811049bf>] alloc_debug_processing+0xd5/0x160
[ 14.153172] [<ffffffff811054d7>] slab_alloc+0x52e/0x590
[ 14.153172] [<ffffffff81106f18>] ? alloc_shared_caches+0x10f/0x277
[ 14.153172] [<ffffffff811061fc>] __kmalloc_node+0x78/0xa3
[ 14.153172] [<ffffffff81106f18>] alloc_shared_caches+0x10f/0x277
[ 14.153172] [<ffffffff81b09661>] slab_sysfs_init+0x96/0x10a
[ 14.153172] [<ffffffff81b095cb>] ? slab_sysfs_init+0x0/0x10a
[ 14.153172] [<ffffffff810001f9>] do_one_initcall+0x5e/0x14e
[ 14.153172] [<ffffffff81aec6bb>] kernel_init+0x178/0x202
[ 14.153172] [<ffffffff81030954>] kernel_thread_helper+0x4/0x10
[ 14.153172] [<ffffffff81aec543>] ? kernel_init+0x0/0x202
[ 14.153172] [<ffffffff81030950>] ? kernel_thread_helper+0x0/0x10
[ 14.153172] FIX kmalloc-16: Restoring 0xffff88107e595ea8-0xffff88107e595eab=0x5a

...

[ 15.751474] =============================================================================
[ 15.752467] BUG kmalloc-16: Redzone overwritten
[ 15.752467] -----------------------------------------------------------------------------
[ 15.752467]
[ 15.752467] INFO: 0xffff880c7e5f3ec0-0xffff880c7e5f3ec7. First byte 0x30 instead of 0xbb
[ 15.752467] INFO: Allocated in 0xffff88087e4f11e0 age=131909211166235 cpu=2119111312 pid=-30712
[ 15.752467] INFO: Freed in 0xffff88087e4f13f0 age=131909211165707 cpu=2119111840 pid=-30712
[ 15.752467] INFO: Slab 0xffffea002bba4d28 objects=51 new=3 fp=0x0007000000000000 flags=0xa00000000000080
[ 15.752467] INFO: Object 0xffff880c7e5f3eb0 @offset=3760
[ 15.752467]
[ 15.752467] Bytes b4 0xffff880c7e5f3ea0: 18 00 00 00 7e 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ....~...ZZZZZZZZ
[ 15.752467] Object 0xffff880c7e5f3eb0: d0 0f 4f 7e 08 88 ff ff 80 10 4f 7e 08 88 ff ff .O~....O~..
[ 15.752467] Redzone 0xffff880c7e5f3ec0: 30 11 4f 7e 08 88 ff ff 0.O~..
[ 15.752467] Padding 0xffff880c7e5f3ef8: 00 16 4f 7e 08 88 ff ff ..O~..
[ 15.752467] Pid: 1, comm: swapper Not tainted 2.6.35-slubq #1
[ 15.752467] Call Trace:
[ 15.752467] [<ffffffff81104333>] print_trailer+0x134/0x13f
[ 15.752467] [<ffffffff811043f5>] check_bytes_and_report+0xb7/0xe8
[ 15.752467] [<ffffffff81104481>] check_object+0x5b/0x150
[ 15.752467] [<ffffffff81106f18>] ? alloc_shared_caches+0x10f/0x277
[ 15.752467] [<ffffffff811049bf>] alloc_debug_processing+0xd5/0x160
[ 15.752467] [<ffffffff811054d7>] slab_alloc+0x52e/0x590
[ 15.752467] [<ffffffff81106f18>] ? alloc_shared_caches+0x10f/0x277
[ 15.752467] [<ffffffff811061fc>] __kmalloc_node+0x78/0xa3
[ 15.752467] [<ffffffff81106f18>] alloc_shared_caches+0x10f/0x277
[ 15.752467] [<ffffffff81b09661>] slab_sysfs_init+0x96/0x10a
[ 15.752467] [<ffffffff81b095cb>] ? slab_sysfs_init+0x0/0x10a
[ 15.752467] [<ffffffff810001f9>] do_one_initcall+0x5e/0x14e
[ 15.752467] [<ffffffff81aec6bb>] kernel_init+0x178/0x202
[ 15.752467] [<ffffffff81030954>] kernel_thread_helper+0x4/0x10
[ 15.752467] [<ffffffff81aec543>] ? kernel_init+0x0/0x202
[ 15.752467] [<ffffffff81030950>] ? kernel_thread_helper+0x0/0x10
[ 15.752467] FIX kmalloc-16: Restoring 0xffff880c7e5f3ec0-0xffff880c7e5f3ec7=0xbb

...

[ 15.752467] =============================================================================
[ 15.752467] BUG kmalloc-16: Pointer check fails
[ 15.752467] -----------------------------------------------------------------------------
[ 15.752467]
[ 15.752467] INFO: Allocated in 0xffff880c7e5f4080 age=131874850999735 cpu=2119539736 pid=-30704
[ 15.752467] INFO: Freed in 0xc00000000 age=18446743536838355610 cpu=4 pid=1
[ 15.752467] INFO: Slab 0xffffea002bba4d28 objects=51 new=0 fp=0x(null) flags=0xa00000000000080
[ 15.752467] INFO: Object 0xffff880c7e5f3ff0 @offset=4080
[ 15.752467]
[ 15.752467] Bytes b4 0xffff880c7e5f3fe0: 00 00 00 00 7e 00 00 00 00 00 00 00 5a 5a 5a 5a ....~.......ZZZZ
[ 15.752467] Object 0xffff880c7e5f3ff0: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ
[ 15.752467] Redzone 0xffff880c7e5f4000: 80 35 37 7e 04 88 ff ff .57~..
[ 15.752467] Padding 0xffff880c7e5f4038: 00 00 00 00 00 00 00 00 ........
[ 15.752467] Pid: 1, comm: swapper Not tainted 2.6.35-slubq #1
[ 15.752467] Call Trace:
[ 15.752467] [<ffffffff81104333>] print_trailer+0x134/0x13f
[ 15.752467] [<ffffffff81106f18>] ? alloc_shared_caches+0x10f/0x277
[ 15.752467] [<ffffffff81104858>] object_err+0x3a/0x43
[ 15.752467] [<ffffffff81106f18>] ? alloc_shared_caches+0x10f/0x277
[ 15.752467] [<ffffffff811049ad>] alloc_debug_processing+0xc3/0x160
[ 15.752467] [<ffffffff811054d7>] slab_alloc+0x52e/0x590
[ 15.752467] [<ffffffff81106f18>] ? alloc_shared_caches+0x10f/0x277
[ 15.752467] [<ffffffff811061fc>] __kmalloc_node+0x78/0xa3
[ 15.752467] [<ffffffff81106f18>] alloc_shared_caches+0x10f/0x277
[ 15.752467] [<ffffffff81b09661>] slab_sysfs_init+0x96/0x10a
[ 15.752467] [<ffffffff81b095cb>] ? slab_sysfs_init+0x0/0x10a
[ 15.752467] [<ffffffff810001f9>] do_one_initcall+0x5e/0x14e
[ 15.752467] [<ffffffff81aec6bb>] kernel_init+0x178/0x202
[ 15.752467] [<ffffffff81030954>] kernel_thread_helper+0x4/0x10
[ 15.752467] [<ffffffff81aec543>] ? kernel_init+0x0/0x202
[ 15.752467] [<ffffffff81030950>] ? kernel_thread_helper+0x0/0x10
[ 15.752467] FIX kmalloc-16: Marking all objects used

2010-08-17 18:50:39

by Christoph Lameter

Subject: Re: [S+Q3 20/23] slub: Shared cache to exploit cross cpu caching abilities.

On Tue, 17 Aug 2010, David Rientjes wrote:

> On Tue, 17 Aug 2010, Christoph Lameter wrote:
>
> > > This explodes on the memset() in slab_alloc() because of __GFP_ZERO on my
> > > system:
> >
> > Well that seems to be because __kmalloc_node returned an invalid address.
> > Run with full debugging please?
> >
>
> Lots of data, so I trimmed it down to something reasonable by eliminating
> reports that were very similar. (It also looks like some metadata is
> getting displayed incorrectly, such as negative pids and 10-digit cpu
> numbers.)

Well yes I guess that is the result of large scale corruption that is
reaching into the debug fields of the object.

> [ 15.752467]
> [ 15.752467] INFO: 0xffff880c7e5f3ec0-0xffff880c7e5f3ec7. First byte 0x30 instead of 0xbb
> [ 15.752467] INFO: Allocated in 0xffff88087e4f11e0 age=131909211166235 cpu=2119111312 pid=-30712
> [ 15.752467] INFO: Freed in 0xffff88087e4f13f0 age=131909211165707 cpu=2119111840 pid=-30712
> [ 15.752467] INFO: Slab 0xffffea002bba4d28 objects=51 new=3 fp=0x0007000000000000 flags=0xa00000000000080
> [ 15.752467] INFO: Object 0xffff880c7e5f3eb0 @offset=3760
> [ 15.752467]
> [ 15.752467] Bytes b4 0xffff880c7e5f3ea0: 18 00 00 00 7e 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ....~...ZZZZZZZZ
> [ 15.752467] Object 0xffff880c7e5f3eb0: d0 0f 4f 7e 08 88 ff ff 80 10 4f 7e 08 88 ff ff .O~....O~..
> [ 15.752467] Redzone 0xffff880c7e5f3ec0: 30 11 4f 7e 08 88 ff ff 0.O~..
> [ 15.752467] Padding 0xffff880c7e5f3ef8: 00 16 4f 7e 08 88 ff ff ..O~..

16 bytes allocated and a pointer array much larger than that is used.
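
One way to arrive at exactly that pattern, going only by the patch itself
(an unverified observation, not a confirmed diagnosis): the size rounding
in alloc_shared_caches() reads

	if (size < PAGE_SIZE / 2)
		size = get_slab(size, GFP_KERNEL)->objsize;
	else
		size = PAGE_SHIFT << get_order(size);	/* PAGE_SIZE intended? */

With PAGE_SHIFT == 12, the else branch yields size = 12 for any request
that fits within a single page, shared_cache_capacity() then underflows
computing 12 - 16, and __kmalloc_node() hands back a kmalloc-16 object
through which a much larger object[] array is subsequently written,
which would produce precisely the kmalloc-16 redzone corruption reported
above.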

2010-08-17 19:02:31

by David Rientjes

Subject: Re: [S+Q3 20/23] slub: Shared cache to exploit cross cpu caching abilities.

On Tue, 17 Aug 2010, Christoph Lameter wrote:

> Well yes I guess that is the result of large scale corruption that is
> reaching into the debug fields of the object.
>
> > [ 15.752467]
> > [ 15.752467] INFO: 0xffff880c7e5f3ec0-0xffff880c7e5f3ec7. First byte 0x30 instead of 0xbb
> > [ 15.752467] INFO: Allocated in 0xffff88087e4f11e0 age=131909211166235 cpu=2119111312 pid=-30712
> > [ 15.752467] INFO: Freed in 0xffff88087e4f13f0 age=131909211165707 cpu=2119111840 pid=-30712
> > [ 15.752467] INFO: Slab 0xffffea002bba4d28 objects=51 new=3 fp=0x0007000000000000 flags=0xa00000000000080
> > [ 15.752467] INFO: Object 0xffff880c7e5f3eb0 @offset=3760
> > [ 15.752467]
> > [ 15.752467] Bytes b4 0xffff880c7e5f3ea0: 18 00 00 00 7e 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ....~...ZZZZZZZZ
> > [ 15.752467] Object 0xffff880c7e5f3eb0: d0 0f 4f 7e 08 88 ff ff 80 10 4f 7e 08 88 ff ff .O~....O~..
> > [ 15.752467] Redzone 0xffff880c7e5f3ec0: 30 11 4f 7e 08 88 ff ff 0.O~..
> > [ 15.752467] Padding 0xffff880c7e5f3ef8: 00 16 4f 7e 08 88 ff ff ..O~..
>
> 16 bytes allocated and a pointer array much larger than that is used.
>

Since the problem persists with and without CONFIG_SLUB_DEBUG_ON, I'd
speculate that this is a problem with node scalability on my 4-node system
if this boots fine for you.

2010-08-17 19:32:57

by Christoph Lameter

Subject: Re: [S+Q3 20/23] slub: Shared cache to exploit cross cpu caching abilities.

On Tue, 17 Aug 2010, David Rientjes wrote:

> On Tue, 17 Aug 2010, Christoph Lameter wrote:
>
> > Well yes I guess that is the result of large scale corruption that is
> > reaching into the debug fields of the object.
> >
> > > [ 15.752467]
> > > [ 15.752467] INFO: 0xffff880c7e5f3ec0-0xffff880c7e5f3ec7. First byte 0x30 instead of 0xbb
> > > [ 15.752467] INFO: Allocated in 0xffff88087e4f11e0 age=131909211166235 cpu=2119111312 pid=-30712
> > > [ 15.752467] INFO: Freed in 0xffff88087e4f13f0 age=131909211165707 cpu=2119111840 pid=-30712
> > > [ 15.752467] INFO: Slab 0xffffea002bba4d28 objects=51 new=3 fp=0x0007000000000000 flags=0xa00000000000080
> > > [ 15.752467] INFO: Object 0xffff880c7e5f3eb0 @offset=3760
> > > [ 15.752467]
> > > [ 15.752467] Bytes b4 0xffff880c7e5f3ea0: 18 00 00 00 7e 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ....~...ZZZZZZZZ
> > > [ 15.752467] Object 0xffff880c7e5f3eb0: d0 0f 4f 7e 08 88 ff ff 80 10 4f 7e 08 88 ff ff .O~....O~..
> > > [ 15.752467] Redzone 0xffff880c7e5f3ec0: 30 11 4f 7e 08 88 ff ff 0.O~..
> > > [ 15.752467] Padding 0xffff880c7e5f3ef8: 00 16 4f 7e 08 88 ff ff ..O~..
> >
> > 16 bytes allocated and a pointer array much larger than that is used.
> >
>
> Since the problem persists with and without CONFIG_SLUB_DEBUG_ON, I'd
> speculate that this is a problem with node scalability on my 4-node system
> if this boots fine for you.

Looking at it. I have a fakenuma setup here that does not trigger it.
Guess I need something more real.

2010-08-18 19:32:10

by Christoph Lameter

Subject: Re: [S+Q3 20/23] slub: Shared cache to exploit cross cpu caching abilities.

On Tue, 17 Aug 2010, Christoph Lameter wrote:

> > >
> > > > [ 15.752467]
> > > > [ 15.752467] INFO: 0xffff880c7e5f3ec0-0xffff880c7e5f3ec7. First byte 0x30 instead of 0xbb
> > > > [ 15.752467] INFO: Allocated in 0xffff88087e4f11e0 age=131909211166235 cpu=2119111312 pid=-30712
> > > > [ 15.752467] INFO: Freed in 0xffff88087e4f13f0 age=131909211165707 cpu=2119111840 pid=-30712
> > > > [ 15.752467] INFO: Slab 0xffffea002bba4d28 objects=51 new=3 fp=0x0007000000000000 flags=0xa00000000000080
> > > > [ 15.752467] INFO: Object 0xffff880c7e5f3eb0 @offset=3760
> > > > [ 15.752467]
> > > > [ 15.752467] Bytes b4 0xffff880c7e5f3ea0: 18 00 00 00 7e 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ....~...ZZZZZZZZ
> > > > [ 15.752467] Object 0xffff880c7e5f3eb0: d0 0f 4f 7e 08 88 ff ff 80 10 4f 7e 08 88 ff ff .O~....O~..
> > > > [ 15.752467] Redzone 0xffff880c7e5f3ec0: 30 11 4f 7e 08 88 ff ff 0.O~..
> > > > [ 15.752467] Padding 0xffff880c7e5f3ef8: 00 16 4f 7e 08 88 ff ff ..O~..
> > >
> > > 16 bytes allocated and a pointer array much larger than that is used.
> > >
> >
> > Since the problem persists with and without CONFIG_SLUB_DEBUG_ON, I'd
> > speculate that this is a problem with node scalability on my 4-node system
> > if this boots fine for you.
>
> Looking at it. I have a fakenuma setup here that does not trigger it.
> Guess I need something more real.

Cannot reproduce it on my real 2-node NUMA system either. Trouble is
that it results in only one alien cache per cpu caching domain.