2009-06-10 20:12:27

by Pekka Enberg

[permalink] [raw]
Subject: [GIT PULL] Early boot SLAB for 2.6.31

Hi Linus,

Here are the core patches for enabling slab before the scheduler initializes
itself in the boot sequence. I added slab fallback support to the bootmem
allocator so that we don't need a flag day for switching to early slab.

I have tested this series on x86-64 with SLAB, SLUB, and SLOB. Note: the
following harmless warning appears at boot:

[ 0.000000] ------------[ cut here ]------------
[ 0.000000] WARNING: at mm/bootmem.c:535 alloc_arch_preferred_bootmem+0x31/0x56()
[ 0.000000] Hardware name:
[ 0.000000] Modules linked in:
[ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30 #472
[ 0.000000] Call Trace:
[ 0.000000] [<ffffffff809d62c3>] ? alloc_arch_preferred_bootmem+0x31/0x56
[ 0.000000] [<ffffffff8025c304>] warn_slowpath_common+0x7c/0xa9
[ 0.000000] [<ffffffff8025c345>] warn_slowpath_null+0x14/0x16
[ 0.000000] [<ffffffff809d62c3>] alloc_arch_preferred_bootmem+0x31/0x56
[ 0.000000] [<ffffffff809d6833>] ___alloc_bootmem_nopanic+0x3f/0xc9
[ 0.000000] [<ffffffff809d68ce>] ___alloc_bootmem+0x11/0x3a
[ 0.000000] [<ffffffff809d69a0>] __alloc_bootmem+0xb/0xd
[ 0.000000] [<ffffffff809d21f9>] sched_init+0x43/0x4ee
[ 0.000000] [<ffffffff809c0aae>] start_kernel+0x1cc/0x3aa
[ 0.000000] [<ffffffff809c029a>] x86_64_start_reservations+0xaa/0xae
[ 0.000000] [<ffffffff809c037f>] x86_64_start_kernel+0xe1/0xe8
[ 0.000000] ---[ end trace 4eaa2a86a8e2da22 ]---

I already have patches for that but they are against the -tip tree so I think
we ought to just merge this series to mainline and fix everything up in
subsystem trees for 2.6.31 proper.

Pekka

The following changes since commit 07a2039b8eb0af4ff464efd3dfd95de5c02648c6:
Linus Torvalds (1):
Linux 2.6.30

are available in the git repository at:

ssh://master.kernel.org/pub/scm/linux/kernel/git/penberg/slab-2.6 for-linus

Pekka Enberg (3):
bootmem: use slab if bootmem is no longer available
slab: setup allocators earlier in the boot sequence
vmalloc: use kzalloc() instead of alloc_bootmem()

init/main.c | 32 +++++++++++++--------
mm/bootmem.c | 3 ++
mm/slab.c | 85 ++++++++++++++++++++++++++++++---------------------------
mm/slub.c | 17 +++++++-----
mm/vmalloc.c | 3 +-
5 files changed, 79 insertions(+), 61 deletions(-)

diff --git a/init/main.c b/init/main.c
index d721dad..0c6f366 100644
--- a/init/main.c
+++ b/init/main.c
@@ -574,6 +574,26 @@ asmlinkage void __init start_kernel(void)
setup_nr_cpu_ids();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */

+ build_all_zonelists();
+ page_alloc_init();
+
+ printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+ parse_early_param();
+ parse_args("Booting kernel", static_command_line, __start___param,
+ __stop___param - __start___param,
+ &unknown_bootoption);
+ /*
+ * These use large bootmem allocations and must precede
+ * kmem_cache_init()
+ */
+ pidhash_init();
+ vfs_caches_init_early();
+ /*
+ * Set up kernel memory allocators
+ */
+ mem_init();
+ kmem_cache_init();
+ vmalloc_init();
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
@@ -585,13 +605,6 @@ asmlinkage void __init start_kernel(void)
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
- build_all_zonelists();
- page_alloc_init();
- printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
- parse_early_param();
- parse_args("Booting kernel", static_command_line, __start___param,
- __stop___param - __start___param,
- &unknown_bootoption);
if (!irqs_disabled()) {
printk(KERN_WARNING "start_kernel(): bug: interrupts were "
"enabled *very* early, fixing it\n");
@@ -603,7 +616,6 @@ asmlinkage void __init start_kernel(void)
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
- pidhash_init();
init_timers();
hrtimers_init();
softirq_init();
@@ -645,14 +657,10 @@ asmlinkage void __init start_kernel(void)
initrd_start = 0;
}
#endif
- vmalloc_init();
- vfs_caches_init_early();
cpuset_init_early();
page_cgroup_init();
- mem_init();
enable_debug_pagealloc();
cpu_hotplug_init();
- kmem_cache_init();
kmemtrace_init();
debug_objects_mem_init();
idr_init_cache();
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf9271..457269c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
#ifdef CONFIG_HAVE_ARCH_BOOTMEM
bootmem_data_t *p_bdata;

diff --git a/mm/slab.c b/mm/slab.c
index 9a90b00..a5b3cf4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -315,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache,
struct kmem_list3 *l3, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
int node);
-static int enable_cpucache(struct kmem_cache *cachep);
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);

/*
@@ -958,12 +958,12 @@ static void __cpuinit start_cpu_timer(int cpu)
}

static struct array_cache *alloc_arraycache(int node, int entries,
- int batchcount)
+ int batchcount, gfp_t gfp)
{
int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
struct array_cache *nc = NULL;

- nc = kmalloc_node(memsize, GFP_KERNEL, node);
+ nc = kmalloc_node(memsize, gfp, node);
if (nc) {
nc->avail = 0;
nc->limit = entries;
@@ -1003,7 +1003,7 @@ static int transfer_objects(struct array_cache *to,
#define drain_alien_cache(cachep, alien) do { } while (0)
#define reap_alien(cachep, l3) do { } while (0)

-static inline struct array_cache **alloc_alien_cache(int node, int limit)
+static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
return (struct array_cache **)BAD_ALIEN_MAGIC;
}
@@ -1034,7 +1034,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);

-static struct array_cache **alloc_alien_cache(int node, int limit)
+static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
struct array_cache **ac_ptr;
int memsize = sizeof(void *) * nr_node_ids;
@@ -1042,14 +1042,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)

if (limit > 1)
limit = 12;
- ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+ ac_ptr = kmalloc_node(memsize, gfp, node);
if (ac_ptr) {
for_each_node(i) {
if (i == node || !node_online(i)) {
ac_ptr[i] = NULL;
continue;
}
- ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+ ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
if (!ac_ptr[i]) {
for (i--; i >= 0; i--)
kfree(ac_ptr[i]);
@@ -1282,20 +1282,20 @@ static int __cpuinit cpuup_prepare(long cpu)
struct array_cache **alien = NULL;

nc = alloc_arraycache(node, cachep->limit,
- cachep->batchcount);
+ cachep->batchcount, GFP_KERNEL);
if (!nc)
goto bad;
if (cachep->shared) {
shared = alloc_arraycache(node,
cachep->shared * cachep->batchcount,
- 0xbaadf00d);
+ 0xbaadf00d, GFP_KERNEL);
if (!shared) {
kfree(nc);
goto bad;
}
}
if (use_alien_caches) {
- alien = alloc_alien_cache(node, cachep->limit);
+ alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
if (!alien) {
kfree(shared);
kfree(nc);
@@ -1399,10 +1399,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
{
struct kmem_list3 *ptr;

- ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+ ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
BUG_ON(!ptr);

- local_irq_disable();
memcpy(ptr, list, sizeof(struct kmem_list3));
/*
* Do not assume that spinlocks can be initialized via memcpy:
@@ -1411,7 +1410,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,

MAKE_ALL_LISTS(cachep, ptr, nodeid);
cachep->nodelists[nodeid] = ptr;
- local_irq_enable();
}

/*
@@ -1575,9 +1573,8 @@ void __init kmem_cache_init(void)
{
struct array_cache *ptr;

- ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

- local_irq_disable();
BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
memcpy(ptr, cpu_cache_get(&cache_cache),
sizeof(struct arraycache_init));
@@ -1587,11 +1584,9 @@ void __init kmem_cache_init(void)
spin_lock_init(&ptr->lock);

cache_cache.array[smp_processor_id()] = ptr;
- local_irq_enable();

- ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

- local_irq_disable();
BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
!= &initarray_generic.cache);
memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1603,7 +1598,6 @@ void __init kmem_cache_init(void)

malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
ptr;
- local_irq_enable();
}
/* 5) Replace the bootstrap kmem_list3's */
{
@@ -1627,7 +1621,7 @@ void __init kmem_cache_init(void)
struct kmem_cache *cachep;
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next)
- if (enable_cpucache(cachep))
+ if (enable_cpucache(cachep, GFP_NOWAIT))
BUG();
mutex_unlock(&cache_chain_mutex);
}
@@ -2064,10 +2058,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
return left_over;
}

-static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
if (g_cpucache_up == FULL)
- return enable_cpucache(cachep);
+ return enable_cpucache(cachep, gfp);

if (g_cpucache_up == NONE) {
/*
@@ -2089,7 +2083,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
g_cpucache_up = PARTIAL_AC;
} else {
cachep->array[smp_processor_id()] =
- kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ kmalloc(sizeof(struct arraycache_init), gfp);

if (g_cpucache_up == PARTIAL_AC) {
set_up_list3s(cachep, SIZE_L3);
@@ -2153,6 +2147,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
+ gfp_t gfp;

/*
* Sanity checks... these are all serious usage bugs.
@@ -2168,8 +2163,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* We use cache_chain_mutex to ensure a consistent view of
* cpu_online_mask as well. Please see cpuup_callback
*/
- get_online_cpus();
- mutex_lock(&cache_chain_mutex);
+ if (slab_is_available()) {
+ get_online_cpus();
+ mutex_lock(&cache_chain_mutex);
+ }

list_for_each_entry(pc, &cache_chain, next) {
char tmp;
@@ -2278,8 +2275,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
align = ralign;

+ if (slab_is_available())
+ gfp = GFP_KERNEL;
+ else
+ gfp = GFP_NOWAIT;
+
/* Get cache's description obj. */
- cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
+ cachep = kmem_cache_zalloc(&cache_cache, gfp);
if (!cachep)
goto oops;

@@ -2382,7 +2384,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep->ctor = ctor;
cachep->name = name;

- if (setup_cpu_cache(cachep)) {
+ if (setup_cpu_cache(cachep, gfp)) {
__kmem_cache_destroy(cachep);
cachep = NULL;
goto oops;
@@ -2394,8 +2396,10 @@ oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
- mutex_unlock(&cache_chain_mutex);
- put_online_cpus();
+ if (slab_is_available()) {
+ mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
+ }
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -3802,7 +3806,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
/*
* This initializes kmem_list3 or resizes various caches for all nodes.
*/
-static int alloc_kmemlist(struct kmem_cache *cachep)
+static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
{
int node;
struct kmem_list3 *l3;
@@ -3812,7 +3816,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
for_each_online_node(node) {

if (use_alien_caches) {
- new_alien = alloc_alien_cache(node, cachep->limit);
+ new_alien = alloc_alien_cache(node, cachep->limit, gfp);
if (!new_alien)
goto fail;
}
@@ -3821,7 +3825,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
if (cachep->shared) {
new_shared = alloc_arraycache(node,
cachep->shared*cachep->batchcount,
- 0xbaadf00d);
+ 0xbaadf00d, gfp);
if (!new_shared) {
free_alien_cache(new_alien);
goto fail;
@@ -3850,7 +3854,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
free_alien_cache(new_alien);
continue;
}
- l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+ l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
if (!l3) {
free_alien_cache(new_alien);
kfree(new_shared);
@@ -3906,18 +3910,18 @@ static void do_ccupdate_local(void *info)

/* Always called with the cache_chain_mutex held */
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared)
+ int batchcount, int shared, gfp_t gfp)
{
struct ccupdate_struct *new;
int i;

- new = kzalloc(sizeof(*new), GFP_KERNEL);
+ new = kzalloc(sizeof(*new), gfp);
if (!new)
return -ENOMEM;

for_each_online_cpu(i) {
new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
- batchcount);
+ batchcount, gfp);
if (!new->new[i]) {
for (i--; i >= 0; i--)
kfree(new->new[i]);
@@ -3944,11 +3948,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
kfree(ccold);
}
kfree(new);
- return alloc_kmemlist(cachep);
+ return alloc_kmemlist(cachep, gfp);
}

/* Called with cache_chain_mutex held always */
-static int enable_cpucache(struct kmem_cache *cachep)
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
int err;
int limit, shared;
@@ -3994,7 +3998,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
if (limit > 32)
limit = 32;
#endif
- err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
+ err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
if (err)
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
cachep->name, -err);
@@ -4300,7 +4304,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
res = 0;
} else {
res = do_tune_cpucache(cachep, limit,
- batchcount, shared);
+ batchcount, shared,
+ GFP_KERNEL);
}
break;
}
diff --git a/mm/slub.c b/mm/slub.c
index 65ffda5..0ead807 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2557,13 +2557,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
if (gfp_flags & SLUB_DMA)
flags = SLAB_CACHE_DMA;

- down_write(&slub_lock);
+ /*
+ * This function is called with IRQs disabled during early-boot on
+ * single CPU so there's no need to take slub_lock here.
+ */
if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
flags, NULL))
goto panic;

list_add(&s->list, &slab_caches);
- up_write(&slub_lock);
+
if (sysfs_slab_add(s))
goto panic;
return s;
@@ -3021,7 +3024,7 @@ void __init kmem_cache_init(void)
* kmem_cache_open for slab_state == DOWN.
*/
create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
- sizeof(struct kmem_cache_node), GFP_KERNEL);
+ sizeof(struct kmem_cache_node), GFP_NOWAIT);
kmalloc_caches[0].refcount = -1;
caches++;

@@ -3034,16 +3037,16 @@ void __init kmem_cache_init(void)
/* Caches that are not of the two-to-the-power-of size */
if (KMALLOC_MIN_SIZE <= 64) {
create_kmalloc_cache(&kmalloc_caches[1],
- "kmalloc-96", 96, GFP_KERNEL);
+ "kmalloc-96", 96, GFP_NOWAIT);
caches++;
create_kmalloc_cache(&kmalloc_caches[2],
- "kmalloc-192", 192, GFP_KERNEL);
+ "kmalloc-192", 192, GFP_NOWAIT);
caches++;
}

for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
create_kmalloc_cache(&kmalloc_caches[i],
- "kmalloc", 1 << i, GFP_KERNEL);
+ "kmalloc", 1 << i, GFP_NOWAIT);
caches++;
}

@@ -3080,7 +3083,7 @@ void __init kmem_cache_init(void)
/* Provide the correct kmalloc names now that the caches are up */
for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
kmalloc_caches[i]. name =
- kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+ kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);

#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716e..3235138 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,7 +23,6 @@
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
-#include <linux/bootmem.h>
#include <linux/pfn.h>

#include <asm/atomic.h>
@@ -1032,7 +1031,7 @@ void __init vmalloc_init(void)

/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
- va = alloc_bootmem(sizeof(struct vmap_area));
+ va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
va->flags = tmp->flags | VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;


2009-06-10 20:34:11

by Linus Torvalds

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31



On Wed, 10 Jun 2009, Pekka J Enberg wrote:
>
> I already have patches for that but they are against the -tip tree so I think
> we ought to just merge this series to mainline and fix everything up in
> subsystem trees for 2.6.31 proper.

Hmm. Are there any reasons why the scheduler fixups can't go in this
series? Do they depend on other things in -tip?

Linus

2009-06-10 20:34:46

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31

Linus Torvalds wrote:
>
> On Wed, 10 Jun 2009, Pekka J Enberg wrote:
>> I already have patches for that but they are against the -tip tree so I think
>> we ought to just merge this series to mainline and fix everything up in
>> subsystem trees for 2.6.31 proper.
>
> Hmm. Are there any reasons why the scheduler fixups can't go in this
> series? Do they depend on other things in -tip?

The patches are rebased to -tip, yeah. I can do a version against your
tree if you want but that will mean merge conflicts for Ingo. Hmm?

Pekka

2009-06-10 20:34:59

by Linus Torvalds

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31



On Wed, 10 Jun 2009, Linus Torvalds wrote:
>
> Hmm. Are there any reasons why the scheduler fixups can't go in this
> series? Do they depend on other things in -tip?

[ .. because otherwise we'll inevitably just get unnecessary bug-reports
about this all - harmless or not, that message would be annoying and
certainly cause people to panic. ]

Linus

2009-06-10 20:43:42

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31


* Pekka Enberg <[email protected]> wrote:

> Linus Torvalds wrote:
>>
>> On Wed, 10 Jun 2009, Pekka J Enberg wrote:
>
>>> I already have patches for that but they are against the -tip
>>> tree so I think we ought to just merge this series to mainline
>>> and fix everything up in subsystem trees for 2.6.31 proper.
>>
>> Hmm. Are there any reasons why the scheduler fixups can't go in
>> this series? Do they depend on other things in -tip?
>
> The patches are rebased to -tip, yeah. I can do a version against
> your tree if you want but that will mean merge conflicts for Ingo.
> Hmm?

I'm a tiny bit nervous about the tested-ness of the patches. Such
stuff rarely works on the first try. But these are obviously nice changes.

What kind of conflicts are there against -tip? The diffstat suggests
it's mostly in-SLAB code, right? There shouldn't be much to conflict,
except kmemcheck - which has more or less trivial callbacks there.

Ingo

2009-06-10 20:47:21

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31

On Wed, Jun 10, 2009 at 11:43 PM, Ingo Molnar<[email protected]> wrote:
>
> * Pekka Enberg <[email protected]> wrote:
>
>> Linus Torvalds wrote:
>>>
>>> On Wed, 10 Jun 2009, Pekka J Enberg wrote:
>>
>>>> I already have patches for that but they are against the -tip
>>>> tree so I think we ought to just merge this series to mainline
>>>> and fix everything up in subsystem trees for 2.6.31 proper.
>>>
>>> Hmm. Are there any reasons why the scheduler fixups can't go in
>>> this series? Do they depend on other things in -tip?
>>
>> The patches are rebased to -tip, yeah. I can do a version against
>> your tree if you want but that will mean merge conflicts for Ingo.
>> Hmm?
>
> I'm a tiny bit nervous about the tested-ness of the patches. Such
> stuff rarely works at first try. But it's obviously nice changes.

Yeah, I was thinking of sitting on them until 2.6.32 and putting them into
linux-next after the merge window closes. But Linus seems to want them
and with the fallback in place, we can probably fix any fallout quite
easily.

> What kind of conflicts are there against -tip? The diffstat suggests
> it's mostly in-SLAB code, right? There shouldn't be much to conflict,
> except kmemcheck - which has more or less trivial callbacks there.

The conflicting bits are the patches that remove bootmem allocator
uses in arch/x86 and kernel/sched.c.

Pekka

2009-06-10 20:51:41

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31


* Pekka Enberg <[email protected]> wrote:

> On Wed, Jun 10, 2009 at 11:43 PM, Ingo Molnar<[email protected]> wrote:
> >
> > * Pekka Enberg <[email protected]> wrote:
> >
> >> Linus Torvalds wrote:
> >>>
> >>> On Wed, 10 Jun 2009, Pekka J Enberg wrote:
> >>
> >>>> I already have patches for that but they are against the -tip
> >>>> tree so I think we ought to just merge this series to mainline
> >>>> and fix everything up in subsystem trees for 2.6.31 proper.
> >>>
> >>> Hmm. Are there any reasons why the scheduler fixups can't go in
> >>> this series? Do they depend on other things in -tip?
> >>
> >> The patches are rebased to -tip, yeah. I can do a version against
> >> your tree if you want but that will mean merge conflicts for Ingo.
> >> Hmm?
> >
> > I'm a tiny bit nervous about the tested-ness of the patches. Such
> > stuff rarely works at first try. But it's obviously nice changes.
>
> Yeah, I was thinking of sitting on them until 2.6.32 and put them
> into linux-next after the merge window closes. [...]

Nah, that would be unreasonably long.

> [...] But Linus seems to want them and with the fallback in place,
> we can probably fix any fallout quite easily.

Yeah.

> > What kind of conflicts are there against -tip? The diffstat
> > suggests it's mostly in-SLAB code, right? There shouldn't be much
> > to conflict, except kmemcheck - which has more or less trivial
> > callbacks there.
>
> The conflicting bits are the patches that remove bootmem allocator
> uses in arch/x86 and kernel/sched.c.

Give me an hour and i'll get some minimal testing done.

Ingo

2009-06-10 20:57:25

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31

Hi Linus,

On Wed, 10 Jun 2009, Linus Torvalds wrote:
> > Hmm. Are there any reasons why the scheduler fixups can't go in this
> > series? Do they depend on other things in -tip?
>
> [ .. because otherwise we'll inevitably just get unnecessary bug-reports
> about this all - harmless or not, that message would be annoying and
> certainly cause people to panic. ]

OK, so I merged and pushed these two patches to the 'for-linus' branch of my
tree and I don't see any more warnings on boot.

Pekka

>From b38af0a398ec80c04769eb322dc92e1833481483 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <[email protected]>
Date: Wed, 10 Jun 2009 23:42:36 +0300
Subject: [PATCH] sched: use kzalloc() instead of the bootmem allocator

Now that kmem_cache_init() happens before sched_init(), we should use kzalloc()
and not the bootmem allocator.

Signed-off-by: Pekka Enberg <[email protected]>
---
kernel/sched.c | 20 ++++++++------------
1 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa47..cb01c5c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -68,7 +68,6 @@
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
-#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
@@ -7525,21 +7524,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)

static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
+
memset(rd, 0, sizeof(*rd));

- if (bootmem) {
- alloc_bootmem_cpumask_var(&def_root_domain.span);
- alloc_bootmem_cpumask_var(&def_root_domain.online);
- alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
- cpupri_init(&rd->cpupri, true);
- return 0;
- }
+ if (bootmem)
+ gfp = GFP_NOWAIT;

- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->span, gfp))
goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->online, gfp))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->rto_mask, gfp))
goto free_online;

if (cpupri_init(&rd->cpupri, false) != 0)
@@ -8865,7 +8861,7 @@ void __init sched_init(void)
* we use alloc_bootmem().
*/
if (alloc_size) {
- ptr = (unsigned long)alloc_bootmem(alloc_size);
+ ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.se = (struct sched_entity **)ptr;
--
1.6.0.4

>From dbf86f2ba459e3b070c6d74aad1e6b115171dd47 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <[email protected]>
Date: Wed, 10 Jun 2009 23:53:37 +0300
Subject: [PATCH] vt: use kzalloc() instead of the bootmem allocator

Now that kmem_cache_init() happens before console_init(), we should use
kzalloc() and not the bootmem allocator.

Signed-off-by: Pekka Enberg <[email protected]>
---
drivers/char/vt.c | 8 ++------
1 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 08151d4..c796a86 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -95,7 +95,6 @@
#include <linux/timer.h>
#include <linux/interrupt.h>
#include <linux/workqueue.h>
-#include <linux/bootmem.h>
#include <linux/pm.h>
#include <linux/font.h>
#include <linux/bitops.h>
@@ -2875,14 +2874,11 @@ static int __init con_init(void)
mod_timer(&console_timer, jiffies + blankinterval);
}

- /*
- * kmalloc is not running yet - we use the bootmem allocator.
- */
for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) {
- vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data));
+ vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT);
INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
visual_init(vc, currcons, 1);
- vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size);
+ vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT);
vc->vc_kmalloced = 0;
vc_init(vc, vc->vc_rows, vc->vc_cols,
currcons || !vc->vc_sw->con_save_screen);
--
1.6.0.4

2009-06-10 20:58:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31


* Ingo Molnar <[email protected]> wrote:

> > > What kind of conflicts are there against -tip? The diffstat
> > > suggests it's mostly in-SLAB code, right? There shouldnt be
> > > much to conflict, except kmemcheck - which has more or less
> > > trivial callbacks there.
> >
> > The conflicting bits are the patches that remove bootmem
> > allocator uses in arch/x86 and kernel/sched.c.
>
> Give me an hour and i'll get some minimal testing done.

This tree doesn't conflict (not even with kmemcheck) - and the older
bits you sent against the scheduler and against x86 don't apply
anymore - but they do look scary.

How about this: i can send the scheduler and x86 bits to Linus right
now, that should make it possible to have a clean base for you and
no interactions with anything pending?

Ingo

2009-06-10 21:00:38

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31

On Wed, 10 Jun 2009, Ingo Molnar wrote:
> > > What kind of conflicts are there against -tip? The diffstat
> > > suggests it's mostly in-SLAB code, right? There shouldnt be much
> > > to conflict, except kmemcheck - which has more or less trivial
> > > callbacks there.
> >
> > The conflicting bits are the patches that remove bootmem allocator
> > uses in arch/x86 and kernel/sched.c.
>
> Give me an hour and i'll get some minimal testing done.

Thanks! By the time you're done, I am hopefully in deep sleep (stupid
time-zones!) dreaming of better kernels. So I won't be able to send a new
pull request until tomorrow morning. All the patches are in the 'for-linus'
and 'topic/slab/earlyboot' branches, and the -tip ones (which are not rebased
on top of this series) are in 'topic/slab/earlyboot-topic', in case you're
interested.

Pekka

2009-06-10 21:01:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31


* Pekka J Enberg <[email protected]> wrote:

> Hi Linus,
>
> Here are the core patches for enabling slab before the scheduler initializes
> itself in the boot sequence. I added slab fallback support to the bootmem
> allocator so that we don't need a flag day for switching to early slab.
>
> I have tested this series on x86-64 with SLAB, SLUB, and SLOB. Note: the
> following harmless warning appears at boot:
>
> [ 0.000000] ------------[ cut here ]------------
> [ 0.000000] WARNING: at mm/bootmem.c:535 alloc_arch_preferred_bootmem+0x31/0x56()
> [ 0.000000] Hardware name:
> [ 0.000000] Modules linked in:
> [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30 #472
> [ 0.000000] Call Trace:
> [ 0.000000] [<ffffffff809d62c3>] ? alloc_arch_preferred_bootmem+0x31/0x56
> [ 0.000000] [<ffffffff8025c304>] warn_slowpath_common+0x7c/0xa9
> [ 0.000000] [<ffffffff8025c345>] warn_slowpath_null+0x14/0x16
> [ 0.000000] [<ffffffff809d62c3>] alloc_arch_preferred_bootmem+0x31/0x56
> [ 0.000000] [<ffffffff809d6833>] ___alloc_bootmem_nopanic+0x3f/0xc9
> [ 0.000000] [<ffffffff809d68ce>] ___alloc_bootmem+0x11/0x3a
> [ 0.000000] [<ffffffff809d69a0>] __alloc_bootmem+0xb/0xd
> [ 0.000000] [<ffffffff809d21f9>] sched_init+0x43/0x4ee
> [ 0.000000] [<ffffffff809c0aae>] start_kernel+0x1cc/0x3aa
> [ 0.000000] [<ffffffff809c029a>] x86_64_start_reservations+0xaa/0xae
> [ 0.000000] [<ffffffff809c037f>] x86_64_start_kernel+0xe1/0xe8
> [ 0.000000] ---[ end trace 4eaa2a86a8e2da22 ]---

Yeah, i got this too:

[ 0.004000] spurious 8259A interrupt: IRQ7.
[ 0.004000] ------------[ cut here ]------------
[ 0.004000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
[ 0.004000] Hardware name: System Product Name
[ 0.004000] Modules linked in:
[ 0.004000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02102-g994fdea-dirty #52083
[ 0.004000] Call Trace:
[ 0.004000] [<ffffffff81d699bb>] ? alloc_arch_preferred_bootmem+0x40/0x7e
[ 0.004000] [<ffffffff81078931>] warn_slowpath_common+0x8d/0xd0
[ 0.004000] [<ffffffff8107899b>] warn_slowpath_null+0x27/0x3d
[ 0.004000] [<ffffffff81d699bb>] alloc_arch_preferred_bootmem+0x40/0x7e
[ 0.004000] [<ffffffff81079992>] ? vprintk+0x2d6/0x31b
[ 0.004000] [<ffffffff81d6a036>] ___alloc_bootmem_nopanic+0x4e/0xec
[ 0.004000] [<ffffffff81d6a0f4>] ___alloc_bootmem+0x20/0x61
[ 0.004000] [<ffffffff81053dc8>] ? default_spin_lock_flags+0x1e/0x36
[ 0.004000] [<ffffffff81d6a28b>] __alloc_bootmem+0x1e/0x34
[ 0.004000] [<ffffffff817971cf>] vgacon_scrollback_startup+0x3d/0xa4
[ 0.004000] [<ffffffff8103ad11>] ? native_io_delay+0xd/0x58
[ 0.004000] [<ffffffff813d3545>] vgacon_startup+0x38f/0x3be
[ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
[ 0.004000] [<ffffffff81d79275>] con_init+0x2e/0x246
[ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
[ 0.004000] [<ffffffff81d789aa>] console_init+0x28/0x50
[ 0.004000] [<ffffffff810530f5>] ? native_irq_enable+0xb/0xc
[ 0.004000] [<ffffffff81d4eead>] start_kernel+0x20e/0x35b
[ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
[ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
[ 0.004000] [<ffffffff81d4e2b2>] x86_64_start_reservations+0xb9/0xd4
[ 0.004000] [<ffffffff81d4e000>] ? __init_begin+0x0/0x140
[ 0.004000] [<ffffffff81d4e3d1>] x86_64_start_kernel+0x104/0x127
[ 0.004000] ---[ end trace a7919e7f17c0a725 ]---
[ 0.004000] Console: colour VGA+ 80x25
[ 0.004000] console handover: boot [earlyser0] -> real [ttyS0]

box booted up fine otherwise.

Ingo

2009-06-10 21:02:15

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31

Ingo Molnar wrote:
> * Pekka J Enberg <[email protected]> wrote:
>
>> Hi Linus,
>>
>> Here are the core patches for enabling slab before the scheduler initializes
>> itself in the boot sequence. I added slab fallback support to the bootmem
>> allocator so that we don't need a flag day for switching to early slab.
>>
>> I have tested this series on x86-64 with SLAB, SLUB, and SLOB. Note: the
>> following harmless warning appears at boot:
>>
>> [ 0.000000] ------------[ cut here ]------------
>> [ 0.000000] WARNING: at mm/bootmem.c:535 alloc_arch_preferred_bootmem+0x31/0x56()
>> [ 0.000000] Hardware name:
>> [ 0.000000] Modules linked in:
>> [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30 #472
>> [ 0.000000] Call Trace:
>> [ 0.000000] [<ffffffff809d62c3>] ? alloc_arch_preferred_bootmem+0x31/0x56
>> [ 0.000000] [<ffffffff8025c304>] warn_slowpath_common+0x7c/0xa9
>> [ 0.000000] [<ffffffff8025c345>] warn_slowpath_null+0x14/0x16
>> [ 0.000000] [<ffffffff809d62c3>] alloc_arch_preferred_bootmem+0x31/0x56
>> [ 0.000000] [<ffffffff809d6833>] ___alloc_bootmem_nopanic+0x3f/0xc9
>> [ 0.000000] [<ffffffff809d68ce>] ___alloc_bootmem+0x11/0x3a
>> [ 0.000000] [<ffffffff809d69a0>] __alloc_bootmem+0xb/0xd
>> [ 0.000000] [<ffffffff809d21f9>] sched_init+0x43/0x4ee
>> [ 0.000000] [<ffffffff809c0aae>] start_kernel+0x1cc/0x3aa
>> [ 0.000000] [<ffffffff809c029a>] x86_64_start_reservations+0xaa/0xae
>> [ 0.000000] [<ffffffff809c037f>] x86_64_start_kernel+0xe1/0xe8
>> [ 0.000000] ---[ end trace 4eaa2a86a8e2da22 ]---
>
> Yeah, i got this too:
>
> [ 0.004000] spurious 8259A interrupt: IRQ7.
> [ 0.004000] ------------[ cut here ]------------
> [ 0.004000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
> [ 0.004000] Hardware name: System Product Name
> [ 0.004000] Modules linked in:
> [ 0.004000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02102-g994fdea-dirty #52083
> [ 0.004000] Call Trace:
> [ 0.004000] [<ffffffff81d699bb>] ? alloc_arch_preferred_bootmem+0x40/0x7e
> [ 0.004000] [<ffffffff81078931>] warn_slowpath_common+0x8d/0xd0
> [ 0.004000] [<ffffffff8107899b>] warn_slowpath_null+0x27/0x3d
> [ 0.004000] [<ffffffff81d699bb>] alloc_arch_preferred_bootmem+0x40/0x7e
> [ 0.004000] [<ffffffff81079992>] ? vprintk+0x2d6/0x31b
> [ 0.004000] [<ffffffff81d6a036>] ___alloc_bootmem_nopanic+0x4e/0xec
> [ 0.004000] [<ffffffff81d6a0f4>] ___alloc_bootmem+0x20/0x61
> [ 0.004000] [<ffffffff81053dc8>] ? default_spin_lock_flags+0x1e/0x36
> [ 0.004000] [<ffffffff81d6a28b>] __alloc_bootmem+0x1e/0x34
> [ 0.004000] [<ffffffff817971cf>] vgacon_scrollback_startup+0x3d/0xa4
> [ 0.004000] [<ffffffff8103ad11>] ? native_io_delay+0xd/0x58
> [ 0.004000] [<ffffffff813d3545>] vgacon_startup+0x38f/0x3be
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d79275>] con_init+0x2e/0x246
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d789aa>] console_init+0x28/0x50
> [ 0.004000] [<ffffffff810530f5>] ? native_irq_enable+0xb/0xc
> [ 0.004000] [<ffffffff81d4eead>] start_kernel+0x20e/0x35b
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d4e2b2>] x86_64_start_reservations+0xb9/0xd4
> [ 0.004000] [<ffffffff81d4e000>] ? __init_begin+0x0/0x140
> [ 0.004000] [<ffffffff81d4e3d1>] x86_64_start_kernel+0x104/0x127
> [ 0.004000] ---[ end trace a7919e7f17c0a725 ]---
> [ 0.004000] Console: colour VGA+ 80x25
> [ 0.004000] console handover: boot [earlyser0] -> real [ttyS0]
>
> box booted up fine otherwise.

Heh, Andrew did complain that the warning is cryptic! It's just an
indication that someone tried to do a bootmem allocation after slab was
set up and we switched to kzalloc() under the hood. In the scheduler
case, it's completely harmless.

I sent some patches out to fix that and the init_console() one.

Pekka

2009-06-10 21:03:31

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31

Ingo Molnar wrote:
> How about this: i can send the scheduler and x86 bits to Linus right
> now, that should make it possible to have a clean base for you and
> no interactions with anything pending?

Sure, we can do that too. Whatever works for you guys, it's not a huge
deal for me to rebase the series.

Pekka

2009-06-10 21:04:35

by Yinghai Lu

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31

Ingo Molnar wrote:
> * Pekka J Enberg <[email protected]> wrote:
>
>> Hi Linus,
>>
>> Here are the core patches for enabling slab before the scheduler initializes
>> itself in the boot sequence. I added slab fallback support to the bootmem
>> allocator so that we don't need a flag day for switching to early slab.
>>
>> I have tested this series on x86-64 with SLAB, SLUB, and SLOB. Note: the
>> following harmless warning appears at boot:
>>
>> [ 0.000000] ------------[ cut here ]------------
>> [ 0.000000] WARNING: at mm/bootmem.c:535 alloc_arch_preferred_bootmem+0x31/0x56()
>> [ 0.000000] Hardware name:
>> [ 0.000000] Modules linked in:
>> [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30 #472
>> [ 0.000000] Call Trace:
>> [ 0.000000] [<ffffffff809d62c3>] ? alloc_arch_preferred_bootmem+0x31/0x56
>> [ 0.000000] [<ffffffff8025c304>] warn_slowpath_common+0x7c/0xa9
>> [ 0.000000] [<ffffffff8025c345>] warn_slowpath_null+0x14/0x16
>> [ 0.000000] [<ffffffff809d62c3>] alloc_arch_preferred_bootmem+0x31/0x56
>> [ 0.000000] [<ffffffff809d6833>] ___alloc_bootmem_nopanic+0x3f/0xc9
>> [ 0.000000] [<ffffffff809d68ce>] ___alloc_bootmem+0x11/0x3a
>> [ 0.000000] [<ffffffff809d69a0>] __alloc_bootmem+0xb/0xd
>> [ 0.000000] [<ffffffff809d21f9>] sched_init+0x43/0x4ee
>> [ 0.000000] [<ffffffff809c0aae>] start_kernel+0x1cc/0x3aa
>> [ 0.000000] [<ffffffff809c029a>] x86_64_start_reservations+0xaa/0xae
>> [ 0.000000] [<ffffffff809c037f>] x86_64_start_kernel+0xe1/0xe8
>> [ 0.000000] ---[ end trace 4eaa2a86a8e2da22 ]---
>
> Yeah, i got this too:
>
> [ 0.004000] spurious 8259A interrupt: IRQ7.
> [ 0.004000] ------------[ cut here ]------------
> [ 0.004000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
> [ 0.004000] Hardware name: System Product Name
> [ 0.004000] Modules linked in:
> [ 0.004000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02102-g994fdea-dirty #52083
> [ 0.004000] Call Trace:
> [ 0.004000] [<ffffffff81d699bb>] ? alloc_arch_preferred_bootmem+0x40/0x7e
> [ 0.004000] [<ffffffff81078931>] warn_slowpath_common+0x8d/0xd0
> [ 0.004000] [<ffffffff8107899b>] warn_slowpath_null+0x27/0x3d
> [ 0.004000] [<ffffffff81d699bb>] alloc_arch_preferred_bootmem+0x40/0x7e
> [ 0.004000] [<ffffffff81079992>] ? vprintk+0x2d6/0x31b
> [ 0.004000] [<ffffffff81d6a036>] ___alloc_bootmem_nopanic+0x4e/0xec
> [ 0.004000] [<ffffffff81d6a0f4>] ___alloc_bootmem+0x20/0x61
> [ 0.004000] [<ffffffff81053dc8>] ? default_spin_lock_flags+0x1e/0x36
> [ 0.004000] [<ffffffff81d6a28b>] __alloc_bootmem+0x1e/0x34
> [ 0.004000] [<ffffffff817971cf>] vgacon_scrollback_startup+0x3d/0xa4
> [ 0.004000] [<ffffffff8103ad11>] ? native_io_delay+0xd/0x58
> [ 0.004000] [<ffffffff813d3545>] vgacon_startup+0x38f/0x3be
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d79275>] con_init+0x2e/0x246
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d789aa>] console_init+0x28/0x50
> [ 0.004000] [<ffffffff810530f5>] ? native_irq_enable+0xb/0xc
> [ 0.004000] [<ffffffff81d4eead>] start_kernel+0x20e/0x35b
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d4e2b2>] x86_64_start_reservations+0xb9/0xd4
> [ 0.004000] [<ffffffff81d4e000>] ? __init_begin+0x0/0x140
> [ 0.004000] [<ffffffff81d4e3d1>] x86_64_start_kernel+0x104/0x127
> [ 0.004000] ---[ end trace a7919e7f17c0a725 ]---
> [ 0.004000] Console: colour VGA+ 80x25
> [ 0.004000] console handover: boot [earlyser0] -> real [ttyS0]
>
> box booted up fine otherwise.

I've got several patches that clean up those warnings. I will put them in git for you to pick up.

YH

2009-06-10 21:11:41

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31

Hi Ingo,

Ingo Molnar wrote:
>>>> What kind of conflicts are there against -tip? The diffstat
>>>> suggests it's mostly in-SLAB code, right? There shouldnt be
>>>> much to conflict, except kmemcheck - which has more or less
>>>> trivial callbacks there.
>>> The conflicting bits are the patches that remove bootmem
>>> allocator uses in arch/x86 and kernel/sched.c.
>> Give me an hour and i'll get some minimal testing done.
>
> This tree doesnt conflict (not even with kmecheck) - and the older
> bits you sent against the scheduler and against x86 doesnt apply
> anymore - but they do look scary.

Btw, yeah, it doesn't conflict because I dropped the problematic patches
and did the bootmem fallback instead.

But now you know why I tried to push all this to -tip. Your tree is
moving so fast that it's difficult to generate patches that apply to
both, -tip and mainline, in this particular area :-).

Pekka

2009-06-10 21:12:18

by Yinghai Lu

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31

Ingo Molnar wrote:
> * Pekka J Enberg <[email protected]> wrote:
>
>> Hi Linus,
>>
>> Here are the core patches for enabling slab before the scheduler initializes
>> itself in the boot sequence. I added slab fallback support to the bootmem
>> allocator so that we don't need a flag day for switching to early slab.
>>
>> I have tested this series on x86-64 with SLAB, SLUB, and SLOB. Note: the
>> following harmless warning appears at boot:
>>
>> [ 0.000000] ------------[ cut here ]------------
>> [ 0.000000] WARNING: at mm/bootmem.c:535 alloc_arch_preferred_bootmem+0x31/0x56()
>> [ 0.000000] Hardware name:
>> [ 0.000000] Modules linked in:
>> [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30 #472
>> [ 0.000000] Call Trace:
>> [ 0.000000] [<ffffffff809d62c3>] ? alloc_arch_preferred_bootmem+0x31/0x56
>> [ 0.000000] [<ffffffff8025c304>] warn_slowpath_common+0x7c/0xa9
>> [ 0.000000] [<ffffffff8025c345>] warn_slowpath_null+0x14/0x16
>> [ 0.000000] [<ffffffff809d62c3>] alloc_arch_preferred_bootmem+0x31/0x56
>> [ 0.000000] [<ffffffff809d6833>] ___alloc_bootmem_nopanic+0x3f/0xc9
>> [ 0.000000] [<ffffffff809d68ce>] ___alloc_bootmem+0x11/0x3a
>> [ 0.000000] [<ffffffff809d69a0>] __alloc_bootmem+0xb/0xd
>> [ 0.000000] [<ffffffff809d21f9>] sched_init+0x43/0x4ee
>> [ 0.000000] [<ffffffff809c0aae>] start_kernel+0x1cc/0x3aa
>> [ 0.000000] [<ffffffff809c029a>] x86_64_start_reservations+0xaa/0xae
>> [ 0.000000] [<ffffffff809c037f>] x86_64_start_kernel+0xe1/0xe8
>> [ 0.000000] ---[ end trace 4eaa2a86a8e2da22 ]---
>
> Yeah, i got this too:
>
> [ 0.004000] spurious 8259A interrupt: IRQ7.
> [ 0.004000] ------------[ cut here ]------------
> [ 0.004000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
> [ 0.004000] Hardware name: System Product Name
> [ 0.004000] Modules linked in:
> [ 0.004000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02102-g994fdea-dirty #52083
> [ 0.004000] Call Trace:
> [ 0.004000] [<ffffffff81d699bb>] ? alloc_arch_preferred_bootmem+0x40/0x7e
> [ 0.004000] [<ffffffff81078931>] warn_slowpath_common+0x8d/0xd0
> [ 0.004000] [<ffffffff8107899b>] warn_slowpath_null+0x27/0x3d
> [ 0.004000] [<ffffffff81d699bb>] alloc_arch_preferred_bootmem+0x40/0x7e
> [ 0.004000] [<ffffffff81079992>] ? vprintk+0x2d6/0x31b
> [ 0.004000] [<ffffffff81d6a036>] ___alloc_bootmem_nopanic+0x4e/0xec
> [ 0.004000] [<ffffffff81d6a0f4>] ___alloc_bootmem+0x20/0x61
> [ 0.004000] [<ffffffff81053dc8>] ? default_spin_lock_flags+0x1e/0x36
> [ 0.004000] [<ffffffff81d6a28b>] __alloc_bootmem+0x1e/0x34
> [ 0.004000] [<ffffffff817971cf>] vgacon_scrollback_startup+0x3d/0xa4
> [ 0.004000] [<ffffffff8103ad11>] ? native_io_delay+0xd/0x58
> [ 0.004000] [<ffffffff813d3545>] vgacon_startup+0x38f/0x3be
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d79275>] con_init+0x2e/0x246
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d789aa>] console_init+0x28/0x50
> [ 0.004000] [<ffffffff810530f5>] ? native_irq_enable+0xb/0xc
> [ 0.004000] [<ffffffff81d4eead>] start_kernel+0x20e/0x35b
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d4e140>] ? early_idt_handler+0x0/0x71
> [ 0.004000] [<ffffffff81d4e2b2>] x86_64_start_reservations+0xb9/0xd4
> [ 0.004000] [<ffffffff81d4e000>] ? __init_begin+0x0/0x140
> [ 0.004000] [<ffffffff81d4e3d1>] x86_64_start_kernel+0x104/0x127
> [ 0.004000] ---[ end trace a7919e7f17c0a725 ]---
> [ 0.004000] Console: colour VGA+ 80x25
> [ 0.004000] console handover: boot [earlyser0] -> real [ttyS0]
>
> box booted up fine otherwise.
>
git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-2.6-yinghai.git

See the last 10 or so commits there.

YH

2009-06-11 00:55:09

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL] Early boot SLAB for 2.6.31


* Pekka Enberg <[email protected]> wrote:

> Hi Ingo,
>
> Ingo Molnar wrote:
>>>>> What kind of conflicts are there against -tip? The diffstat
>>>>> suggests it's mostly in-SLAB code, right? There shouldnt be much
>>>>> to conflict, except kmemcheck - which has more or less trivial
>>>>> callbacks there.
>>>> The conflicting bits are the patches that remove bootmem allocator
>>>> uses in arch/x86 and kernel/sched.c.
>>> Give me an hour and i'll get some minimal testing done.
>>
>> This tree doesnt conflict (not even with kmecheck) - and the older
>> bits you sent against the scheduler and against x86 doesnt apply
>> anymore - but they do look scary.
>
> Btw, yeah, it doesn't conflict because I dropped the problematic patches
> and did the bootmem fallback instead.
>
> But now you know why I tried to push all this to -tip. Your tree
> is moving so fast that it's difficult to generate patches that
> apply to both, -tip and mainline, in this particular area :-).

Hey, i'd agree normally, but the scheduler tree was very quiet in
this cycle, for a change :-)

The main "problem" here really is the multi-tree impact of such
broad changes. Those are best kept in a tree like -mm, which goes on
top of all other trees and is thus basically the only tree that can
do tree-wide changes.

Anyway, if you rebase to latest -git it should be fine - Linus
pulled the scheduler and x86 bits.

Ingo

2009-06-11 11:17:18

by Pekka Enberg

[permalink] [raw]
Subject: [GIT PULL v2] Early boot SLAB for 2.6.31

Hi Linus,

The following changes since commit 991ec02cdca33b03a132a0cacfe6f0aa0be9aa8d:
Linus Torvalds (1):
Merge branch 'tracing-urgent-for-linus' of git://git.kernel.org/.../tip/linux-2.6-tip

are available in the git repository at:

ssh://master.kernel.org/pub/scm/linux/kernel/git/penberg/slab-2.6 for-linus

Pekka Enberg (9):
bootmem: use slab if bootmem is no longer available
slab: setup allocators earlier in the boot sequence
vmalloc: use kzalloc() instead of alloc_bootmem()
sched: use kzalloc() instead of the bootmem allocator
vt: use kzalloc() instead of the bootmem allocator
Merge commit 'linus/master' into topic/slab/earlyboot
bootmem: fix slab fallback on numa
sched: use alloc_cpumask_var() instead of alloc_bootmem_cpumask_var()
sched: use slab in cpupri_init()

Yinghai Lu (3):
x86: remove some alloc_bootmem_cpumask_var calling
irq/cpumask: make memoryless node zero happy
memcg: don't use bootmem allocator in setup code

arch/x86/kernel/apic/io_apic.c | 6 ++-
drivers/char/vt.c | 8 +---
include/linux/irq.h | 18 +++-----
init/main.c | 32 +++++++++------
kernel/cpuset.c | 2 +-
kernel/irq/handle.c | 9 ++--
kernel/profile.c | 6 ---
kernel/sched.c | 30 ++++++--------
kernel/sched_cpupri.c | 8 ++-
lib/cpumask.c | 11 +----
mm/bootmem.c | 12 ++++++
mm/page_cgroup.c | 12 ++++--
mm/slab.c | 85 +++++++++++++++++++++-------------------
mm/slub.c | 17 +++++---
mm/vmalloc.c | 3 +-
15 files changed, 135 insertions(+), 124 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1946fac..94605e7 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -177,16 +177,18 @@ int __init arch_early_irq_init(void)
struct irq_cfg *cfg;
struct irq_desc *desc;
int count;
+ int node;
int i;

cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
+ node= cpu_to_node(boot_cpu_id);

for (i = 0; i < count; i++) {
desc = irq_to_desc(i);
desc->chip_data = &cfg[i];
- alloc_bootmem_cpumask_var(&cfg[i].domain);
- alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
+ alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
if (i < NR_IRQS_LEGACY)
cpumask_setall(cfg[i].domain);
}
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 08151d4..c796a86 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -95,7 +95,6 @@
#include <linux/timer.h>
#include <linux/interrupt.h>
#include <linux/workqueue.h>
-#include <linux/bootmem.h>
#include <linux/pm.h>
#include <linux/font.h>
#include <linux/bitops.h>
@@ -2875,14 +2874,11 @@ static int __init con_init(void)
mod_timer(&console_timer, jiffies + blankinterval);
}

- /*
- * kmalloc is not running yet - we use the bootmem allocator.
- */
for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) {
- vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data));
+ vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT);
INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
visual_init(vc, currcons, 1);
- vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size);
+ vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT);
vc->vc_kmalloced = 0;
vc_init(vc, vc->vc_rows, vc->vc_cols,
currcons || !vc->vc_sw->con_save_screen);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index eedbb8e..1e50c34 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -430,23 +430,19 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
* Returns true if successful (or not required).
*/
static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
- bool boot)
+ bool boot)
{
-#ifdef CONFIG_CPUMASK_OFFSTACK
- if (boot) {
- alloc_bootmem_cpumask_var(&desc->affinity);
+ gfp_t gfp = GFP_ATOMIC;

-#ifdef CONFIG_GENERIC_PENDING_IRQ
- alloc_bootmem_cpumask_var(&desc->pending_mask);
-#endif
- return true;
- }
+ if (boot)
+ gfp = GFP_NOWAIT;

- if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
return false;

#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+ if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
free_cpumask_var(desc->affinity);
return false;
}
diff --git a/init/main.c b/init/main.c
index bb7dc57..859af21 100644
--- a/init/main.c
+++ b/init/main.c
@@ -574,6 +574,26 @@ asmlinkage void __init start_kernel(void)
setup_nr_cpu_ids();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */

+ build_all_zonelists();
+ page_alloc_init();
+
+ printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+ parse_early_param();
+ parse_args("Booting kernel", static_command_line, __start___param,
+ __stop___param - __start___param,
+ &unknown_bootoption);
+ /*
+ * These use large bootmem allocations and must precede
+ * kmem_cache_init()
+ */
+ pidhash_init();
+ vfs_caches_init_early();
+ /*
+ * Set up kernel memory allocators
+ */
+ mem_init();
+ kmem_cache_init();
+ vmalloc_init();
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
@@ -585,13 +605,6 @@ asmlinkage void __init start_kernel(void)
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
- build_all_zonelists();
- page_alloc_init();
- printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
- parse_early_param();
- parse_args("Booting kernel", static_command_line, __start___param,
- __stop___param - __start___param,
- &unknown_bootoption);
if (!irqs_disabled()) {
printk(KERN_WARNING "start_kernel(): bug: interrupts were "
"enabled *very* early, fixing it\n");
@@ -603,7 +616,6 @@ asmlinkage void __init start_kernel(void)
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
- pidhash_init();
init_timers();
hrtimers_init();
softirq_init();
@@ -645,14 +657,10 @@ asmlinkage void __init start_kernel(void)
initrd_start = 0;
}
#endif
- vmalloc_init();
- vfs_caches_init_early();
cpuset_init_early();
page_cgroup_init();
- mem_init();
enable_debug_pagealloc();
cpu_hotplug_init();
- kmem_cache_init();
kmemtrace_init();
debug_objects_mem_init();
idr_init_cache();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026facc..d5a7e17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = {

int __init cpuset_init_early(void)
{
- alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
+ alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);

top_cpuset.mems_generation = cpuset_mems_generation++;
return 0;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a600184..e161999 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -150,6 +150,7 @@ int __init early_irq_init(void)
{
struct irq_desc *desc;
int legacy_count;
+ int node;
int i;

init_irq_default_affinity();
@@ -160,20 +161,20 @@ int __init early_irq_init(void)

desc = irq_desc_legacy;
legacy_count = ARRAY_SIZE(irq_desc_legacy);
+ node = first_online_node;

/* allocate irq_desc_ptrs array based on nr_irqs */
irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));

/* allocate based on nr_cpu_ids */
- /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
- kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
- sizeof(int));
+ kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
+ sizeof(int), GFP_NOWAIT, node);

for (i = 0; i < legacy_count; i++) {
desc[i].irq = i;
desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- alloc_desc_masks(&desc[i], 0, true);
+ alloc_desc_masks(&desc[i], node, true);
init_desc_masks(&desc[i]);
irq_desc_ptrs[i] = desc + i;
}
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e04..28cf26a 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
buffer_bytes = prof_len*sizeof(atomic_t);
- if (!slab_is_available()) {
- prof_buffer = alloc_bootmem(buffer_bytes);
- alloc_bootmem_cpumask_var(&prof_cpu_mask);
- cpumask_copy(prof_cpu_mask, cpu_possible_mask);
- return 0;
- }

if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
return -ENOMEM;
diff --git a/kernel/sched.c b/kernel/sched.c
index 14c447a..dcf2dc2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -68,7 +68,6 @@
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
-#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
@@ -7782,24 +7781,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)

static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
+
memset(rd, 0, sizeof(*rd));

- if (bootmem) {
- alloc_bootmem_cpumask_var(&def_root_domain.span);
- alloc_bootmem_cpumask_var(&def_root_domain.online);
- alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
- cpupri_init(&rd->cpupri, true);
- return 0;
- }
+ if (bootmem)
+ gfp = GFP_NOWAIT;

- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->span, gfp))
goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->online, gfp))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->rto_mask, gfp))
goto free_online;

- if (cpupri_init(&rd->cpupri, false) != 0)
+ if (cpupri_init(&rd->cpupri, bootmem) != 0)
goto free_rto_mask;
return 0;

@@ -9123,7 +9119,7 @@ void __init sched_init(void)
* we use alloc_bootmem().
*/
if (alloc_size) {
- ptr = (unsigned long)alloc_bootmem(alloc_size);
+ ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.se = (struct sched_entity **)ptr;
@@ -9314,13 +9310,13 @@ void __init sched_init(void)
current->sched_class = &fair_sched_class;

/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
- alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- alloc_bootmem_cpumask_var(&nohz.cpu_mask);
- alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+ alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */

scheduler_running = 1;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 344712a..7deffc9 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
*/
int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
int i;

+ if (bootmem)
+ gfp = GFP_NOWAIT;
+
memset(cp, 0, sizeof(*cp));

for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)

spin_lock_init(&vec->lock);
vec->count = 0;
- if (bootmem)
- alloc_bootmem_cpumask_var(&vec->mask);
- else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&vec->mask, gfp))
goto cleanup;
}

diff --git a/lib/cpumask.c b/lib/cpumask.c
index eb23aaa..7bb4142 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -92,15 +92,8 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
*/
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
- if (likely(slab_is_available()))
- *mask = kmalloc_node(cpumask_size(), flags, node);
- else {
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
- printk(KERN_ERR
- "=> alloc_cpumask_var: kmalloc not available!\n");
-#endif
- *mask = NULL;
- }
+ *mask = kmalloc_node(cpumask_size(), flags, node);
+
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
if (!*mask) {
printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf9271..282df0a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
#ifdef CONFIG_HAVE_ARCH_BOOTMEM
bootmem_data_t *p_bdata;

@@ -662,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
}

@@ -693,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
{
void *ptr;

+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
@@ -745,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
return ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 791905c..3dd4a90 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -47,6 +47,8 @@ static int __init alloc_node_page_cgroup(int nid)
struct page_cgroup *base, *pc;
unsigned long table_size;
unsigned long start_pfn, nr_pages, index;
+ struct page *page;
+ unsigned int order;

start_pfn = NODE_DATA(nid)->node_start_pfn;
nr_pages = NODE_DATA(nid)->node_spanned_pages;
@@ -55,11 +57,13 @@ static int __init alloc_node_page_cgroup(int nid)
return 0;

table_size = sizeof(struct page_cgroup) * nr_pages;
-
- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
- if (!base)
+ order = get_order(table_size);
+ page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order);
+ if (!page)
+ page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order);
+ if (!page)
return -ENOMEM;
+ base = page_address(page);
for (index = 0; index < nr_pages; index++) {
pc = base + index;
__init_page_cgroup(pc, start_pfn + index);
diff --git a/mm/slab.c b/mm/slab.c
index f85831d..2bd611f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -315,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache,
struct kmem_list3 *l3, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
int node);
-static int enable_cpucache(struct kmem_cache *cachep);
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);

/*
@@ -958,12 +958,12 @@ static void __cpuinit start_cpu_timer(int cpu)
}

static struct array_cache *alloc_arraycache(int node, int entries,
- int batchcount)
+ int batchcount, gfp_t gfp)
{
int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
struct array_cache *nc = NULL;

- nc = kmalloc_node(memsize, GFP_KERNEL, node);
+ nc = kmalloc_node(memsize, gfp, node);
if (nc) {
nc->avail = 0;
nc->limit = entries;
@@ -1003,7 +1003,7 @@ static int transfer_objects(struct array_cache *to,
#define drain_alien_cache(cachep, alien) do { } while (0)
#define reap_alien(cachep, l3) do { } while (0)

-static inline struct array_cache **alloc_alien_cache(int node, int limit)
+static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
return (struct array_cache **)BAD_ALIEN_MAGIC;
}
@@ -1034,7 +1034,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);

-static struct array_cache **alloc_alien_cache(int node, int limit)
+static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
struct array_cache **ac_ptr;
int memsize = sizeof(void *) * nr_node_ids;
@@ -1042,14 +1042,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)

if (limit > 1)
limit = 12;
- ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+ ac_ptr = kmalloc_node(memsize, gfp, node);
if (ac_ptr) {
for_each_node(i) {
if (i == node || !node_online(i)) {
ac_ptr[i] = NULL;
continue;
}
- ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+ ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
if (!ac_ptr[i]) {
for (i--; i >= 0; i--)
kfree(ac_ptr[i]);
@@ -1282,20 +1282,20 @@ static int __cpuinit cpuup_prepare(long cpu)
struct array_cache **alien = NULL;

nc = alloc_arraycache(node, cachep->limit,
- cachep->batchcount);
+ cachep->batchcount, GFP_KERNEL);
if (!nc)
goto bad;
if (cachep->shared) {
shared = alloc_arraycache(node,
cachep->shared * cachep->batchcount,
- 0xbaadf00d);
+ 0xbaadf00d, GFP_KERNEL);
if (!shared) {
kfree(nc);
goto bad;
}
}
if (use_alien_caches) {
- alien = alloc_alien_cache(node, cachep->limit);
+ alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
if (!alien) {
kfree(shared);
kfree(nc);
@@ -1399,10 +1399,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
{
struct kmem_list3 *ptr;

- ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+ ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
BUG_ON(!ptr);

- local_irq_disable();
memcpy(ptr, list, sizeof(struct kmem_list3));
/*
* Do not assume that spinlocks can be initialized via memcpy:
@@ -1411,7 +1410,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,

MAKE_ALL_LISTS(cachep, ptr, nodeid);
cachep->nodelists[nodeid] = ptr;
- local_irq_enable();
}

/*
@@ -1575,9 +1573,8 @@ void __init kmem_cache_init(void)
{
struct array_cache *ptr;

- ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

- local_irq_disable();
BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
memcpy(ptr, cpu_cache_get(&cache_cache),
sizeof(struct arraycache_init));
@@ -1587,11 +1584,9 @@ void __init kmem_cache_init(void)
spin_lock_init(&ptr->lock);

cache_cache.array[smp_processor_id()] = ptr;
- local_irq_enable();

- ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

- local_irq_disable();
BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
!= &initarray_generic.cache);
memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1603,7 +1598,6 @@ void __init kmem_cache_init(void)

malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
ptr;
- local_irq_enable();
}
/* 5) Replace the bootstrap kmem_list3's */
{
@@ -1627,7 +1621,7 @@ void __init kmem_cache_init(void)
struct kmem_cache *cachep;
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next)
- if (enable_cpucache(cachep))
+ if (enable_cpucache(cachep, GFP_NOWAIT))
BUG();
mutex_unlock(&cache_chain_mutex);
}
@@ -2064,10 +2058,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
return left_over;
}

-static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
if (g_cpucache_up == FULL)
- return enable_cpucache(cachep);
+ return enable_cpucache(cachep, gfp);

if (g_cpucache_up == NONE) {
/*
@@ -2089,7 +2083,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
g_cpucache_up = PARTIAL_AC;
} else {
cachep->array[smp_processor_id()] =
- kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ kmalloc(sizeof(struct arraycache_init), gfp);

if (g_cpucache_up == PARTIAL_AC) {
set_up_list3s(cachep, SIZE_L3);
@@ -2153,6 +2147,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
+ gfp_t gfp;

/*
* Sanity checks... these are all serious usage bugs.
@@ -2168,8 +2163,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* We use cache_chain_mutex to ensure a consistent view of
* cpu_online_mask as well. Please see cpuup_callback
*/
- get_online_cpus();
- mutex_lock(&cache_chain_mutex);
+ if (slab_is_available()) {
+ get_online_cpus();
+ mutex_lock(&cache_chain_mutex);
+ }

list_for_each_entry(pc, &cache_chain, next) {
char tmp;
@@ -2278,8 +2275,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
align = ralign;

+ if (slab_is_available())
+ gfp = GFP_KERNEL;
+ else
+ gfp = GFP_NOWAIT;
+
/* Get cache's description obj. */
- cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
+ cachep = kmem_cache_zalloc(&cache_cache, gfp);
if (!cachep)
goto oops;

@@ -2382,7 +2384,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep->ctor = ctor;
cachep->name = name;

- if (setup_cpu_cache(cachep)) {
+ if (setup_cpu_cache(cachep, gfp)) {
__kmem_cache_destroy(cachep);
cachep = NULL;
goto oops;
@@ -2394,8 +2396,10 @@ oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
- mutex_unlock(&cache_chain_mutex);
- put_online_cpus();
+ if (slab_is_available()) {
+ mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
+ }
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -3802,7 +3806,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
/*
* This initializes kmem_list3 or resizes various caches for all nodes.
*/
-static int alloc_kmemlist(struct kmem_cache *cachep)
+static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
{
int node;
struct kmem_list3 *l3;
@@ -3812,7 +3816,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
for_each_online_node(node) {

if (use_alien_caches) {
- new_alien = alloc_alien_cache(node, cachep->limit);
+ new_alien = alloc_alien_cache(node, cachep->limit, gfp);
if (!new_alien)
goto fail;
}
@@ -3821,7 +3825,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
if (cachep->shared) {
new_shared = alloc_arraycache(node,
cachep->shared*cachep->batchcount,
- 0xbaadf00d);
+ 0xbaadf00d, gfp);
if (!new_shared) {
free_alien_cache(new_alien);
goto fail;
@@ -3850,7 +3854,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
free_alien_cache(new_alien);
continue;
}
- l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+ l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
if (!l3) {
free_alien_cache(new_alien);
kfree(new_shared);
@@ -3906,18 +3910,18 @@ static void do_ccupdate_local(void *info)

/* Always called with the cache_chain_mutex held */
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared)
+ int batchcount, int shared, gfp_t gfp)
{
struct ccupdate_struct *new;
int i;

- new = kzalloc(sizeof(*new), GFP_KERNEL);
+ new = kzalloc(sizeof(*new), gfp);
if (!new)
return -ENOMEM;

for_each_online_cpu(i) {
new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
- batchcount);
+ batchcount, gfp);
if (!new->new[i]) {
for (i--; i >= 0; i--)
kfree(new->new[i]);
@@ -3944,11 +3948,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
kfree(ccold);
}
kfree(new);
- return alloc_kmemlist(cachep);
+ return alloc_kmemlist(cachep, gfp);
}

/* Called with cache_chain_mutex held always */
-static int enable_cpucache(struct kmem_cache *cachep)
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
int err;
int limit, shared;
@@ -3994,7 +3998,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
if (limit > 32)
limit = 32;
#endif
- err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
+ err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
if (err)
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
cachep->name, -err);
@@ -4300,7 +4304,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
res = 0;
} else {
res = do_tune_cpucache(cachep, limit,
- batchcount, shared);
+ batchcount, shared,
+ GFP_KERNEL);
}
break;
}
diff --git a/mm/slub.c b/mm/slub.c
index 5e805a6..c1815a6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2557,13 +2557,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
if (gfp_flags & SLUB_DMA)
flags = SLAB_CACHE_DMA;

- down_write(&slub_lock);
+ /*
+ * This function is called with IRQs disabled during early-boot on
+ * single CPU so there's no need to take slub_lock here.
+ */
if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
flags, NULL))
goto panic;

list_add(&s->list, &slab_caches);
- up_write(&slub_lock);
+
if (sysfs_slab_add(s))
goto panic;
return s;
@@ -3021,7 +3024,7 @@ void __init kmem_cache_init(void)
* kmem_cache_open for slab_state == DOWN.
*/
create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
- sizeof(struct kmem_cache_node), GFP_KERNEL);
+ sizeof(struct kmem_cache_node), GFP_NOWAIT);
kmalloc_caches[0].refcount = -1;
caches++;

@@ -3034,16 +3037,16 @@ void __init kmem_cache_init(void)
/* Caches that are not of the two-to-the-power-of size */
if (KMALLOC_MIN_SIZE <= 64) {
create_kmalloc_cache(&kmalloc_caches[1],
- "kmalloc-96", 96, GFP_KERNEL);
+ "kmalloc-96", 96, GFP_NOWAIT);
caches++;
create_kmalloc_cache(&kmalloc_caches[2],
- "kmalloc-192", 192, GFP_KERNEL);
+ "kmalloc-192", 192, GFP_NOWAIT);
caches++;
}

for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
create_kmalloc_cache(&kmalloc_caches[i],
- "kmalloc", 1 << i, GFP_KERNEL);
+ "kmalloc", 1 << i, GFP_NOWAIT);
caches++;
}

@@ -3080,7 +3083,7 @@ void __init kmem_cache_init(void)
/* Provide the correct kmalloc names now that the caches are up */
for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
kmalloc_caches[i]. name =
- kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+ kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);

#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716e..3235138 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,7 +23,6 @@
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
-#include <linux/bootmem.h>
#include <linux/pfn.h>

#include <asm/atomic.h>
@@ -1032,7 +1031,7 @@ void __init vmalloc_init(void)

/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
- va = alloc_bootmem(sizeof(struct vmap_area));
+ va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
va->flags = tmp->flags | VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;

2009-06-11 11:35:55

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31


* Pekka J Enberg <[email protected]> wrote:

> Hi Linus,
>
> The following changes since commit 991ec02cdca33b03a132a0cacfe6f0aa0be9aa8d:
> Linus Torvalds (1):
> Merge branch 'tracing-urgent-for-linus' of git://git.kernel.org/.../tip/linux-2.6-tip
>
> are available in the git repository at:
>
> ssh://master.kernel.org/pub/scm/linux/kernel/git/penberg/slab-2.6 for-linus

Thanks Pekka for sorting this out - this tree looks nice and it does
not conflict with anything we have pending. I started testing it on
6 boxes - if you don't hear from me within a few hours it's all fine.

Ingo

2009-06-11 11:41:40

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31


* Ingo Molnar <[email protected]> wrote:

> > are available in the git repository at:
> >
> > ssh://master.kernel.org/pub/scm/linux/kernel/git/penberg/slab-2.6
> > for-linus
>
> Thanks Pekka for sorting this out - this tree looks nice and it
> does not conflict with anything we have pending. I started testing
> it on 6 boxes - if you dont hear from me within a few hours it's
> all fine.

Hm, with this pulled on a testbox I'm still getting:

[ 0.000000] Experimental hierarchical RCU init done.
[ 0.000000] NR_IRQS:4352 nr_irqs:256
[ 0.000000] ------------[ cut here ]------------
[ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
[ 0.000000] Hardware name: To Be Filled By O.E.M.
[ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #59709
[ 0.000000] Call Trace:
[ 0.000000] [<ffffffff823f8c8e>] ? alloc_arch_preferred_bootmem+0x40/0x7e
[ 0.000000] [<ffffffff81067168>] warn_slowpath_common+0x88/0xcb
[ 0.000000] [<ffffffff810671d2>] warn_slowpath_null+0x27/0x3d
[ 0.000000] [<ffffffff823f8c8e>] alloc_arch_preferred_bootmem+0x40/0x7e
[ 0.000000] [<ffffffff823f9307>] ___alloc_bootmem_nopanic+0x4e/0xec
[ 0.000000] [<ffffffff823f93c5>] ___alloc_bootmem+0x20/0x61
[ 0.000000] [<ffffffff823f962e>] __alloc_bootmem+0x1e/0x34
[ 0.000000] [<ffffffff823f757c>] early_irq_init+0x6d/0x118
[ 0.000000] [<ffffffff823e0140>] ? early_idt_handler+0x0/0x71
[ 0.000000] [<ffffffff823e0cf7>] start_kernel+0x192/0x394
[ 0.000000] [<ffffffff823e0140>] ? early_idt_handler+0x0/0x71
[ 0.000000] [<ffffffff823e02ad>] x86_64_start_reservations+0xb4/0xcf
[ 0.000000] [<ffffffff823e0000>] ? __init_begin+0x0/0x140
[ 0.000000] [<ffffffff823e0420>] x86_64_start_kernel+0x158/0x17b
[ 0.000000] ---[ end trace a7919e7f17c0a725 ]---
[ 0.000000] Fast TSC calibration using PIT
[ 0.000000] Detected 2002.510 MHz processor.
[ 0.004000] Console: colour VGA+ 80x25

Wasn't this supposed to have gone away with this tree?

Config and full bootlog attached.

Ingo


Attachments:
(No filename) (2.04 kB)
config (60.78 kB)
boot.log (236.65 kB)
Download all attachments

2009-06-11 11:42:40

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31


* Ingo Molnar <[email protected]> wrote:

> Hm, with this pulled on a testbox i'm still getting:
>
> [ 0.000000] Experimental hierarchical RCU init done.
> [ 0.000000] NR_IRQS:4352 nr_irqs:256
> [ 0.000000] ------------[ cut here ]------------
> [ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()

Another testbox crashed on bootup. I'm collecting a serial log from
it - config attached meanwhile.

Ingo


Attachments:
(No filename) (448.00 B)
config (66.31 kB)
Download all attachments

2009-06-11 11:49:20

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31


* Ingo Molnar <[email protected]> wrote:

> * Ingo Molnar <[email protected]> wrote:
>
> > Hm, with this pulled on a testbox i'm still getting:
> >
> > [ 0.000000] Experimental hierarchical RCU init done.
> > [ 0.000000] NR_IRQS:4352 nr_irqs:256
> > [ 0.000000] ------------[ cut here ]------------
> > [ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
>
> Another testbox crashed on bootup. I'm collecting a serial log
> from it - config attached meanwhile.

We have a hard crash in the WP-protect code:

[ 0.000000] Checking if this processor honours the WP bit even in supervisor mode...BUG: Int 14: CR2 ffcff000
[ 0.000000] EDI 00000188 ESI 00000ac7 EBP c17eaf9c ESP c17eaf8c
[ 0.000000] EBX 000014e0 EDX 0000000e ECX 01856067 EAX 00000001
[ 0.000000] err 00000003 EIP c10135b1 CS 00000060 flg 00010002
[ 0.000000] Stack: c17eafa8 c17fd410 c16747bc c17eafc4 c17fd7e5 000011fd f8616000 c18237cc
[ 0.000000] 00099800 c17bb000 c17eafec c17f1668 000001c5 c17f1322 c166e039 c1822bf0
[ 0.000000] c166e033 c153a014 c18237cc 00020800 c17eaff8 c17f106a 00020800 01ba5003
[ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #52203
[ 0.000000] Call Trace:
[ 0.000000] [<c15357c2>] ? printk+0x14/0x16
[ 0.000000] [<c10135b1>] ? do_test_wp_bit+0x19/0x23
[ 0.000000] [<c17fd410>] ? test_wp_bit+0x26/0x64
[ 0.000000] [<c17fd7e5>] ? mem_init+0x1ba/0x1d8
[ 0.000000] [<c17f1668>] ? start_kernel+0x164/0x2f7
[ 0.000000] [<c17f1322>] ? unknown_bootoption+0x0/0x19c
[ 0.000000] [<c17f106a>] ? __init_begin+0x6a/0x6f

(full bootlog attached)

Just a quick analysis from the place we crash (without looking into
any details): the WP test is the first time we really make use of
the MMU during bootup - crashes there are often a sign of messed-up
pagetables, which are easy to end up with if the bootmem allocator
is changed.

These patches need more work.

Ingo

[ 0.000000] Initializing cgroup subsys cpu
[ 0.000000] Linux version 2.6.30-tip-02161-g7a74539-dirty (mingo@sirius) (gcc version 4.3.2 20081105 (Red Hat 4.3.2-7) (GCC) ) #52203 SMP Thu Jun 11 13:37:22 CEST 2009
[ 0.000000] KERNEL supported cpus:
[ 0.000000] Intel GenuineIntel
[ 0.000000] AMD AuthenticAMD
[ 0.000000] NSC Geode by NSC
[ 0.000000] Cyrix CyrixInstead
[ 0.000000] Centaur CentaurHauls
[ 0.000000] Transmeta GenuineTMx86
[ 0.000000] Transmeta TransmetaCPU
[ 0.000000] UMC UMC UMC UMC
[ 0.000000] BIOS-provided physical RAM map:
[ 0.000000] BIOS-e820: 0000000000000000 - 000000000009f800 (usable)
[ 0.000000] BIOS-e820: 000000000009f800 - 00000000000a0000 (reserved)
[ 0.000000] BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved)
[ 0.000000] BIOS-e820: 0000000000100000 - 000000003fff0000 (usable)
[ 0.000000] BIOS-e820: 000000003fff0000 - 000000003fff3000 (ACPI NVS)
[ 0.000000] BIOS-e820: 000000003fff3000 - 0000000040000000 (ACPI data)
[ 0.000000] BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved)
[ 0.000000] BIOS-e820: 00000000fec00000 - 0000000100000000 (reserved)
[ 0.000000] console [earlyser0] enabled
[ 0.000000] debug: ignoring loglevel setting.
[ 0.000000] DMI 2.3 present.
[ 0.000000] last_pfn = 0x3fff0 max_arch_pfn = 0x100000
[ 0.000000] Warning only 894MB will be used.
[ 0.000000] Use a HIGHMEM enabled kernel.
[ 0.000000] initial memory mapped : 0 - 01c00000
[ 0.000000] init_memory_mapping: 0000000000000000-0000000037e16000
[ 0.000000] 0000000000 - 0000400000 page 4k
[ 0.000000] 0000400000 - 0037c00000 page 2M
[ 0.000000] 0037c00000 - 0037e16000 page 4k
[ 0.000000] kernel direct mapping tables up to 37e16000 @ 7000-c000
[ 0.000000] ACPI: RSDP 000f76f0 00014 (v00 Nvidia)
[ 0.000000] ACPI: RSDT 3fff3040 00034 (v01 Nvidia AWRDACPI 42302E31 AWRD 00000000)
[ 0.000000] ACPI: FACP 3fff30c0 00074 (v01 Nvidia AWRDACPI 42302E31 AWRD 00000000)
[ 0.000000] ACPI: DSDT 3fff3180 06264 (v01 NVIDIA AWRDACPI 00001000 MSFT 0100000E)
[ 0.000000] ACPI: FACS 3fff0000 00040
[ 0.000000] ACPI: SRAT 3fff9500 000A0 (v01 AMD HAMMER 00000001 AMD 00000001)
[ 0.000000] ACPI: MCFG 3fff9600 0003C (v01 Nvidia AWRDACPI 42302E31 AWRD 00000000)
[ 0.000000] ACPI: APIC 3fff9440 0007C (v01 Nvidia AWRDACPI 42302E31 AWRD 00000000)
[ 0.000000] ACPI: Local APIC address 0xfee00000
[ 0.000000] 894MB LOWMEM available.
[ 0.000000] mapped low ram: 0 - 37e16000
[ 0.000000] low ram: 0 - 37e16000
[ 0.000000] node 0 low ram: 00000000 - 37e16000
[ 0.000000] node 0 bootmap 00008000 - 0000efc4
[ 0.000000] (8 early reservations) ==> bootmem [0000000000 - 0037e16000]
[ 0.000000] #0 [0000000000 - 0000001000] BIOS data page ==> [0000000000 - 0000001000]
[ 0.000000] #1 [0000001000 - 0000002000] EX TRAMPOLINE ==> [0000001000 - 0000002000]
[ 0.000000] #2 [0000006000 - 0000007000] TRAMPOLINE ==> [0000006000 - 0000007000]
[ 0.000000] #3 [0001000000 - 0001a04b14] TEXT DATA BSS ==> [0001000000 - 0001a04b14]
[ 0.000000] #4 [000009f800 - 0000100000] BIOS reserved ==> [000009f800 - 0000100000]
[ 0.000000] #5 [0001a05000 - 0001a0c149] BRK ==> [0001a05000 - 0001a0c149]
[ 0.000000] #6 [0000007000 - 0000008000] PGTABLE ==> [0000007000 - 0000008000]
[ 0.000000] #7 [0000008000 - 000000f000] BOOTMAP ==> [0000008000 - 000000f000]
[ 0.000000] Zone PFN ranges:
[ 0.000000] DMA 0x00000000 -> 0x00001000
[ 0.000000] Normal 0x00001000 -> 0x00037e16
[ 0.000000] Movable zone start PFN for each node
[ 0.000000] early_node_map[2] active PFN ranges
[ 0.000000] 0: 0x00000000 -> 0x0000009f
[ 0.000000] 0: 0x00000100 -> 0x00037e16
[ 0.000000] On node 0 totalpages: 228789
[ 0.000000] free_area_init_node: node 0, pgdat c17b2340, node_mem_map c1a0d000
[ 0.000000] DMA zone: 32 pages used for memmap
[ 0.000000] DMA zone: 0 pages reserved
[ 0.000000] DMA zone: 3967 pages, LIFO batch:0
[ 0.000000] Normal zone: 1757 pages used for memmap
[ 0.000000] Normal zone: 223033 pages, LIFO batch:31
[ 0.000000] Using APIC driver default
[ 0.000000] Nvidia board detected. Ignoring ACPI timer override.
[ 0.000000] If you got timer trouble try acpi_use_timer_override
[ 0.000000] ACPI: PM-Timer IO Port: 0x4008
[ 0.000000] ACPI: Local APIC address 0xfee00000
[ 0.000000] ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
[ 0.000000] ACPI: LAPIC (acpi_id[0x01] lapic_id[0x01] enabled)
[ 0.000000] ACPI: LAPIC_NMI (acpi_id[0x00] high edge lint[0x1])
[ 0.000000] ACPI: LAPIC_NMI (acpi_id[0x01] high edge lint[0x1])
[ 0.000000] ACPI: IOAPIC (id[0x02] address[0xfec00000] gsi_base[0])
[ 0.000000] IOAPIC[0]: apic_id 2, version 17, address 0xfec00000, GSI 0-23
[ 0.000000] ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
[ 0.000000] ACPI: BIOS IRQ0 pin2 override ignored.
[ 0.000000] ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
[ 0.000000] ACPI: INT_SRC_OVR (bus 0 bus_irq 14 global_irq 14 high edge)
[ 0.000000] ACPI: INT_SRC_OVR (bus 0 bus_irq 15 global_irq 15 high edge)
[ 0.000000] ACPI: IRQ9 used by override.
[ 0.000000] ACPI: IRQ14 used by override.
[ 0.000000] ACPI: IRQ15 used by override.
[ 0.000000] Enabling APIC mode: Flat. Using 1 I/O APICs
[ 0.000000] Using ACPI (MADT) for SMP configuration information
[ 0.000000] SMP: Allowing 2 CPUs, 0 hotplug CPUs
[ 0.000000] mapped APIC to ffffb000 (fee00000)
[ 0.000000] mapped IOAPIC to ffffa000 (fec00000)
[ 0.000000] nr_irqs_gsi: 24
[ 0.000000] PM: Registered nosave memory: 000000000009f000 - 00000000000a0000
[ 0.000000] PM: Registered nosave memory: 00000000000a0000 - 00000000000f0000
[ 0.000000] PM: Registered nosave memory: 00000000000f0000 - 0000000000100000
[ 0.000000] Allocating PCI resources starting at 40000000 (gap: 40000000:a0000000)
[ 0.000000] NR_CPUS:32 nr_cpumask_bits:32 nr_cpu_ids:2 nr_node_ids:1
[ 0.000000] PERCPU: Embedded 16 pages at c2111000, static data 41948 bytes
[ 0.000000] Built 1 zonelists in Zone order, mobility grouping on. Total pages: 227000
[ 0.000000] Kernel command line: root=/dev/sda1 earlyprintk=serial,ttyS0,115200,keep console=tty debug initcall_debug enforcing=0 apic=verbose ignore_loglevel sysrq_always_enabled selinux=0 nmi_watchdog=0 3 panic=1
[ 0.000000] debug: sysrq always enabled.
[ 0.000000] PID hash table entries: 4096 (order: 12, 16384 bytes)
[ 0.000000] Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
[ 0.000000] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
[ 0.000000] Memory: 896604k/915544k available (5344k kernel code, 18420k reserved, 2759k data, 392k init, 0k highmem)
[ 0.000000] virtual kernel memory layout:
[ 0.000000] fixmap : 0xffe18000 - 0xfffff000 (1948 kB)
[ 0.000000] vmalloc : 0xf8616000 - 0xffe16000 ( 120 MB)
[ 0.000000] lowmem : 0xc0000000 - 0xf7e16000 ( 894 MB)
[ 0.000000] .init : 0xc17f1000 - 0xc1853000 ( 392 kB)
[ 0.000000] .data : 0xc1538001 - 0xc17e9c20 (2759 kB)
[ 0.000000] .text : 0xc1000000 - 0xc1538001 (5344 kB)
[ 0.000000] Checking if this processor honours the WP bit even in supervisor mode...BUG: Int 14: CR2 ffcff000
[ 0.000000] EDI 00000188 ESI 00000ac7 EBP c17eaf9c ESP c17eaf8c
[ 0.000000] EBX 000014e0 EDX 0000000e ECX 01856067 EAX 00000001
[ 0.000000] err 00000003 EIP c10135b1 CS 00000060 flg 00010002
[ 0.000000] Stack: c17eafa8 c17fd410 c16747bc c17eafc4 c17fd7e5 000011fd f8616000 c18237cc
[ 0.000000] 00099800 c17bb000 c17eafec c17f1668 000001c5 c17f1322 c166e039 c1822bf0
[ 0.000000] c166e033 c153a014 c18237cc 00020800 c17eaff8 c17f106a 00020800 01ba5003
[ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #52203
[ 0.000000] Call Trace:
[ 0.000000] [<c15357c2>] ? printk+0x14/0x16
[ 0.000000] [<c10135b1>] ? do_test_wp_bit+0x19/0x23
[ 0.000000] [<c17fd410>] ? test_wp_bit+0x26/0x64
[ 0.000000] [<c17fd7e5>] ? mem_init+0x1ba/0x1d8
[ 0.000000] [<c17f1668>] ? start_kernel+0x164/0x2f7
[ 0.000000] [<c17f1322>] ? unknown_bootoption+0x0/0x19c
[ 0.000000] [<c17f106a>] ? __init_begin+0x6a/0x6f
[ 0.000000] BUG: Int 14: CR2 (null)
[ 0.000000] EDI 00000188 ESI 00000ac7 EBP c17eaf9c ESP c17eaf50
[ 0.000000] EBX 000014e0 EDX 0000000e ECX c17eaf58 EAX (null)
[ 0.000000] err 00000002 EIP c100033a CS 00000060 flg 00010046
[ 0.000000] Stack: c1766036 0000000e ffcff000 00000188 00000ac7 c17eaf9c c17eaf8c 000014e0
[ 0.000000] 0000000e 01856067 00000001 00000003 c10135b1 00000060 00010002 c17eafa8
[ 0.000000] c17fd410 c16747bc c17eafc4 c17fd7e5 000011fd f8616000 c18237cc 00099800
[ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #52203
[ 0.000000] Call Trace:
[ 0.000000] [<c15357c2>] ? printk+0x14/0x16
[ 0.000000] [<c10135b1>] ? do_test_wp_bit+0x19/0x23
[ 0.000000] [<c17fd410>] ? test_wp_bit+0x26/0x64
[ 0.000000] [<c17fd7e5>] ? mem_init+0x1ba/0x1d8
[ 0.000000] [<c17f1668>] ? start_kernel+0x164/0x2f7
[ 0.000000] [<c17f1322>] ? unknown_bootoption+0x0/0x19c
[ 0.000000] [<c17f106a>] ? __init_begin+0x6a/0x6f

2009-06-11 11:50:00

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31

On Thu, 11 Jun 2009, Ingo Molnar wrote:
> Wasnt this supposed to have gone away with this tree?

Yes, if I hadn't messed up the merge conflict fixup. :-) I am not hitting
that with my configuration, btw.

Pekka

From fc1ebbcad1d86b67deed4442df9445cc81061c85 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <[email protected]>
Date: Thu, 11 Jun 2009 14:46:49 +0300
Subject: [PATCH] irq: use kcalloc() instead of the bootmem allocator

Fixes the following problem:

[ 0.000000] Experimental hierarchical RCU init done.
[ 0.000000] NR_IRQS:4352 nr_irqs:256
[ 0.000000] ------------[ cut here ]------------
[ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
[ 0.000000] Hardware name: To Be Filled By O.E.M.
[ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #59709
[ 0.000000] Call Trace:
[ 0.000000] [<ffffffff823f8c8e>] ? alloc_arch_preferred_bootmem+0x40/0x7e
[ 0.000000] [<ffffffff81067168>] warn_slowpath_common+0x88/0xcb
[ 0.000000] [<ffffffff810671d2>] warn_slowpath_null+0x27/0x3d
[ 0.000000] [<ffffffff823f8c8e>] alloc_arch_preferred_bootmem+0x40/0x7e
[ 0.000000] [<ffffffff823f9307>] ___alloc_bootmem_nopanic+0x4e/0xec
[ 0.000000] [<ffffffff823f93c5>] ___alloc_bootmem+0x20/0x61
[ 0.000000] [<ffffffff823f962e>] __alloc_bootmem+0x1e/0x34
[ 0.000000] [<ffffffff823f757c>] early_irq_init+0x6d/0x118
[ 0.000000] [<ffffffff823e0140>] ? early_idt_handler+0x0/0x71
[ 0.000000] [<ffffffff823e0cf7>] start_kernel+0x192/0x394
[ 0.000000] [<ffffffff823e0140>] ? early_idt_handler+0x0/0x71
[ 0.000000] [<ffffffff823e02ad>] x86_64_start_reservations+0xb4/0xcf
[ 0.000000] [<ffffffff823e0000>] ? __init_begin+0x0/0x140
[ 0.000000] [<ffffffff823e0420>] x86_64_start_kernel+0x158/0x17b
[ 0.000000] ---[ end trace a7919e7f17c0a725 ]---
[ 0.000000] Fast TSC calibration using PIT
[ 0.000000] Detected 2002.510 MHz processor.
[ 0.004000] Console: colour VGA+ 80x25

Reported-by: Ingo Molnar <[email protected]>
Signed-off-by: Pekka Enberg <[email protected]>
---
kernel/irq/handle.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e161999..1045785 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -164,7 +164,7 @@ int __init early_irq_init(void)
node = first_online_node;

/* allocate irq_desc_ptrs array based on nr_irqs */
- irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+ irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);

/* allocate based on nr_cpu_ids */
kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
--
1.6.0.4

2009-06-11 11:55:22

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31


* Ingo Molnar <[email protected]> wrote:

> [...] I started testing it on 6 boxes - if you dont hear from me
> within a few hours it's all fine.

Quick runtime test summary:

- 3 boxes produced the boot warning
- two crashed (both 32-bit - i guess the WP test crash)
- one booted up fine

So there's widespread breakage on x86, i'd suggest holding off on
this for another day, so that we can get it fixed and tested some
more.

I still like patches of course! :-)

Ingo

2009-06-11 11:56:19

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31

On Thu, 2009-06-11 at 13:48 +0200, Ingo Molnar wrote:
> * Ingo Molnar <[email protected]> wrote:
>
> > * Ingo Molnar <[email protected]> wrote:
> >
> > > Hm, with this pulled on a testbox i'm still getting:
> > >
> > > [ 0.000000] Experimental hierarchical RCU init done.
> > > [ 0.000000] NR_IRQS:4352 nr_irqs:256
> > > [ 0.000000] ------------[ cut here ]------------
> > > [ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
> >
> > Another testbox crashed on bootup. I'm collecting a serial log
> > from it - config attached meanwhile.
>
> We have a hard crash in the WP-protect code:
>
> [ 0.000000] Checking if this processor honours the WP bit even in supervisor mode...BUG: Int 14: CR2 ffcff000
> [ 0.000000] EDI 00000188 ESI 00000ac7 EBP c17eaf9c ESP c17eaf8c
> [ 0.000000] EBX 000014e0 EDX 0000000e ECX 01856067 EAX 00000001
> [ 0.000000] err 00000003 EIP c10135b1 CS 00000060 flg 00010002
> [ 0.000000] Stack: c17eafa8 c17fd410 c16747bc c17eafc4 c17fd7e5 000011fd f8616000 c18237cc
> [ 0.000000] 00099800 c17bb000 c17eafec c17f1668 000001c5 c17f1322 c166e039 c1822bf0
> [ 0.000000] c166e033 c153a014 c18237cc 00020800 c17eaff8 c17f106a 00020800 01ba5003
> [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #52203
> [ 0.000000] Call Trace:
> [ 0.000000] [<c15357c2>] ? printk+0x14/0x16
> [ 0.000000] [<c10135b1>] ? do_test_wp_bit+0x19/0x23
> [ 0.000000] [<c17fd410>] ? test_wp_bit+0x26/0x64
> [ 0.000000] [<c17fd7e5>] ? mem_init+0x1ba/0x1d8
> [ 0.000000] [<c17f1668>] ? start_kernel+0x164/0x2f7
> [ 0.000000] [<c17f1322>] ? unknown_bootoption+0x0/0x19c
> [ 0.000000] [<c17f106a>] ? __init_begin+0x6a/0x6f
>
> (full bootlog attached)
>
> Just a quick analysis from the place we crash (without looking into
> any details): the WP test is the first time we really make use of
> the MMU during bootup - crashes there are often a sign of messed up
> pagetables, which is easy if the bootmem allocator is changed.

No, mem_init() happens _before_ slab is set up so it's probably just
this.

And yes, the patch set clearly needs more work.

Pekka

diff --git a/init/main.c b/init/main.c
index 859af21..6d38f96 100644
--- a/init/main.c
+++ b/init/main.c
@@ -588,6 +588,8 @@ asmlinkage void __init start_kernel(void)
*/
pidhash_init();
vfs_caches_init_early();
+ sort_main_extable();
+ trap_init();
/*
* Set up kernel memory allocators
*/
@@ -610,8 +612,6 @@ asmlinkage void __init start_kernel(void)
"enabled *very* early, fixing it\n");
local_irq_disable();
}
- sort_main_extable();
- trap_init();
rcu_init();
/* init some links before init_ISA_irqs() */
early_irq_init();

2009-06-11 13:59:42

by Christoph Lameter

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31

On Thu, 11 Jun 2009, Pekka J Enberg wrote:

> setup_nr_cpu_ids();
> smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
>
> + build_all_zonelists();
> + page_alloc_init();
> +
> + printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
> + parse_early_param();
> + parse_args("Booting kernel", static_command_line, __start___param,
> + __stop___param - __start___param,
> + &unknown_bootoption);
> + /*
> + * These use large bootmem allocations and must precede
> + * kmem_cache_init()
> + */
> + pidhash_init();
> + vfs_caches_init_early();
> + /*
> + * Set up kernel memory allocators
> + */
> + mem_init();
> + kmem_cache_init();
> + vmalloc_init();

Good they are all together. Maybe we can come up with an mm_init()?

> @@ -603,7 +616,6 @@ asmlinkage void __init start_kernel(void)
> /* init some links before init_ISA_irqs() */
> early_irq_init();
> init_IRQ();
> - pidhash_init();
> init_timers();
> hrtimers_init();
> softirq_init();
> @@ -645,14 +657,10 @@ asmlinkage void __init start_kernel(void)
> initrd_start = 0;
> }
> #endif
> - vmalloc_init();
> - vfs_caches_init_early();
> cpuset_init_early();
> page_cgroup_init();
> - mem_init();
> enable_debug_pagealloc();
> cpu_hotplug_init();
> - kmem_cache_init();
> kmemtrace_init();
> debug_objects_mem_init();
> idr_init_cache();

Therefore potential breakage is in cpusets, hotplug and control groups.
Have any of these been tested with these patches? NUMA?

> diff --git a/mm/slub.c b/mm/slub.c
> index 5e805a6..c1815a6 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2557,13 +2557,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
> if (gfp_flags & SLUB_DMA)
> flags = SLAB_CACHE_DMA;
>
> - down_write(&slub_lock);
> + /*
> + * This function is called with IRQs disabled during early-boot on
> + * single CPU so there's no need to take slub_lock here.
> + */
> if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,

This function is also called later when kmalloc caches are
created on demand.

2009-06-11 14:06:41

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31

Hi Christoph,

On Thu, Jun 11, 2009 at 4:58 PM, Christoph
Lameter<[email protected]> wrote:
>> @@ -645,14 +657,10 @@ asmlinkage void __init start_kernel(void)
>> initrd_start = 0;
>> }
>> #endif
>> - vmalloc_init();
>> - vfs_caches_init_early();
>> cpuset_init_early();
>> page_cgroup_init();
>> - mem_init();
>> enable_debug_pagealloc();
>> cpu_hotplug_init();
>> - kmem_cache_init();
>> kmemtrace_init();
>> debug_objects_mem_init();
>> idr_init_cache();
>
> Therefore potential breakage is in cpusets, hotplug and control groups.
> Have any of these been tested with these patches? NUMA?

Yinghai has done some testing. Control groups should be fine:

http://git.kernel.org/?p=linux/kernel/git/penberg/slab-2.6.git;a=commitdiff;h=7d63de6b4c7798a6a1c6504a65c327aaf76ef2d5

I have not tested hotplug, and cpuset_init_early() does indeed need to
be switched over to slab (it does that via fallback now).

>> diff --git a/mm/slub.c b/mm/slub.c
>> index 5e805a6..c1815a6 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -2557,13 +2557,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
>> if (gfp_flags & SLUB_DMA)
>> flags = SLAB_CACHE_DMA;
>>
>> - down_write(&slub_lock);
>> + /*
>> + * This function is called with IRQs disabled during early-boot on
>> + * single CPU so there's no need to take slub_lock here.
>> + */
>> if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
>
> This function is also called later when kmalloc caches are
> created on demand.

Where? AFAICT, only kmem_cache_init() calls the function.

Pekka

2009-06-11 14:27:29

by Christoph Lameter

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31

On Thu, 11 Jun 2009, Pekka Enberg wrote:

> > This function is also called later when kmalloc caches are
> > created on demand.
>
> Where? AFAICT, only kmem_cache_init() calls the function.

Right. I thought dma_kmalloc_cache would also call this but it open codes
create_kmalloc_cache due to the need to trylock on slub_lock.

2009-06-11 15:24:46

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31

Hi Christoph,

On Thu, Jun 11, 2009 at 5:06 PM, Pekka Enberg<[email protected]> wrote:
> I have not tested hotplug and cpuset_init_early() does indeed needs to
> be switched over to slab (it does that via fallback now).

Oh, my bad, Yinghai took care of cpusets too:

http://git.kernel.org/?p=linux/kernel/git/penberg/slab-2.6.git;a=commitdiff;h=d39aad49c3e672b7de393c39438529243610c1ad

The patch probably shouldn't have been labeled as "x86" but anyway, it's
taken care of. CPU hotplug initialization isn't using the bootmem
allocator so I don't see a problem with that either. Or did you have
something specific in mind?

Pekka

2009-06-11 17:51:19

by Yinghai Lu

[permalink] [raw]
Subject: Re: [GIT PULL v2] Early boot SLAB for 2.6.31

Pekka Enberg wrote:
> Hi Christoph,
>
> On Thu, Jun 11, 2009 at 5:06 PM, Pekka Enberg<[email protected]> wrote:
>> I have not tested hotplug and cpuset_init_early() does indeed needs to
>> be switched over to slab (it does that via fallback now).
>
> Oh, my bad, Yinghai took care of cpusets too:

your tree works here. numa and SLUB

YH

2009-06-11 18:10:14

by Pekka Enberg

[permalink] [raw]
Subject: [GIT PULL v3] Early boot SLAB for 2.6.31

Hi Linus,

Here's the third take on the early boot SLAB patches for 2.6.31. I fixed the
problems found by Ingo in his testing and rebased the series to be
bisectable. I have tested the tree on 32-bit UMA and on x86-64 with
qemu and Yinghai has tested them on NUMA+SLUB (thanks Yinghai!). Note: The
series has not been tested on non-x86 architectures so we may introduce
some breakage there.

The bulk of the changes are just replacing bootmem alloc call-sites to use
slab instead to silence bootmem fallback warnings during boot. I do expect
that we missed some corner cases but as I've said before, the slab
fallback code in bootmem should take care of those.

Pekka

The following changes since commit 991ec02cdca33b03a132a0cacfe6f0aa0be9aa8d:
Linus Torvalds (1):
Merge branch 'tracing-urgent-for-linus' of git://git.kernel.org/.../tip/linux-2.6-tip

are available in the git repository at:

ssh://master.kernel.org/pub/scm/linux/kernel/git/penberg/slab-2.6 topic/slab/earlyboot

Pekka Enberg (11):
bootmem: use slab if bootmem is no longer available
bootmem: fix slab fallback on numa
slab: setup allocators earlier in the boot sequence
vmalloc: use kzalloc() instead of alloc_bootmem()
init: introduce mm_init()
sched: use kzalloc() instead of the bootmem allocator
vt: use kzalloc() instead of the bootmem allocator
sched: use alloc_cpumask_var() instead of alloc_bootmem_cpumask_var()
sched: use slab in cpupri_init()
irq: use kcalloc() instead of the bootmem allocator
vgacon: use slab allocator instead of the bootmem allocator

Yinghai Lu (3):
x86: remove some alloc_bootmem_cpumask_var calling
irq/cpumask: make memoryless node zero happy
memcg: don't use bootmem allocator in setup code

arch/x86/kernel/apic/io_apic.c | 6 ++-
drivers/char/vt.c | 8 +---
drivers/video/console/vgacon.c | 5 +-
include/linux/irq.h | 18 +++-----
init/main.c | 41 +++++++++++++-------
kernel/cpuset.c | 2 +-
kernel/irq/handle.c | 11 +++--
kernel/profile.c | 6 ---
kernel/sched.c | 30 ++++++--------
kernel/sched_cpupri.c | 8 ++-
lib/cpumask.c | 11 +----
mm/bootmem.c | 12 ++++++
mm/page_cgroup.c | 12 ++++--
mm/slab.c | 85 +++++++++++++++++++++-------------------
mm/slub.c | 17 +++++---
mm/vmalloc.c | 3 +-
16 files changed, 145 insertions(+), 130 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1946fac..94605e7 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -177,16 +177,18 @@ int __init arch_early_irq_init(void)
struct irq_cfg *cfg;
struct irq_desc *desc;
int count;
+ int node;
int i;

cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
+ node= cpu_to_node(boot_cpu_id);

for (i = 0; i < count; i++) {
desc = irq_to_desc(i);
desc->chip_data = &cfg[i];
- alloc_bootmem_cpumask_var(&cfg[i].domain);
- alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
+ alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
if (i < NR_IRQS_LEGACY)
cpumask_setall(cfg[i].domain);
}
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 08151d4..c796a86 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -95,7 +95,6 @@
#include <linux/timer.h>
#include <linux/interrupt.h>
#include <linux/workqueue.h>
-#include <linux/bootmem.h>
#include <linux/pm.h>
#include <linux/font.h>
#include <linux/bitops.h>
@@ -2875,14 +2874,11 @@ static int __init con_init(void)
mod_timer(&console_timer, jiffies + blankinterval);
}

- /*
- * kmalloc is not running yet - we use the bootmem allocator.
- */
for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) {
- vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data));
+ vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT);
INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
visual_init(vc, currcons, 1);
- vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size);
+ vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT);
vc->vc_kmalloced = 0;
vc_init(vc, vc->vc_rows, vc->vc_cols,
currcons || !vc->vc_sw->con_save_screen);
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 38e86b8..59d7d5e 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -180,7 +180,7 @@ static inline void vga_set_mem_top(struct vc_data *c)
}

#ifdef CONFIG_VGACON_SOFT_SCROLLBACK
-#include <linux/bootmem.h>
+#include <linux/slab.h>
/* software scrollback */
static void *vgacon_scrollback;
static int vgacon_scrollback_tail;
@@ -210,8 +210,7 @@ static void vgacon_scrollback_init(int pitch)
*/
static void __init_refok vgacon_scrollback_startup(void)
{
- vgacon_scrollback = alloc_bootmem(CONFIG_VGACON_SOFT_SCROLLBACK_SIZE
- * 1024);
+ vgacon_scrollback = kcalloc(CONFIG_VGACON_SOFT_SCROLLBACK_SIZE, 1024, GFP_NOWAIT);
vgacon_scrollback_init(vga_video_num_columns * 2);
}

diff --git a/include/linux/irq.h b/include/linux/irq.h
index eedbb8e..1e50c34 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -430,23 +430,19 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
* Returns true if successful (or not required).
*/
static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
- bool boot)
+ bool boot)
{
-#ifdef CONFIG_CPUMASK_OFFSTACK
- if (boot) {
- alloc_bootmem_cpumask_var(&desc->affinity);
+ gfp_t gfp = GFP_ATOMIC;

-#ifdef CONFIG_GENERIC_PENDING_IRQ
- alloc_bootmem_cpumask_var(&desc->pending_mask);
-#endif
- return true;
- }
+ if (boot)
+ gfp = GFP_NOWAIT;

- if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
return false;

#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+ if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
free_cpumask_var(desc->affinity);
return false;
}
diff --git a/init/main.c b/init/main.c
index bb7dc57..7917695 100644
--- a/init/main.c
+++ b/init/main.c
@@ -533,6 +533,16 @@ void __init __weak thread_info_cache_init(void)
{
}

+/*
+ * Set up kernel memory allocators
+ */
+static void __init mm_init(void)
+{
+ mem_init();
+ kmem_cache_init();
+ vmalloc_init();
+}
+
asmlinkage void __init start_kernel(void)
{
char * command_line;
@@ -574,6 +584,23 @@ asmlinkage void __init start_kernel(void)
setup_nr_cpu_ids();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */

+ build_all_zonelists();
+ page_alloc_init();
+
+ printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+ parse_early_param();
+ parse_args("Booting kernel", static_command_line, __start___param,
+ __stop___param - __start___param,
+ &unknown_bootoption);
+ /*
+ * These use large bootmem allocations and must precede
+ * kmem_cache_init()
+ */
+ pidhash_init();
+ vfs_caches_init_early();
+ sort_main_extable();
+ trap_init();
+ mm_init();
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
@@ -585,25 +612,15 @@ asmlinkage void __init start_kernel(void)
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
- build_all_zonelists();
- page_alloc_init();
- printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
- parse_early_param();
- parse_args("Booting kernel", static_command_line, __start___param,
- __stop___param - __start___param,
- &unknown_bootoption);
if (!irqs_disabled()) {
printk(KERN_WARNING "start_kernel(): bug: interrupts were "
"enabled *very* early, fixing it\n");
local_irq_disable();
}
- sort_main_extable();
- trap_init();
rcu_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
- pidhash_init();
init_timers();
hrtimers_init();
softirq_init();
@@ -645,14 +662,10 @@ asmlinkage void __init start_kernel(void)
initrd_start = 0;
}
#endif
- vmalloc_init();
- vfs_caches_init_early();
cpuset_init_early();
page_cgroup_init();
- mem_init();
enable_debug_pagealloc();
cpu_hotplug_init();
- kmem_cache_init();
kmemtrace_init();
debug_objects_mem_init();
idr_init_cache();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026facc..d5a7e17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = {

int __init cpuset_init_early(void)
{
- alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
+ alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);

top_cpuset.mems_generation = cpuset_mems_generation++;
return 0;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a600184..1045785 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -150,6 +150,7 @@ int __init early_irq_init(void)
{
struct irq_desc *desc;
int legacy_count;
+ int node;
int i;

init_irq_default_affinity();
@@ -160,20 +161,20 @@ int __init early_irq_init(void)

desc = irq_desc_legacy;
legacy_count = ARRAY_SIZE(irq_desc_legacy);
+ node = first_online_node;

/* allocate irq_desc_ptrs array based on nr_irqs */
- irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+ irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);

/* allocate based on nr_cpu_ids */
- /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
- kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
- sizeof(int));
+ kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
+ sizeof(int), GFP_NOWAIT, node);

for (i = 0; i < legacy_count; i++) {
desc[i].irq = i;
desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- alloc_desc_masks(&desc[i], 0, true);
+ alloc_desc_masks(&desc[i], node, true);
init_desc_masks(&desc[i]);
irq_desc_ptrs[i] = desc + i;
}
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e04..28cf26a 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
buffer_bytes = prof_len*sizeof(atomic_t);
- if (!slab_is_available()) {
- prof_buffer = alloc_bootmem(buffer_bytes);
- alloc_bootmem_cpumask_var(&prof_cpu_mask);
- cpumask_copy(prof_cpu_mask, cpu_possible_mask);
- return 0;
- }

if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
return -ENOMEM;
diff --git a/kernel/sched.c b/kernel/sched.c
index 14c447a..dcf2dc2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -68,7 +68,6 @@
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
-#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
@@ -7782,24 +7781,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)

static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
+
memset(rd, 0, sizeof(*rd));

- if (bootmem) {
- alloc_bootmem_cpumask_var(&def_root_domain.span);
- alloc_bootmem_cpumask_var(&def_root_domain.online);
- alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
- cpupri_init(&rd->cpupri, true);
- return 0;
- }
+ if (bootmem)
+ gfp = GFP_NOWAIT;

- if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->span, gfp))
goto out;
- if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->online, gfp))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->rto_mask, gfp))
goto free_online;

- if (cpupri_init(&rd->cpupri, false) != 0)
+ if (cpupri_init(&rd->cpupri, bootmem) != 0)
goto free_rto_mask;
return 0;

@@ -9123,7 +9119,7 @@ void __init sched_init(void)
* we use alloc_bootmem().
*/
if (alloc_size) {
- ptr = (unsigned long)alloc_bootmem(alloc_size);
+ ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.se = (struct sched_entity **)ptr;
@@ -9314,13 +9310,13 @@ void __init sched_init(void)
current->sched_class = &fair_sched_class;

/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
- alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- alloc_bootmem_cpumask_var(&nohz.cpu_mask);
- alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+ alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
- alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */

scheduler_running = 1;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 344712a..7deffc9 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
*/
int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
{
+ gfp_t gfp = GFP_KERNEL;
int i;

+ if (bootmem)
+ gfp = GFP_NOWAIT;
+
memset(cp, 0, sizeof(*cp));

for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)

spin_lock_init(&vec->lock);
vec->count = 0;
- if (bootmem)
- alloc_bootmem_cpumask_var(&vec->mask);
- else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&vec->mask, gfp))
goto cleanup;
}

diff --git a/lib/cpumask.c b/lib/cpumask.c
index eb23aaa..7bb4142 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -92,15 +92,8 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
*/
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
- if (likely(slab_is_available()))
- *mask = kmalloc_node(cpumask_size(), flags, node);
- else {
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
- printk(KERN_ERR
- "=> alloc_cpumask_var: kmalloc not available!\n");
-#endif
- *mask = NULL;
- }
+ *mask = kmalloc_node(cpumask_size(), flags, node);
+
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
if (!*mask) {
printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf9271..282df0a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
#ifdef CONFIG_HAVE_ARCH_BOOTMEM
bootmem_data_t *p_bdata;

@@ -662,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
}

@@ -693,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
{
void *ptr;

+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
@@ -745,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
return ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 791905c..3dd4a90 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -47,6 +47,8 @@ static int __init alloc_node_page_cgroup(int nid)
struct page_cgroup *base, *pc;
unsigned long table_size;
unsigned long start_pfn, nr_pages, index;
+ struct page *page;
+ unsigned int order;

start_pfn = NODE_DATA(nid)->node_start_pfn;
nr_pages = NODE_DATA(nid)->node_spanned_pages;
@@ -55,11 +57,13 @@ static int __init alloc_node_page_cgroup(int nid)
return 0;

table_size = sizeof(struct page_cgroup) * nr_pages;
-
- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
- if (!base)
+ order = get_order(table_size);
+ page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order);
+ if (!page)
+ page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order);
+ if (!page)
return -ENOMEM;
+ base = page_address(page);
for (index = 0; index < nr_pages; index++) {
pc = base + index;
__init_page_cgroup(pc, start_pfn + index);
diff --git a/mm/slab.c b/mm/slab.c
index f85831d..2bd611f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -315,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache,
struct kmem_list3 *l3, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
int node);
-static int enable_cpucache(struct kmem_cache *cachep);
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
static void cache_reap(struct work_struct *unused);

/*
@@ -958,12 +958,12 @@ static void __cpuinit start_cpu_timer(int cpu)
}

static struct array_cache *alloc_arraycache(int node, int entries,
- int batchcount)
+ int batchcount, gfp_t gfp)
{
int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
struct array_cache *nc = NULL;

- nc = kmalloc_node(memsize, GFP_KERNEL, node);
+ nc = kmalloc_node(memsize, gfp, node);
if (nc) {
nc->avail = 0;
nc->limit = entries;
@@ -1003,7 +1003,7 @@ static int transfer_objects(struct array_cache *to,
#define drain_alien_cache(cachep, alien) do { } while (0)
#define reap_alien(cachep, l3) do { } while (0)

-static inline struct array_cache **alloc_alien_cache(int node, int limit)
+static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
return (struct array_cache **)BAD_ALIEN_MAGIC;
}
@@ -1034,7 +1034,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
static void *alternate_node_alloc(struct kmem_cache *, gfp_t);

-static struct array_cache **alloc_alien_cache(int node, int limit)
+static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
struct array_cache **ac_ptr;
int memsize = sizeof(void *) * nr_node_ids;
@@ -1042,14 +1042,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)

if (limit > 1)
limit = 12;
- ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+ ac_ptr = kmalloc_node(memsize, gfp, node);
if (ac_ptr) {
for_each_node(i) {
if (i == node || !node_online(i)) {
ac_ptr[i] = NULL;
continue;
}
- ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+ ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
if (!ac_ptr[i]) {
for (i--; i >= 0; i--)
kfree(ac_ptr[i]);
@@ -1282,20 +1282,20 @@ static int __cpuinit cpuup_prepare(long cpu)
struct array_cache **alien = NULL;

nc = alloc_arraycache(node, cachep->limit,
- cachep->batchcount);
+ cachep->batchcount, GFP_KERNEL);
if (!nc)
goto bad;
if (cachep->shared) {
shared = alloc_arraycache(node,
cachep->shared * cachep->batchcount,
- 0xbaadf00d);
+ 0xbaadf00d, GFP_KERNEL);
if (!shared) {
kfree(nc);
goto bad;
}
}
if (use_alien_caches) {
- alien = alloc_alien_cache(node, cachep->limit);
+ alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
if (!alien) {
kfree(shared);
kfree(nc);
@@ -1399,10 +1399,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
{
struct kmem_list3 *ptr;

- ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+ ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
BUG_ON(!ptr);

- local_irq_disable();
memcpy(ptr, list, sizeof(struct kmem_list3));
/*
* Do not assume that spinlocks can be initialized via memcpy:
@@ -1411,7 +1410,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,

MAKE_ALL_LISTS(cachep, ptr, nodeid);
cachep->nodelists[nodeid] = ptr;
- local_irq_enable();
}

/*
@@ -1575,9 +1573,8 @@ void __init kmem_cache_init(void)
{
struct array_cache *ptr;

- ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

- local_irq_disable();
BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
memcpy(ptr, cpu_cache_get(&cache_cache),
sizeof(struct arraycache_init));
@@ -1587,11 +1584,9 @@ void __init kmem_cache_init(void)
spin_lock_init(&ptr->lock);

cache_cache.array[smp_processor_id()] = ptr;
- local_irq_enable();

- ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

- local_irq_disable();
BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
!= &initarray_generic.cache);
memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1603,7 +1598,6 @@ void __init kmem_cache_init(void)

malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
ptr;
- local_irq_enable();
}
/* 5) Replace the bootstrap kmem_list3's */
{
@@ -1627,7 +1621,7 @@ void __init kmem_cache_init(void)
struct kmem_cache *cachep;
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next)
- if (enable_cpucache(cachep))
+ if (enable_cpucache(cachep, GFP_NOWAIT))
BUG();
mutex_unlock(&cache_chain_mutex);
}
@@ -2064,10 +2058,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
return left_over;
}

-static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
if (g_cpucache_up == FULL)
- return enable_cpucache(cachep);
+ return enable_cpucache(cachep, gfp);

if (g_cpucache_up == NONE) {
/*
@@ -2089,7 +2083,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
g_cpucache_up = PARTIAL_AC;
} else {
cachep->array[smp_processor_id()] =
- kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+ kmalloc(sizeof(struct arraycache_init), gfp);

if (g_cpucache_up == PARTIAL_AC) {
set_up_list3s(cachep, SIZE_L3);
@@ -2153,6 +2147,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
+ gfp_t gfp;

/*
* Sanity checks... these are all serious usage bugs.
@@ -2168,8 +2163,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* We use cache_chain_mutex to ensure a consistent view of
* cpu_online_mask as well. Please see cpuup_callback
*/
- get_online_cpus();
- mutex_lock(&cache_chain_mutex);
+ if (slab_is_available()) {
+ get_online_cpus();
+ mutex_lock(&cache_chain_mutex);
+ }

list_for_each_entry(pc, &cache_chain, next) {
char tmp;
@@ -2278,8 +2275,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
align = ralign;

+ if (slab_is_available())
+ gfp = GFP_KERNEL;
+ else
+ gfp = GFP_NOWAIT;
+
/* Get cache's description obj. */
- cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
+ cachep = kmem_cache_zalloc(&cache_cache, gfp);
if (!cachep)
goto oops;

@@ -2382,7 +2384,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep->ctor = ctor;
cachep->name = name;

- if (setup_cpu_cache(cachep)) {
+ if (setup_cpu_cache(cachep, gfp)) {
__kmem_cache_destroy(cachep);
cachep = NULL;
goto oops;
@@ -2394,8 +2396,10 @@ oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
- mutex_unlock(&cache_chain_mutex);
- put_online_cpus();
+ if (slab_is_available()) {
+ mutex_unlock(&cache_chain_mutex);
+ put_online_cpus();
+ }
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -3802,7 +3806,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
/*
* This initializes kmem_list3 or resizes various caches for all nodes.
*/
-static int alloc_kmemlist(struct kmem_cache *cachep)
+static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
{
int node;
struct kmem_list3 *l3;
@@ -3812,7 +3816,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
for_each_online_node(node) {

if (use_alien_caches) {
- new_alien = alloc_alien_cache(node, cachep->limit);
+ new_alien = alloc_alien_cache(node, cachep->limit, gfp);
if (!new_alien)
goto fail;
}
@@ -3821,7 +3825,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
if (cachep->shared) {
new_shared = alloc_arraycache(node,
cachep->shared*cachep->batchcount,
- 0xbaadf00d);
+ 0xbaadf00d, gfp);
if (!new_shared) {
free_alien_cache(new_alien);
goto fail;
@@ -3850,7 +3854,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
free_alien_cache(new_alien);
continue;
}
- l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+ l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
if (!l3) {
free_alien_cache(new_alien);
kfree(new_shared);
@@ -3906,18 +3910,18 @@ static void do_ccupdate_local(void *info)

/* Always called with the cache_chain_mutex held */
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared)
+ int batchcount, int shared, gfp_t gfp)
{
struct ccupdate_struct *new;
int i;

- new = kzalloc(sizeof(*new), GFP_KERNEL);
+ new = kzalloc(sizeof(*new), gfp);
if (!new)
return -ENOMEM;

for_each_online_cpu(i) {
new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
- batchcount);
+ batchcount, gfp);
if (!new->new[i]) {
for (i--; i >= 0; i--)
kfree(new->new[i]);
@@ -3944,11 +3948,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
kfree(ccold);
}
kfree(new);
- return alloc_kmemlist(cachep);
+ return alloc_kmemlist(cachep, gfp);
}

/* Called with cache_chain_mutex held always */
-static int enable_cpucache(struct kmem_cache *cachep)
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
int err;
int limit, shared;
@@ -3994,7 +3998,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
if (limit > 32)
limit = 32;
#endif
- err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
+ err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
if (err)
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
cachep->name, -err);
@@ -4300,7 +4304,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
res = 0;
} else {
res = do_tune_cpucache(cachep, limit,
- batchcount, shared);
+ batchcount, shared,
+ GFP_KERNEL);
}
break;
}
diff --git a/mm/slub.c b/mm/slub.c
index 5e805a6..c1815a6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2557,13 +2557,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
if (gfp_flags & SLUB_DMA)
flags = SLAB_CACHE_DMA;

- down_write(&slub_lock);
+ /*
+ * This function is called with IRQs disabled during early-boot on
+ * single CPU so there's no need to take slub_lock here.
+ */
if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
flags, NULL))
goto panic;

list_add(&s->list, &slab_caches);
- up_write(&slub_lock);
+
if (sysfs_slab_add(s))
goto panic;
return s;
@@ -3021,7 +3024,7 @@ void __init kmem_cache_init(void)
* kmem_cache_open for slab_state == DOWN.
*/
create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
- sizeof(struct kmem_cache_node), GFP_KERNEL);
+ sizeof(struct kmem_cache_node), GFP_NOWAIT);
kmalloc_caches[0].refcount = -1;
caches++;

@@ -3034,16 +3037,16 @@ void __init kmem_cache_init(void)
/* Caches that are not of the two-to-the-power-of size */
if (KMALLOC_MIN_SIZE <= 64) {
create_kmalloc_cache(&kmalloc_caches[1],
- "kmalloc-96", 96, GFP_KERNEL);
+ "kmalloc-96", 96, GFP_NOWAIT);
caches++;
create_kmalloc_cache(&kmalloc_caches[2],
- "kmalloc-192", 192, GFP_KERNEL);
+ "kmalloc-192", 192, GFP_NOWAIT);
caches++;
}

for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
create_kmalloc_cache(&kmalloc_caches[i],
- "kmalloc", 1 << i, GFP_KERNEL);
+ "kmalloc", 1 << i, GFP_NOWAIT);
caches++;
}

@@ -3080,7 +3083,7 @@ void __init kmem_cache_init(void)
/* Provide the correct kmalloc names now that the caches are up */
for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
kmalloc_caches[i]. name =
- kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+ kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);

#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716e..3235138 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,7 +23,6 @@
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
-#include <linux/bootmem.h>
#include <linux/pfn.h>

#include <asm/atomic.h>
@@ -1032,7 +1031,7 @@ void __init vmalloc_init(void)

/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
- va = alloc_bootmem(sizeof(struct vmap_area));
+ va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
va->flags = tmp->flags | VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;

2009-06-11 21:44:24

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31


* Pekka J Enberg <[email protected]> wrote:

> Hi Linus,
>
> Here's the third take on the early boot SLAB patches for 2.6.31. I
> fixed the problems found by Ingo in his testing and rebased the
> series to be bisectable. I have tested the tree on 32-bit UMA and
> on x86-64 with qemu, and Yinghai has tested them on NUMA+SLUB
> (thanks Yinghai!). Note: The series has not been tested on non-x86
> architectures so we may introduce some breakage there.

Latest -git now produces this boot warning on x86:

[ 0.000000] Memory: 885032k/915540k available (5993k kernel code, 29844k reserved, 3842k data, 428k init, 0k highmem)
[ 0.000000] virtual kernel memory layout:
[ 0.000000] fixmap : 0xffe17000 - 0xfffff000 (1952 kB)
[ 0.000000] vmalloc : 0xf8615000 - 0xffe15000 ( 120 MB)
[ 0.000000] lowmem : 0xc0000000 - 0xf7e15000 ( 894 MB)
[ 0.000000] .init : 0xc19a5000 - 0xc1a10000 ( 428 kB)
[ 0.000000] .data : 0xc15da4bb - 0xc199af6c (3842 kB)
[ 0.000000] .text : 0xc1000000 - 0xc15da4bb (5993 kB)
[ 0.000000] Checking if this processor honours the WP bit even in supervisor mode...Ok.
[ 0.000000] ------------[ cut here ]------------
[ 0.000000] WARNING: at kernel/smp.c:369 smp_call_function_many+0x50/0x1b0()
[ 0.000000] Hardware name: System Product Name
[ 0.000000] Modules linked in:
[ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip #52504
[ 0.000000] Call Trace:
[ 0.000000] [<c104aa16>] warn_slowpath_common+0x65/0x95
[ 0.000000] [<c104aa58>] warn_slowpath_null+0x12/0x15
[ 0.000000] [<c1073bbe>] smp_call_function_many+0x50/0x1b0
[ 0.000000] [<c1037615>] ? do_flush_tlb_all+0x0/0x41
[ 0.000000] [<c1037615>] ? do_flush_tlb_all+0x0/0x41
[ 0.000000] [<c1073d4f>] smp_call_function+0x31/0x58
[ 0.000000] [<c1037615>] ? do_flush_tlb_all+0x0/0x41
[ 0.000000] [<c104f635>] on_each_cpu+0x26/0x65
[ 0.000000] [<c10374b5>] flush_tlb_all+0x19/0x1b
[ 0.000000] [<c1032ab3>] zap_low_mappings+0x4d/0x56
[ 0.000000] [<c15d64b5>] ? printk+0x14/0x17
[ 0.000000] [<c19b42a8>] mem_init+0x23d/0x245
[ 0.000000] [<c19a56a1>] start_kernel+0x17a/0x2d5
[ 0.000000] [<c19a5347>] ? unknown_bootoption+0x0/0x19a
[ 0.000000] [<c19a5039>] __init_begin+0x39/0x41
[ 0.000000] ---[ end trace 4eaa2a86a8e2da22 ]---
[ 0.000000] ------------[ cut here ]------------
[ 0.000000] WARNING: at kernel/lockdep.c:2128 trace_hardirqs_on_caller+0xc6/0x143()
[ 0.000000] Hardware name: System Product Name
[ 0.000000] Modules linked in:
[ 0.000000] Pid: 0, comm: swapper Tainted: G W 2.6.30-tip #52504
[ 0.000000] Call Trace:
[ 0.000000] [<c104aa16>] warn_slowpath_common+0x65/0x95
[ 0.000000] [<c104f64b>] ? on_each_cpu+0x3c/0x65
[ 0.000000] [<c1037615>] ? do_flush_tlb_all+0x0/0x41
[ 0.000000] [<c104aa58>] warn_slowpath_null+0x12/0x15
[ 0.000000] [<c106c883>] trace_hardirqs_on_caller+0xc6/0x143
[ 0.000000] [<c106c90b>] trace_hardirqs_on+0xb/0xd
[ 0.000000] [<c104f64b>] on_each_cpu+0x3c/0x65
[ 0.000000] [<c10374b5>] flush_tlb_all+0x19/0x1b
[ 0.000000] [<c1032ab3>] zap_low_mappings+0x4d/0x56
[ 0.000000] [<c15d64b5>] ? printk+0x14/0x17
[ 0.000000] [<c19b42a8>] mem_init+0x23d/0x245
[ 0.000000] [<c19a56a1>] start_kernel+0x17a/0x2d5
[ 0.000000] [<c19a5347>] ? unknown_bootoption+0x0/0x19a
[ 0.000000] [<c19a5039>] __init_begin+0x39/0x41
[ 0.000000] ---[ end trace 4eaa2a86a8e2da23 ]---
[ 0.000000] SLUB: Genslabs=13, HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1
[ 0.000000] start_kernel(): bug: interrupts were enabled *very* early, fixing it
[ 0.000000] Preemptible RCU implementation.
[ 0.000000] NR_IRQS:2304 nr_irqs:424
[ 0.000000] Fast TSC calibration using PIT
[ 0.000000] Detected 2010.509 MHz processor.
[ 0.010000] spurious 8259A interrupt: IRQ7.
[ 0.010000] Console: colour VGA+ 80x25
[ 0.010000] console [tty0] enabled

config attached.

Ingo


Attachments:
(No filename) (3.91 kB)
config (59.75 kB)
Download all attachments

2009-06-11 22:04:31

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31


I also get this different warning, again:

[ 0.000000] ------------[ cut here ]------------
[ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x2b/0x71()
[ 0.000000] Hardware name: System Product Name
[ 0.000000] Modules linked in:
[ 0.000000] Pid: 0, comm: swapper Tainted: G W 2.6.30-tip-03087-g0bb2618-dirty #52506
[ 0.000000] Call Trace:
[ 0.000000] [<81032588>] warn_slowpath_common+0x60/0x90
[ 0.000000] [<810325c5>] warn_slowpath_null+0xd/0x10
[ 0.000000] [<819d1bc0>] alloc_arch_preferred_bootmem+0x2b/0x71
[ 0.000000] [<819d1c31>] ___alloc_bootmem_nopanic+0x2b/0x9a
[ 0.000000] [<81050a0a>] ? lock_release+0xac/0xb2
[ 0.000000] [<819d1d4c>] ___alloc_bootmem+0xe/0x2d
[ 0.000000] [<819d1e9f>] __alloc_bootmem+0xa/0xc
[ 0.000000] [<819d7c63>] alloc_bootmem_cpumask_var+0x21/0x26
[ 0.000000] [<819d0cc8>] early_irq_init+0x15/0x10d
[ 0.000000] [<819bb75a>] start_kernel+0x167/0x326
[ 0.000000] [<819bb06b>] __init_begin+0x6b/0x70
[ 0.000000] ---[ end trace 4eaa2a86a8e2da23 ]---
[ 0.000000] NR_IRQS:2304 nr_irqs:424
[ 0.000000] CPU 0 irqstacks, hard=821e6000 soft=821e7000

Ingo

2009-06-11 22:43:19

by Yinghai Lu

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31

Ingo Molnar wrote:
> i also get this different warning, again:
>
> [ 0.000000] ------------[ cut here ]------------
> [ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x2b/0x71()
> [ 0.000000] Hardware name: System Product Name
> [ 0.000000] Modules linked in:
> [ 0.000000] Pid: 0, comm: swapper Tainted: G W 2.6.30-tip-03087-g0bb2618-dirty #52506
> [ 0.000000] Call Trace:
> [ 0.000000] [<81032588>] warn_slowpath_common+0x60/0x90
> [ 0.000000] [<810325c5>] warn_slowpath_null+0xd/0x10
> [ 0.000000] [<819d1bc0>] alloc_arch_preferred_bootmem+0x2b/0x71
> [ 0.000000] [<819d1c31>] ___alloc_bootmem_nopanic+0x2b/0x9a
> [ 0.000000] [<81050a0a>] ? lock_release+0xac/0xb2
> [ 0.000000] [<819d1d4c>] ___alloc_bootmem+0xe/0x2d
> [ 0.000000] [<819d1e9f>] __alloc_bootmem+0xa/0xc
> [ 0.000000] [<819d7c63>] alloc_bootmem_cpumask_var+0x21/0x26
> [ 0.000000] [<819d0cc8>] early_irq_init+0x15/0x10d
> [ 0.000000] [<819bb75a>] start_kernel+0x167/0x326
> [ 0.000000] [<819bb06b>] __init_begin+0x6b/0x70
> [ 0.000000] ---[ end trace 4eaa2a86a8e2da23 ]---
> [ 0.000000] NR_IRQS:2304 nr_irqs:424
> [ 0.000000] CPU 0 irqstacks, hard=821e6000 soft=821e7000
>

please check

[PATCH] irq: slab alloc for default irq_affinity

Ingo had

[ 0.000000] ------------[ cut here ]------------
[ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x2b/0x71()
[ 0.000000] Hardware name: System Product Name
[ 0.000000] Modules linked in:
[ 0.000000] Pid: 0, comm: swapper Tainted: G W 2.6.30-tip-03087-g0bb2618-dirty #52506
[ 0.000000] Call Trace:
[ 0.000000] [<81032588>] warn_slowpath_common+0x60/0x90
[ 0.000000] [<810325c5>] warn_slowpath_null+0xd/0x10
[ 0.000000] [<819d1bc0>] alloc_arch_preferred_bootmem+0x2b/0x71
[ 0.000000] [<819d1c31>] ___alloc_bootmem_nopanic+0x2b/0x9a
[ 0.000000] [<81050a0a>] ? lock_release+0xac/0xb2
[ 0.000000] [<819d1d4c>] ___alloc_bootmem+0xe/0x2d
[ 0.000000] [<819d1e9f>] __alloc_bootmem+0xa/0xc
[ 0.000000] [<819d7c63>] alloc_bootmem_cpumask_var+0x21/0x26
[ 0.000000] [<819d0cc8>] early_irq_init+0x15/0x10d
[ 0.000000] [<819bb75a>] start_kernel+0x167/0x326
[ 0.000000] [<819bb06b>] __init_begin+0x6b/0x70
[ 0.000000] ---[ end trace 4eaa2a86a8e2da23 ]---
[ 0.000000] NR_IRQS:2304 nr_irqs:424
[ 0.000000] CPU 0 irqstacks, hard=821e6000 soft=821e7000

We need to update init_irq_default_affinity() to allocate its cpumask
with alloc_cpumask_var() instead of alloc_bootmem_cpumask_var().

Signed-off-by: Yinghai Lu <[email protected]>

---
kernel/irq/handle.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -45,7 +45,7 @@ void handle_bad_irq(unsigned int irq, st
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
static void __init init_irq_default_affinity(void)
{
- alloc_bootmem_cpumask_var(&irq_default_affinity);
+ alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
cpumask_setall(irq_default_affinity);
}
#else

2009-06-11 23:15:29

by Yinghai Lu

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31

Ingo Molnar wrote:
> * Pekka J Enberg <[email protected]> wrote:
>
>> Hi Linus,
>>
>> Here's the third take on the early boot SLAB patches for 2.6.31. I
>> fixed the problems found by Ingo in his testing and rebased the
>> series to be bisectable. I have tested the tree on 32-bit UMA and
>> on x86-64 with qemu, and Yinghai has tested them on NUMA+SLUB
>> (thanks Yinghai!). Note: The series has not been tested on non-x86
>> architectures so we may introduce some breakage there.
>
> Latest -git now produces this boot warning on x86:
>
> [ 0.000000] Memory: 885032k/915540k available (5993k kernel code, 29844k reserved, 3842k data, 428k init, 0k highmem)
> [ 0.000000] virtual kernel memory layout:
> [ 0.000000] fixmap : 0xffe17000 - 0xfffff000 (1952 kB)
> [ 0.000000] vmalloc : 0xf8615000 - 0xffe15000 ( 120 MB)
> [ 0.000000] lowmem : 0xc0000000 - 0xf7e15000 ( 894 MB)
> [ 0.000000] .init : 0xc19a5000 - 0xc1a10000 ( 428 kB)
> [ 0.000000] .data : 0xc15da4bb - 0xc199af6c (3842 kB)
> [ 0.000000] .text : 0xc1000000 - 0xc15da4bb (5993 kB)
> [ 0.000000] Checking if this processor honours the WP bit even in supervisor mode...Ok.
> [ 0.000000] ------------[ cut here ]------------
> [ 0.000000] WARNING: at kernel/smp.c:369 smp_call_function_many+0x50/0x1b0()
> [ 0.000000] Hardware name: System Product Name
> [ 0.000000] Modules linked in:
> [ 0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip #52504
> [ 0.000000] Call Trace:
> [ 0.000000] [<c104aa16>] warn_slowpath_common+0x65/0x95
> [ 0.000000] [<c104aa58>] warn_slowpath_null+0x12/0x15
> [ 0.000000] [<c1073bbe>] smp_call_function_many+0x50/0x1b0
> [ 0.000000] [<c1037615>] ? do_flush_tlb_all+0x0/0x41
> [ 0.000000] [<c1037615>] ? do_flush_tlb_all+0x0/0x41
> [ 0.000000] [<c1073d4f>] smp_call_function+0x31/0x58
> [ 0.000000] [<c1037615>] ? do_flush_tlb_all+0x0/0x41
> [ 0.000000] [<c104f635>] on_each_cpu+0x26/0x65
> [ 0.000000] [<c10374b5>] flush_tlb_all+0x19/0x1b
> [ 0.000000] [<c1032ab3>] zap_low_mappings+0x4d/0x56
> [ 0.000000] [<c15d64b5>] ? printk+0x14/0x17
> [ 0.000000] [<c19b42a8>] mem_init+0x23d/0x245
> [ 0.000000] [<c19a56a1>] start_kernel+0x17a/0x2d5
> [ 0.000000] [<c19a5347>] ? unknown_bootoption+0x0/0x19a
> [ 0.000000] [<c19a5039>] __init_begin+0x39/0x41
> [ 0.000000] ---[ end trace 4eaa2a86a8e2da22 ]---
> [ 0.000000] ------------[ cut here ]------------
> [ 0.000000] WARNING: at kernel/lockdep.c:2128 trace_hardirqs_on_caller+0xc6/0x143()
> [ 0.000000] Hardware name: System Product Name
> [ 0.000000] Modules linked in:
> [ 0.000000] Pid: 0, comm: swapper Tainted: G W 2.6.30-tip #52504
> [ 0.000000] Call Trace:
> [ 0.000000] [<c104aa16>] warn_slowpath_common+0x65/0x95
> [ 0.000000] [<c104f64b>] ? on_each_cpu+0x3c/0x65
> [ 0.000000] [<c1037615>] ? do_flush_tlb_all+0x0/0x41
> [ 0.000000] [<c104aa58>] warn_slowpath_null+0x12/0x15
> [ 0.000000] [<c106c883>] trace_hardirqs_on_caller+0xc6/0x143
> [ 0.000000] [<c106c90b>] trace_hardirqs_on+0xb/0xd
> [ 0.000000] [<c104f64b>] on_each_cpu+0x3c/0x65
> [ 0.000000] [<c10374b5>] flush_tlb_all+0x19/0x1b
> [ 0.000000] [<c1032ab3>] zap_low_mappings+0x4d/0x56
> [ 0.000000] [<c15d64b5>] ? printk+0x14/0x17
> [ 0.000000] [<c19b42a8>] mem_init+0x23d/0x245
> [ 0.000000] [<c19a56a1>] start_kernel+0x17a/0x2d5
> [ 0.000000] [<c19a5347>] ? unknown_bootoption+0x0/0x19a
> [ 0.000000] [<c19a5039>] __init_begin+0x39/0x41
> [ 0.000000] ---[ end trace 4eaa2a86a8e2da23 ]---
> [ 0.000000] SLUB: Genslabs=13, HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1
> [ 0.000000] start_kernel(): bug: interrupts were enabled *very* early, fixing it
> [ 0.000000] Preemptible RCU implementation.
> [ 0.000000] NR_IRQS:2304 nr_irqs:424
> [ 0.000000] Fast TSC calibration using PIT
> [ 0.000000] Detected 2010.509 MHz processor.
> [ 0.010000] spurious 8259A interrupt: IRQ7.
> [ 0.010000] Console: colour VGA+ 80x25
> [ 0.010000] console [tty0] enabled
>
> config attached.

please check

[PATCH] x86: make zap_low_mappings() usable early

Only one CPU is there at that point, so just call __flush_tlb() for it
instead of flush_tlb_all().

Signed-off-by: Yinghai Lu <[email protected]>

---
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/kernel/smpboot.c | 2 +-
arch/x86/mm/init_32.c | 10 +++++++---
3 files changed, 9 insertions(+), 5 deletions(-)

Index: linux-2.6/arch/x86/include/asm/tlbflush.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/tlbflush.h
+++ linux-2.6/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_rang
flush_tlb_all();
}

-extern void zap_low_mappings(void);
+extern void zap_low_mappings(bool early);

#endif /* _ASM_X86_TLBFLUSH_H */
Index: linux-2.6/arch/x86/kernel/smpboot.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6/arch/x86/kernel/smpboot.c
@@ -875,7 +875,7 @@ int __cpuinit native_cpu_up(unsigned int

err = do_boot_cpu(apicid, cpu);

- zap_low_mappings();
+ zap_low_mappings(false);
low_mappings = 0;
#else
err = do_boot_cpu(apicid, cpu);
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c
+++ linux-2.6/arch/x86/mm/init_32.c
@@ -576,7 +576,7 @@ static inline void save_pg_dir(void)
}
#endif /* !CONFIG_ACPI_SLEEP */

-void zap_low_mappings(void)
+void zap_low_mappings(bool early)
{
int i;

@@ -593,7 +593,11 @@ void zap_low_mappings(void)
set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
}
- flush_tlb_all();
+
+ if (early)
+ __flush_tlb();
+ else
+ flush_tlb_all();
}

pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
@@ -968,7 +972,7 @@ void __init mem_init(void)
test_wp_bit();

save_pg_dir();
- zap_low_mappings();
+ zap_low_mappings(true);
}

#ifdef CONFIG_MEMORY_HOTPLUG

2009-06-12 07:18:34

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31


Plus I quickly got this crash too:

[ 0.000000] console [tty0] enabled
[ 0.000000] allocation of page_cgroup was failed.
[ 0.000000] please try cgroup_disable=memory boot option
[ 0.000000] Kernel panic - not syncing: Out of memory
[ 0.000000] Rebooting in 1 seconds..<1>BUG: unable to handle kernel NULL pointer dereference at 0000004c
[ 0.000000] IP: [<c16ec708>] klist_next+0x10/0x8f
[ 0.000000] *pdpt = 0000000001a8a001 *pde = 0000000000000000
[ 0.000000] Thread overran stack, or stack corrupted

crash-log and config attached.

Ingo

[ 0.000000] Linux version 2.6.30-tip-03093-gff58544-dirty (mingo@sirius) (gcc version 4.3.2 20081105 (Red Hat 4.3.2-7) (GCC) ) #52516 SMP PREEMPT Fri Jun 12 00:24:13 CEST 2009
[ 0.000000] KERNEL supported cpus:
[ 0.000000] Intel GenuineIntel
[ 0.000000] AMD AuthenticAMD
[ 0.000000] NSC Geode by NSC
[ 0.000000] Cyrix CyrixInstead
[ 0.000000] Centaur CentaurHauls
[ 0.000000] Transmeta GenuineTMx86
[ 0.000000] Transmeta TransmetaCPU
[ 0.000000] UMC UMC UMC UMC
[ 0.000000] BIOS-provided physical RAM map:
[ 0.000000] BIOS-e820: 0000000000000000 - 000000000009f800 (usable)
[ 0.000000] BIOS-e820: 000000000009f800 - 00000000000a0000 (reserved)
[ 0.000000] BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved)
[ 0.000000] BIOS-e820: 0000000000100000 - 000000003fff0000 (usable)
[ 0.000000] BIOS-e820: 000000003fff0000 - 000000003fff3000 (ACPI NVS)
[ 0.000000] BIOS-e820: 000000003fff3000 - 0000000040000000 (ACPI data)
[ 0.000000] BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved)
[ 0.000000] BIOS-e820: 00000000fec00000 - 0000000100000000 (reserved)
[ 0.000000] console [earlyser0] enabled
[ 0.000000] debug: ignoring loglevel setting.
[ 0.000000] using polling idle threads.
[ 0.000000] DMI 2.3 present.
[ 0.000000] Phoenix BIOS detected: BIOS may corrupt low RAM, working around it.
[ 0.000000] e820 update range: 0000000000000000 - 0000000000010000 (usable) ==> (reserved)
[ 0.000000] last_pfn = 0x3fff0 max_arch_pfn = 0x1000000
[ 0.000000] only 133MB highmem pages available, ignoring highmem size of 512MB!
[ 0.000000] Scanning 0 areas for low memory corruption
[ 0.000000] modified physical RAM map:
[ 0.000000] modified: 0000000000000000 - 0000000000010000 (reserved)
[ 0.000000] modified: 0000000000010000 - 000000000009f800 (usable)
[ 0.000000] modified: 000000000009f800 - 00000000000a0000 (reserved)
[ 0.000000] modified: 00000000000f0000 - 0000000000100000 (reserved)
[ 0.000000] modified: 0000000000100000 - 000000003fff0000 (usable)
[ 0.000000] modified: 000000003fff0000 - 000000003fff3000 (ACPI NVS)
[ 0.000000] modified: 000000003fff3000 - 0000000040000000 (ACPI data)
[ 0.000000] modified: 00000000e0000000 - 00000000f0000000 (reserved)
[ 0.000000] modified: 00000000fec00000 - 0000000100000000 (reserved)
[ 0.000000] initial memory mapped : 0 - 02000000
[ 0.000000] init_memory_mapping: 0000000000000000-00000000379fe000
[ 0.000000] NX (Execute Disable) protection: active
[ 0.000000] 0000000000 - 0000200000 page 4k
[ 0.000000] 0000200000 - 0037800000 page 2M
[ 0.000000] 0037800000 - 00379fe000 page 4k
[ 0.000000] kernel direct mapping tables up to 379fe000 @ 10000-16000
[ 0.000000] Scan SMP from c0000000 for 1024 bytes.
[ 0.000000] Scan SMP from c009fc00 for 1024 bytes.
[ 0.000000] Scan SMP from c00f0000 for 65536 bytes.
[ 0.000000] found SMP MP-table at [c00f5680] f5680
[ 0.000000] Intel MultiProcessor Specification v1.4
[ 0.000000] Virtual Wire compatibility mode.
[ 0.000000] mpc: f1400-f152c
[ 0.000000] MPTABLE: OEM ID: OEM00000
[ 0.000000] MPTABLE: Product ID: PROD00000000
[ 0.000000] MPTABLE: APIC at: 0xFEE00000
[ 0.000000] Warning! Not a NUMA-Q system!
[ 0.000000] NUMA - single node, flat memory mode
[ 0.000000] Node: 0, start_pfn: 0, end_pfn: 3fff0
[ 0.000000] Setting physnode_map array to node 0 for pfns:
[ 0.000000] 0 4000 8000 c000 10000 14000 18000 1c000 20000 24000 28000 2c000 30000 34000 38000 3c000
[ 0.000000] node 0 pfn: [0 - 3fff0]
[ 0.000000] Reserving 2560 pages of KVA for lmem_map of node 0 at 3f400
[ 0.000000] remove_active_range (0, 259072, 261632)
[ 0.000000] Reserving total of a00 pages for numa KVA remap
[ 0.000000] kva_start_pfn ~ 36e00 max_low_pfn ~ 379fe
[ 0.000000] max_pfn = 3fff0
[ 0.000000] 133MB HIGHMEM available.
[ 0.000000] 889MB LOWMEM available.
[ 0.000000] max_low_pfn = 379fe, highstart_pfn = 379fe
[ 0.000000] Low memory ends at vaddr f79fe000
[ 0.000000] node 0 will remap to vaddr f6e00000 - f7800000
[ 0.000000] allocate_pgdat: node 0 NODE_DATA f6e00000
[ 0.000000] remap_numa_kva: node 0
[ 0.000000] remap_numa_kva: f6e00000 to pfn 0003f400
[ 0.000000] remap_numa_kva: f7000000 to pfn 0003f600
[ 0.000000] remap_numa_kva: f7200000 to pfn 0003f800
[ 0.000000] remap_numa_kva: f7400000 to pfn 0003fa00
[ 0.000000] remap_numa_kva: f7600000 to pfn 0003fc00
[ 0.000000] High memory starts at vaddr f79fe000
[ 0.000000] mapped low ram: 0 - 379fe000
[ 0.000000] low ram: 0 - 379fe000
[ 0.000000] node 0 low ram: 00000000 - 379fe000
[ 0.000000] node 0 bootmap 00011000 - 00017f40
[ 0.000000] (10 early reservations) ==> bootmem [0000000000 - 00379fe000]
[ 0.000000] #0 [0000000000 - 0000001000] BIOS data page ==> [0000000000 - 0000001000]
[ 0.000000] #1 [0000001000 - 0000002000] EX TRAMPOLINE ==> [0000001000 - 0000002000]
[ 0.000000] #2 [0000006000 - 0000007000] TRAMPOLINE ==> [0000006000 - 0000007000]
[ 0.000000] #3 [0001000000 - 0001c2f5a0] TEXT DATA BSS ==> [0001000000 - 0001c2f5a0]
[ 0.000000] #4 [000009f800 - 0000100000] BIOS reserved ==> [000009f800 - 0000100000]
[ 0.000000] #5 [0001c30000 - 0001c40149] BRK ==> [0001c30000 - 0001c40149]
[ 0.000000] #6 [0000010000 - 0000011000] PGTABLE ==> [0000010000 - 0000011000]
[ 0.000000] #7 [003f400000 - 003fe00000] KVA RAM
[ 0.000000] #8 [0036e00000 - 0037800000] KVA PG ==> [0036e00000 - 0037800000]
[ 0.000000] #9 [0000011000 - 0000018000] BOOTMAP ==> [0000011000 - 0000018000]
[ 0.000000] Scan SMP from c0000000 for 1024 bytes.
[ 0.000000] Scan SMP from c009fc00 for 1024 bytes.
[ 0.000000] Scan SMP from c00f0000 for 65536 bytes.
[ 0.000000] found SMP MP-table at [c00f5680] f5680
[ 0.000000] mpc: f1400-f152c
[ 0.000000] Zone PFN ranges:
[ 0.000000] DMA 0x00000010 -> 0x00001000
[ 0.000000] Normal 0x00001000 -> 0x000379fe
[ 0.000000] HighMem 0x000379fe -> 0x0003fff0
[ 0.000000] Movable zone start PFN for each node
[ 0.000000] early_node_map[3] active PFN ranges
[ 0.000000] 0: 0x00000010 -> 0x0000009f
[ 0.000000] 0: 0x00000100 -> 0x0003f400
[ 0.000000] 0: 0x0003fe00 -> 0x0003fff0
[ 0.000000] On node 0 totalpages: 259455
[ 0.000000] free_area_init_node: node 0, pgdat f6e00000, node_mem_map f6e02200
[ 0.000000] DMA zone: 32 pages used for memmap
[ 0.000000] DMA zone: 0 pages reserved
[ 0.000000] DMA zone: 3951 pages, LIFO batch:0
[ 0.000000] Normal zone: 1748 pages used for memmap
[ 0.000000] Normal zone: 221994 pages, LIFO batch:31
[ 0.000000] HighMem zone: 268 pages used for memmap
[ 0.000000] HighMem zone: 31462 pages, LIFO batch:7
[ 0.000000] Using APIC driver default
[ 0.000000] Intel MultiProcessor Specification v1.4
[ 0.000000] Virtual Wire compatibility mode.
[ 0.000000] mpc: f1400-f152c
[ 0.000000] MPTABLE: OEM ID: OEM00000
[ 0.000000] MPTABLE: Product ID: PROD00000000
[ 0.000000] MPTABLE: APIC at: 0xFEE00000
[ 0.000000] Warning! Not a NUMA-Q system!
[ 0.000000] Processor #0 (Bootup-CPU)
[ 0.000000] Processor #1
[ 0.000000] Bus #0 is PCI
[ 0.000000] Bus #1 is PCI
[ 0.000000] Bus #2 is PCI
[ 0.000000] Bus #3 is PCI
[ 0.000000] Bus #4 is PCI
[ 0.000000] Bus #5 is PCI
[ 0.000000] Bus #6 is ISA
[ 0.000000] I/O APIC #2 Version 17 at 0xFEC00000.
[ 0.000000] Int: type 0, pol 3, trig 3, bus 00, IRQ 28, APIC ID 2, APIC INT 0b
[ 0.000000] Int: type 0, pol 3, trig 3, bus 00, IRQ 10, APIC ID 2, APIC INT 03
[ 0.000000] Int: type 0, pol 3, trig 3, bus 01, IRQ 00, APIC ID 2, APIC INT 05
[ 0.000000] Int: type 0, pol 3, trig 3, bus 05, IRQ 1c, APIC ID 2, APIC INT 0b
[ 0.000000] Int: type 3, pol 0, trig 0, bus 06, IRQ 00, APIC ID 2, APIC INT 00
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 01, APIC ID 2, APIC INT 01
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 00, APIC ID 2, APIC INT 02
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 04, APIC ID 2, APIC INT 04
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 06, APIC ID 2, APIC INT 06
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 07, APIC ID 2, APIC INT 07
[ 0.000000] Int: type 0, pol 1, trig 1, bus 06, IRQ 08, APIC ID 2, APIC INT 08
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 09, APIC ID 2, APIC INT 09
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 0a, APIC ID 2, APIC INT 0a
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 0c, APIC ID 2, APIC INT 0c
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 0d, APIC ID 2, APIC INT 0d
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 0e, APIC ID 2, APIC INT 0e
[ 0.000000] Int: type 0, pol 0, trig 0, bus 06, IRQ 0f, APIC ID 2, APIC INT 0f
[ 0.000000] Lint: type 3, pol 0, trig 0, bus 00, IRQ 00, APIC ID ff, APIC LINT 00
[ 0.000000] Lint: type 1, pol 0, trig 0, bus 00, IRQ 00, APIC ID ff, APIC LINT 01
[ 0.000000] Enabling APIC mode: Flat. Using 1 I/O APICs
[ 0.000000] Processors: 2
[ 0.000000] SMP: Allowing 2 CPUs, 0 hotplug CPUs
[ 0.000000] mapped APIC to ffffb000 (fee00000)
[ 0.000000] mapped IOAPIC to ffffa000 (fec00000)
[ 0.000000] nr_irqs_gsi: 24
[ 0.000000] PM: Registered nosave memory: 000000000009f000 - 00000000000a0000
[ 0.000000] PM: Registered nosave memory: 00000000000a0000 - 00000000000f0000
[ 0.000000] PM: Registered nosave memory: 00000000000f0000 - 0000000000100000
[ 0.000000] Allocating PCI resources starting at 40000000 (gap: 40000000:a0000000)
[ 0.000000] NR_CPUS:32 nr_cpumask_bits:32 nr_cpu_ids:2 nr_node_ids:16
[ 0.000000] PERCPU: Embedded 16 pages at c1c46000, static data 42108 bytes
[ 0.000000] Built 1 zonelists in Zone order, mobility grouping on. Total pages: 257407
[ 0.000000] Policy zone: HighMem
[ 0.000000] Kernel command line: root=/dev/sda1 earlyprintk=serial,ttyS0,115200,keep console=tty debug initcall_debug enforcing=0 apic=verbose ignore_loglevel sysrq_always_enabled selinux=0 nmi_watchdog=0 3 panic=1 nosmp highres=0 nolapic_timer hpet=disable idle=poll highmem=512m notsc pci=nomsi
[ 0.000000] debug: sysrq always enabled.
[ 0.000000] PID hash table entries: 4096 (order: 12, 16384 bytes)
[ 0.000000] Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
[ 0.000000] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
[ 0.000000] Enabling fast FPU save and restore... done.
[ 0.000000] Enabling unmasked SIMD FPU exception support... done.
[ 0.000000] Initializing CPU#0
[ 0.000000] Initializing HighMem for node 0 (000379fe:0003fff0)
[ 0.000000] Memory: 1014088k/1048512k available (7319k kernel code, 23732k reserved, 3017k data, 412k init, 126920k highmem)
[ 0.000000] virtual kernel memory layout:
[ 0.000000] fixmap : 0xffe16000 - 0xfffff000 (1956 kB)
[ 0.000000] pkmap : 0xffa00000 - 0xffc00000 (2048 kB)
[ 0.000000] vmalloc : 0xf81fe000 - 0xff9fe000 ( 120 MB)
[ 0.000000] lowmem : 0xc0000000 - 0xf79fe000 ( 889 MB)
[ 0.000000] .init : 0xc1a21000 - 0xc1a88000 ( 412 kB)
[ 0.000000] .data : 0xc1725d40 - 0xc1a181fc (3017 kB)
[ 0.000000] .text : 0xc1000000 - 0xc1725d40 (7319 kB)
[ 0.000000] Checking if this processor honours the WP bit even in supervisor mode...Ok.
[ 0.000000] SLUB: Genslabs=14, HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=16
[ 0.000000] start_kernel(): bug: interrupts were enabled *very* early, fixing it
[ 0.000000] Experimental hierarchical RCU implementation.
[ 0.000000] RCU-based detection of stalled CPUs is enabled.
[ 0.000000] Experimental hierarchical RCU init done.
[ 0.000000] NR_IRQS:1280
[ 0.000000] CPU 0 irqstacks, hard=c1c46000 soft=c1c47000
[ 0.000000] spurious 8259A interrupt: IRQ7.
[ 0.000000] Console: colour VGA+ 80x25
[ 0.000000] console [tty0] enabled
[ 0.000000] allocation of page_cgroup was failed.
[ 0.000000] please try cgroup_disable=memory boot option
[ 0.000000] Kernel panic - not syncing: Out of memory
[ 0.000000] Rebooting in 1 seconds..<1>BUG: unable to handle kernel NULL pointer dereference at 0000004c
[ 0.000000] IP: [<c16ec708>] klist_next+0x10/0x8f
[ 0.000000] *pdpt = 0000000001a8a001 *pde = 0000000000000000
[ 0.000000] Thread overran stack, or stack corrupted
[ 0.000000] Oops: 0000 [#1] PREEMPT SMP
[ 0.000000] last sysfs file:
[ 0.000000] Modules linked in:
[ 0.000000]
[ 0.000000] Pid: 0, comm: swapper Not tainted (2.6.30-tip-03093-gff58544-dirty #52516) System Product Name
[ 0.000000] EIP: 0060:[<c16ec708>] EFLAGS: 00010082 CPU: 0
[ 0.000000] EIP is at klist_next+0x10/0x8f
[ 0.000000] EAX: 0000003c EBX: c1a19f1c ECX: 00000000 EDX: c1a19f1c
[ 0.000000] ESI: c1a19f1c EDI: 00000000 EBP: c1a19f14 ESP: c1a19f04
[ 0.000000] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
[ 0.000000] Process swapper (pid: 0, ti=c1a19000 task=c1976320 task.ti=c1a19000)
[ 0.000000] Stack:
[ 0.000000] 00000046 c1a19f1c c1a19f1c 00000000 c1a19f30 c135c631 0000003c 00000000
[ 0.000000] c1728f38 00001078 00000100 c1a19f40 c12f7eed c12f7034 c1728f38 c1a19f58
[ 0.000000] c12fcf02 ffffffff c1728f38 000003e8 c19ea000 c1a19f68 c12fcf5b ffffffff
[ 0.000000] Call Trace:
[ 0.000000] [<c135c631>] ? bus_find_device+0x55/0x75
[ 0.000000] [<c12f7eed>] ? no_pci_devices+0x1c/0x33
[ 0.000000] [<c12f7034>] ? find_anything+0x0/0xf
[ 0.000000] [<c12fcf02>] ? pci_get_subsys+0x1a/0x60
[ 0.000000] [<c12fcf5b>] ? pci_get_device+0x13/0x15
[ 0.000000] [<c10156d4>] ? mach_reboot_fixups+0x2c/0x41
[ 0.000000] [<c1010ca0>] ? native_machine_emergency_restart+0x6a/0x11a
[ 0.000000] [<c10109d4>] ? machine_emergency_restart+0x18/0x1a
[ 0.000000] [<c103bd73>] ? emergency_restart+0xd/0xf
[ 0.000000] [<c17202a7>] ? panic+0xc1/0xe3
[ 0.000000] [<c1a366fb>] ? page_cgroup_init+0x187/0x1ac
[ 0.000000] [<c1a217d1>] ? start_kernel+0x2d2/0x373
[ 0.000000] [<c1a2106a>] ? __init_begin+0x6a/0x6f


Attachments:
(No filename) (14.54 kB)
config (66.60 kB)
Download all attachments

2009-06-12 07:23:50

by Li Zefan

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31

Ingo Molnar wrote:
> Plus i quickly got this crash too:
>
> [ 0.000000] console [tty0] enabled
> [ 0.000000] allocation of page_cgroup was failed.
> [ 0.000000] please try cgroup_disable=memory boot option
> [ 0.000000] Kernel panic - not syncing: Out of memory
> [ 0.000000] Rebooting in 1 seconds..<1>BUG: unable to handle kernel NULL pointer dereference at 0000004c
> [ 0.000000] IP: [<c16ec708>] klist_next+0x10/0x8f
> [ 0.000000] *pdpt = 0000000001a8a001 *pde = 0000000000000000
> [ 0.000000] Thread overran stack, or stack corrupted
>
> crash-log and config attached.
>

reported and fixed:

http://marc.info/?l=linux-kernel&m=124478832620049&w=2

Pekka will push the fix to Linus.

2009-06-12 07:29:56

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31

Hi Ingo,

On Fri, 2009-06-12 at 09:17 +0200, Ingo Molnar wrote:
> Plus i quickly got this crash too:
>
> [ 0.000000] console [tty0] enabled
> [ 0.000000] allocation of page_cgroup was failed.
> [ 0.000000] please try cgroup_disable=memory boot option
> [ 0.000000] Kernel panic - not syncing: Out of memory
> [ 0.000000] Rebooting in 1 seconds..<1>BUG: unable to handle kernel NULL pointer dereference at 0000004c
> [ 0.000000] IP: [<c16ec708>] klist_next+0x10/0x8f
> [ 0.000000] *pdpt = 0000000001a8a001 *pde = 0000000000000000
> [ 0.000000] Thread overran stack, or stack corrupted

Hmm, does this patch fix it? Hiroyuki, Li, does the oops look familiar to you?

Pekka

From: KAMEZAWA Hiroyuki <[email protected]>

SLAB is now configured at a very early stage of boot, so it can be used
in init routines.

But replacing alloc_bootmem() in the FLATMEM/DISCONTIGMEM page_cgroup
initialization now breaks the allocation.
(It works well in the SPARSEMEM case: it supports MEMORY_HOTPLUG and
the size of page_cgroup is reasonable (< 1 << MAX_ORDER).)

This patch revives FLATMEM + memory cgroup by using alloc_bootmem.

In the future, we should either stop supporting FLATMEM (if it has no
users) or rewrite the flatmem code completely. But this will add more
messy code and overhead.

Changelog: v1->v2
- fixed typos.

Acked-by: Pekka Enberg <[email protected]>
Tested-by: Li Zefan <[email protected]>
Reported-by: Li Zefan <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/page_cgroup.h | 18 +++++++++++++++++-
init/main.c | 5 +++++
mm/page_cgroup.c | 29 ++++++++++-------------------
3 files changed, 32 insertions(+), 20 deletions(-)

Index: linux-2.6.30.org/init/main.c
===================================================================
--- linux-2.6.30.org.orig/init/main.c 2009-06-11 19:02:53.000000000 +0900
+++ linux-2.6.30.org/init/main.c 2009-06-11 20:49:21.000000000 +0900
@@ -539,6 +539,11 @@
*/
static void __init mm_init(void)
{
+ /*
+ * page_cgroup requires countinous pages as memmap
+ * and it's bigger than MAX_ORDER unless SPARSEMEM.
+ */
+ page_cgroup_init_flatmem();
mem_init();
kmem_cache_init();
vmalloc_init();
Index: linux-2.6.30.org/mm/page_cgroup.c
===================================================================
--- linux-2.6.30.org.orig/mm/page_cgroup.c 2009-06-11 19:02:53.000000000 +0900
+++ linux-2.6.30.org/mm/page_cgroup.c 2009-06-11 20:49:59.000000000 +0900
@@ -47,8 +47,6 @@
struct page_cgroup *base, *pc;
unsigned long table_size;
unsigned long start_pfn, nr_pages, index;
- struct page *page;
- unsigned int order;

start_pfn = NODE_DATA(nid)->node_start_pfn;
nr_pages = NODE_DATA(nid)->node_spanned_pages;
@@ -57,13 +55,11 @@
return 0;

table_size = sizeof(struct page_cgroup) * nr_pages;
- order = get_order(table_size);
- page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order);
- if (!page)
- page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order);
- if (!page)
+
+ base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (!base)
return -ENOMEM;
- base = page_address(page);
for (index = 0; index < nr_pages; index++) {
pc = base + index;
__init_page_cgroup(pc, start_pfn + index);
@@ -73,7 +69,7 @@
return 0;
}

-void __init page_cgroup_init(void)
+void __init page_cgroup_init_flatmem(void)
{

int nid, fail;
@@ -117,16 +113,11 @@
if (!section->page_cgroup) {
nid = page_to_nid(pfn_to_page(pfn));
table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
- if (slab_is_available()) {
- base = kmalloc_node(table_size,
- GFP_KERNEL | __GFP_NOWARN, nid);
- if (!base)
- base = vmalloc_node(table_size, nid);
- } else {
- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
- }
+ VM_BUG_ON(!slab_is_available());
+ base = kmalloc_node(table_size,
+ GFP_KERNEL | __GFP_NOWARN, nid);
+ if (!base)
+ base = vmalloc_node(table_size, nid);
} else {
/*
* We don't have to allocate page_cgroup again, but
Index: linux-2.6.30.org/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.30.org.orig/include/linux/page_cgroup.h 2009-06-10 12:05:27.000000000 +0900
+++ linux-2.6.30.org/include/linux/page_cgroup.h 2009-06-11 20:50:32.000000000 +0900
@@ -18,7 +18,19 @@
};

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
-void __init page_cgroup_init(void);
+
+#ifdef CONFIG_SPARSEMEM
+static inline void __init page_cgroup_init_flatmem(void)
+{
+}
+extern void __init page_cgroup_init(void);
+#else
+void __init page_cgroup_init_flatmem(void);
+static inline void __init page_cgroup_init(void)
+{
+}
+#endif
+
struct page_cgroup *lookup_page_cgroup(struct page *page);

enum {
@@ -87,6 +99,10 @@
{
}

+static inline void __init page_cgroup_init_flatmem(void)
+{
+}
+
#endif

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP


2009-06-12 07:31:35

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31

On Fri, 12 Jun 2009 10:29:48 +0300
Pekka Enberg <[email protected]> wrote:

> Hi Ingo,
>
> On Fri, 2009-06-12 at 09:17 +0200, Ingo Molnar wrote:
> > Plus i quickly got this crash too:
> >
> > [ 0.000000] console [tty0] enabled
> > [ 0.000000] allocation of page_cgroup was failed.
> > [ 0.000000] please try cgroup_disable=memory boot option
> > [ 0.000000] Kernel panic - not syncing: Out of memory
> > [ 0.000000] Rebooting in 1 seconds..<1>BUG: unable to handle kernel NULL pointer dereference at 0000004c
> > [ 0.000000] IP: [<c16ec708>] klist_next+0x10/0x8f
> > [ 0.000000] *pdpt = 0000000001a8a001 *pde = 0000000000000000
> > [ 0.000000] Thread overran stack, or stack corrupted
>
> Hmm, does this patch fix it? Hiroyuki, Li, does the oops look familiar to you?
>
> Pekka
>
Ah, yes. maybe. This line
"please try cgroup_disable=memory boot option"
implies that.

-Kame

> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Now, SLAB is configured in very early stage and it can be used in
> init routine now.
>
> But replacing alloc_bootmem() in FLAT/DISCONTIGMEM's page_cgroup()
> initialization breaks the allocation, now.
> (Works well in SPARSEMEM case...it supports MEMORY_HOTPLUG and
> size of page_cgroup is in reasonable size (< 1 << MAX_ORDER.)
>
> This patch revive FLATMEM+memory cgroup by using alloc_bootmem.
>
> In future,
> We stop to support FLATMEM (if no users) or rewrite codes for flatmem
> completely.But this will adds more messy codes and overheads.
>
> Changelog: v1->v2
> - fixed typos.
>
> Acked-by: Pekka Enberg <[email protected]>
> Tested-by: Li Zefan <[email protected]>
> Reported-by: Li Zefan <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> include/linux/page_cgroup.h | 18 +++++++++++++++++-
> init/main.c | 5 +++++
> mm/page_cgroup.c | 29 ++++++++++-------------------
> 3 files changed, 32 insertions(+), 20 deletions(-)
>
> Index: linux-2.6.30.org/init/main.c
> ===================================================================
> --- linux-2.6.30.org.orig/init/main.c 2009-06-11 19:02:53.000000000 +0900
> +++ linux-2.6.30.org/init/main.c 2009-06-11 20:49:21.000000000 +0900
> @@ -539,6 +539,11 @@
> */
> static void __init mm_init(void)
> {
> + /*
> + * page_cgroup requires countinous pages as memmap
> + * and it's bigger than MAX_ORDER unless SPARSEMEM.
> + */
> + page_cgroup_init_flatmem();
> mem_init();
> kmem_cache_init();
> vmalloc_init();
> Index: linux-2.6.30.org/mm/page_cgroup.c
> ===================================================================
> --- linux-2.6.30.org.orig/mm/page_cgroup.c 2009-06-11 19:02:53.000000000 +0900
> +++ linux-2.6.30.org/mm/page_cgroup.c 2009-06-11 20:49:59.000000000 +0900
> @@ -47,8 +47,6 @@
> struct page_cgroup *base, *pc;
> unsigned long table_size;
> unsigned long start_pfn, nr_pages, index;
> - struct page *page;
> - unsigned int order;
>
> start_pfn = NODE_DATA(nid)->node_start_pfn;
> nr_pages = NODE_DATA(nid)->node_spanned_pages;
> @@ -57,13 +55,11 @@
> return 0;
>
> table_size = sizeof(struct page_cgroup) * nr_pages;
> - order = get_order(table_size);
> - page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order);
> - if (!page)
> - page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order);
> - if (!page)
> +
> + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> + if (!base)
> return -ENOMEM;
> - base = page_address(page);
> for (index = 0; index < nr_pages; index++) {
> pc = base + index;
> __init_page_cgroup(pc, start_pfn + index);
> @@ -73,7 +69,7 @@
> return 0;
> }
>
> -void __init page_cgroup_init(void)
> +void __init page_cgroup_init_flatmem(void)
> {
>
> int nid, fail;
> @@ -117,16 +113,11 @@
> if (!section->page_cgroup) {
> nid = page_to_nid(pfn_to_page(pfn));
> table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> - if (slab_is_available()) {
> - base = kmalloc_node(table_size,
> - GFP_KERNEL | __GFP_NOWARN, nid);
> - if (!base)
> - base = vmalloc_node(table_size, nid);
> - } else {
> - base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> - table_size,
> - PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> - }
> + VM_BUG_ON(!slab_is_available());
> + base = kmalloc_node(table_size,
> + GFP_KERNEL | __GFP_NOWARN, nid);
> + if (!base)
> + base = vmalloc_node(table_size, nid);
> } else {
> /*
> * We don't have to allocate page_cgroup again, but
> Index: linux-2.6.30.org/include/linux/page_cgroup.h
> ===================================================================
> --- linux-2.6.30.org.orig/include/linux/page_cgroup.h 2009-06-10 12:05:27.000000000 +0900
> +++ linux-2.6.30.org/include/linux/page_cgroup.h 2009-06-11 20:50:32.000000000 +0900
> @@ -18,7 +18,19 @@
> };
>
> void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
> -void __init page_cgroup_init(void);
> +
> +#ifdef CONFIG_SPARSEMEM
> +static inline void __init page_cgroup_init_flatmem(void)
> +{
> +}
> +extern void __init page_cgroup_init(void);
> +#else
> +void __init page_cgroup_init_flatmem(void);
> +static inline void __init page_cgroup_init(void)
> +{
> +}
> +#endif
> +
> struct page_cgroup *lookup_page_cgroup(struct page *page);
>
> enum {
> @@ -87,6 +99,10 @@
> {
> }
>
> +static inline void __init page_cgroup_init_flatmem(void)
> +{
> +}
> +
> #endif
>
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
>
>
>
>

2009-06-12 07:32:15

by Li Zefan

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31

Pekka Enberg wrote:
> Hi Ingo,
>
> On Fri, 2009-06-12 at 09:17 +0200, Ingo Molnar wrote:
>> Plus i quickly got this crash too:
>>
>> [ 0.000000] console [tty0] enabled
>> [ 0.000000] allocation of page_cgroup was failed.
>> [ 0.000000] please try cgroup_disable=memory boot option
>> [ 0.000000] Kernel panic - not syncing: Out of memory
>> [ 0.000000] Rebooting in 1 seconds..<1>BUG: unable to handle kernel NULL pointer dereference at 0000004c
>> [ 0.000000] IP: [<c16ec708>] klist_next+0x10/0x8f
>> [ 0.000000] *pdpt = 0000000001a8a001 *pde = 0000000000000000
>> [ 0.000000] Thread overran stack, or stack corrupted
>
> Hmm, does this patch fix it? Hiroyuki, Li, does the oops look familiar to you?
>

should be the same bug.

2009-06-12 07:52:10

by Ingo Molnar

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31


* Pekka Enberg <[email protected]> wrote:

> Hi Ingo,
>
> On Fri, 2009-06-12 at 09:17 +0200, Ingo Molnar wrote:
> > Plus i quickly got this crash too:
> >
> > [ 0.000000] console [tty0] enabled
> > [ 0.000000] allocation of page_cgroup was failed.
> > [ 0.000000] please try cgroup_disable=memory boot option
> > [ 0.000000] Kernel panic - not syncing: Out of memory
> > [ 0.000000] Rebooting in 1 seconds..<1>BUG: unable to handle kernel NULL pointer dereference at 0000004c
> > [ 0.000000] IP: [<c16ec708>] klist_next+0x10/0x8f
> > [ 0.000000] *pdpt = 0000000001a8a001 *pde = 0000000000000000
> > [ 0.000000] Thread overran stack, or stack corrupted
>
> Hmm, does this patch fix it? [...]

Thanks, this fixed it.

Ingo

2009-06-12 08:33:43

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31

On Thu, 2009-06-11 at 15:41 -0700, Yinghai Lu wrote:
> [PATCH] irq: slab alloc for default irq_affinity
>
> Ingo had
>
> [ 0.000000] ------------[ cut here ]------------
> [ 0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x2b/0x71()
> [ 0.000000] Hardware name: System Product Name
> [ 0.000000] Modules linked in:
> [ 0.000000] Pid: 0, comm: swapper Tainted: G W 2.6.30-tip-03087-g0bb2618-dirty #52506
> [ 0.000000] Call Trace:
> [ 0.000000] [<81032588>] warn_slowpath_common+0x60/0x90
> [ 0.000000] [<810325c5>] warn_slowpath_null+0xd/0x10
> [ 0.000000] [<819d1bc0>] alloc_arch_preferred_bootmem+0x2b/0x71
> [ 0.000000] [<819d1c31>] ___alloc_bootmem_nopanic+0x2b/0x9a
> [ 0.000000] [<81050a0a>] ? lock_release+0xac/0xb2
> [ 0.000000] [<819d1d4c>] ___alloc_bootmem+0xe/0x2d
> [ 0.000000] [<819d1e9f>] __alloc_bootmem+0xa/0xc
> [ 0.000000] [<819d7c63>] alloc_bootmem_cpumask_var+0x21/0x26
> [ 0.000000] [<819d0cc8>] early_irq_init+0x15/0x10d
> [ 0.000000] [<819bb75a>] start_kernel+0x167/0x326
> [ 0.000000] [<819bb06b>] __init_begin+0x6b/0x70
> [ 0.000000] ---[ end trace 4eaa2a86a8e2da23 ]---
> [ 0.000000] NR_IRQS:2304 nr_irqs:424
> [ 0.000000] CPU 0 irqstacks, hard=821e6000 soft=821e7000
>
> we need to update init_irq_default_affinity
>
> Signed-off-by: Yinghai Lu <[email protected]>
>
> ---
> kernel/irq/handle.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> Index: linux-2.6/kernel/irq/handle.c
> ===================================================================
> --- linux-2.6.orig/kernel/irq/handle.c
> +++ linux-2.6/kernel/irq/handle.c
> @@ -45,7 +45,7 @@ void handle_bad_irq(unsigned int irq, st
> #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
> static void __init init_irq_default_affinity(void)
> {
> - alloc_bootmem_cpumask_var(&irq_default_affinity);
> + alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
> cpumask_setall(irq_default_affinity);
> }
> #else

Applied, thanks!

Pekka

2009-06-12 08:37:37

by Pekka Enberg

[permalink] [raw]
Subject: Re: [GIT PULL v3] Early boot SLAB for 2.6.31

On Thu, 2009-06-11 at 16:14 -0700, Yinghai Lu wrote:
> please check
>
> [PATCH] x86: make zap_low_mapping could be used early
>
> only one cpu is there, just call __flush_tlb for it
>
> Signed-off-by: Yinghai Lu <[email protected]>
>
> ---
> arch/x86/include/asm/tlbflush.h | 2 +-
> arch/x86/kernel/smpboot.c | 2 +-
> arch/x86/mm/init_32.c | 10 +++++++---
> 3 files changed, 9 insertions(+), 5 deletions(-)
>
> Index: linux-2.6/arch/x86/include/asm/tlbflush.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/include/asm/tlbflush.h
> +++ linux-2.6/arch/x86/include/asm/tlbflush.h
> @@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_rang
> flush_tlb_all();
> }
>
> -extern void zap_low_mappings(void);
> +extern void zap_low_mappings(bool early);
>
> #endif /* _ASM_X86_TLBFLUSH_H */
> Index: linux-2.6/arch/x86/kernel/smpboot.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/smpboot.c
> +++ linux-2.6/arch/x86/kernel/smpboot.c
> @@ -875,7 +875,7 @@ int __cpuinit native_cpu_up(unsigned int
>
> err = do_boot_cpu(apicid, cpu);
>
> - zap_low_mappings();
> + zap_low_mappings(false);
> low_mappings = 0;
> #else
> err = do_boot_cpu(apicid, cpu);
> Index: linux-2.6/arch/x86/mm/init_32.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/mm/init_32.c
> +++ linux-2.6/arch/x86/mm/init_32.c
> @@ -576,7 +576,7 @@ static inline void save_pg_dir(void)
> }
> #endif /* !CONFIG_ACPI_SLEEP */
>
> -void zap_low_mappings(void)
> +void zap_low_mappings(bool early)
> {
> int i;
>
> @@ -593,7 +593,11 @@ void zap_low_mappings(void)
> set_pgd(swapper_pg_dir+i, __pgd(0));
> #endif
> }
> - flush_tlb_all();
> +
> + if (early)
> + __flush_tlb();
> + else
> + flush_tlb_all();
> }
>
> pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
> @@ -968,7 +972,7 @@ void __init mem_init(void)
> test_wp_bit();
>
> save_pg_dir();
> - zap_low_mappings();
> + zap_low_mappings(true);
> }
>
> #ifdef CONFIG_MEMORY_HOTPLUG

Applied, thanks!

Pekka