Hi Linus,
Here are some fixes to early boot slab. I have not tested this on PowerPC but
hopefully it will boot now with the GFP_WAIT masking during boot in SLUB and
SLAB. We'll produce tons of warnings on PPC for sure but at least we don't just
crash and burn like we did before.
Pekka
The following changes since commit 8ebf975608aaebd7feb33d77f07ba21a6380e086:
Randy Dunlap (1):
block: fix kernel-doc in recent block/ changes
are available in the git repository at:
ssh://master.kernel.org/pub/scm/linux/kernel/git/penberg/slab-2.6 topic/slab/earlyboot
KAMEZAWA Hiroyuki (1):
memcg: fix page_cgroup fatal error in FLATMEM
Pekka Enberg (4):
init: Use GFP_NOWAIT for early slab allocations
slab: don't enable interrupts during early boot
slab: fix gfp flag in setup_cpu_cache()
slab: setup cpu caches later on when interrupts are enabled
Yinghai Lu (2):
irq: slab alloc for default irq_affinity
x86: make zap_low_mapping could be used early
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/kernel/smpboot.c | 2 +-
arch/x86/mm/init_32.c | 10 ++++++--
include/linux/page_cgroup.h | 18 ++++++++++++++-
include/linux/slab.h | 2 +
include/linux/slob_def.h | 5 ++++
include/linux/slub_def.h | 2 +
init/main.c | 6 +++++
kernel/irq/handle.c | 2 +-
kernel/params.c | 2 +-
kernel/profile.c | 6 ++--
mm/page_alloc.c | 2 +-
mm/page_cgroup.c | 29 ++++++++----------------
mm/slab.c | 45 +++++++++++++++++++++++++++++---------
mm/slub.c | 18 +++++++++++++++
mm/sparse-vmemmap.c | 2 +-
mm/sparse.c | 2 +-
17 files changed, 111 insertions(+), 44 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index a5ecc9c..7f3eba0 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_all();
}
-extern void zap_low_mappings(void);
+extern void zap_low_mappings(bool early);
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7c80007..2fecda6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -873,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
err = do_boot_cpu(apicid, cpu);
- zap_low_mappings();
+ zap_low_mappings(false);
low_mappings = 0;
#else
err = do_boot_cpu(apicid, cpu);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 949708d..9ff3c08 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -564,7 +564,7 @@ static inline void save_pg_dir(void)
}
#endif /* !CONFIG_ACPI_SLEEP */
-void zap_low_mappings(void)
+void zap_low_mappings(bool early)
{
int i;
@@ -581,7 +581,11 @@ void zap_low_mappings(void)
set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
}
- flush_tlb_all();
+
+ if (early)
+ __flush_tlb();
+ else
+ flush_tlb_all();
}
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
@@ -956,7 +960,7 @@ void __init mem_init(void)
test_wp_bit();
save_pg_dir();
- zap_low_mappings();
+ zap_low_mappings(true);
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 7339c7b..13f126c 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -18,7 +18,19 @@ struct page_cgroup {
};
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
-void __init page_cgroup_init(void);
+
+#ifdef CONFIG_SPARSEMEM
+static inline void __init page_cgroup_init_flatmem(void)
+{
+}
+extern void __init page_cgroup_init(void);
+#else
+void __init page_cgroup_init_flatmem(void);
+static inline void __init page_cgroup_init(void)
+{
+}
+#endif
+
struct page_cgroup *lookup_page_cgroup(struct page *page);
enum {
@@ -87,6 +99,10 @@ static inline void page_cgroup_init(void)
{
}
+static inline void __init page_cgroup_init_flatmem(void)
+{
+}
+
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 4880306..219b8fb 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -319,4 +319,6 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
return kmalloc_node(size, flags | __GFP_ZERO, node);
}
+void __init kmem_cache_init_late(void);
+
#endif /* _LINUX_SLAB_H */
diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h
index 0ec00b3..bb5368d 100644
--- a/include/linux/slob_def.h
+++ b/include/linux/slob_def.h
@@ -34,4 +34,9 @@ static __always_inline void *__kmalloc(size_t size, gfp_t flags)
return kmalloc(size, flags);
}
+static inline void kmem_cache_init_late(void)
+{
+ /* Nothing to do */
+}
+
#endif /* __LINUX_SLOB_DEF_H */
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index be5d40c..4dcbc2c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -302,4 +302,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
}
#endif
+void __init kmem_cache_init_late(void);
+
#endif /* _LINUX_SLUB_DEF_H */
diff --git a/init/main.c b/init/main.c
index 5616661..f6204f7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -539,6 +539,11 @@ void __init __weak thread_info_cache_init(void)
*/
static void __init mm_init(void)
{
+ /*
+ * page_cgroup requires countinous pages as memmap
+ * and it's bigger than MAX_ORDER unless SPARSEMEM.
+ */
+ page_cgroup_init_flatmem();
mem_init();
kmem_cache_init();
vmalloc_init();
@@ -635,6 +640,7 @@ asmlinkage void __init start_kernel(void)
"enabled early\n");
early_boot_irqs_on();
local_irq_enable();
+ kmem_cache_init_late();
/*
* HACK ALERT! This is early. We're enabling the console before
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 1045785..065205b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -45,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
static void __init init_irq_default_affinity(void)
{
- alloc_bootmem_cpumask_var(&irq_default_affinity);
+ alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
cpumask_setall(irq_default_affinity);
}
#else
diff --git a/kernel/params.c b/kernel/params.c
index de273ec..5c239c3 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -227,7 +227,7 @@ int param_set_charp(const char *val, struct kernel_param *kp)
* don't need to; this mangled commandline is preserved. */
if (slab_is_available()) {
kp->perm |= KPARAM_KMALLOCED;
- *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
+ *(char **)kp->arg = kstrdup(val, GFP_NOWAIT);
if (!kp->arg)
return -ENOMEM;
} else
diff --git a/kernel/profile.c b/kernel/profile.c
index 28cf26a..86ada09 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -112,16 +112,16 @@ int __ref profile_init(void)
prof_len = (_etext - _stext) >> prof_shift;
buffer_bytes = prof_len*sizeof(atomic_t);
- if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&prof_cpu_mask, GFP_NOWAIT))
return -ENOMEM;
cpumask_copy(prof_cpu_mask, cpu_possible_mask);
- prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
+ prof_buffer = kzalloc(buffer_bytes, GFP_NOWAIT);
if (prof_buffer)
return 0;
- prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
+ prof_buffer = alloc_pages_exact(buffer_bytes, GFP_NOWAIT|__GFP_ZERO);
if (prof_buffer)
return 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f53..7760ef9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2903,7 +2903,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
* To use this new node's memory, further consideration will be
* necessary.
*/
- zone->wait_table = vmalloc(alloc_size);
+ zone->wait_table = __vmalloc(alloc_size, GFP_NOWAIT, PAGE_KERNEL);
}
if (!zone->wait_table)
return -ENOMEM;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3dd4a90..11a8a10 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -47,8 +47,6 @@ static int __init alloc_node_page_cgroup(int nid)
struct page_cgroup *base, *pc;
unsigned long table_size;
unsigned long start_pfn, nr_pages, index;
- struct page *page;
- unsigned int order;
start_pfn = NODE_DATA(nid)->node_start_pfn;
nr_pages = NODE_DATA(nid)->node_spanned_pages;
@@ -57,13 +55,11 @@ static int __init alloc_node_page_cgroup(int nid)
return 0;
table_size = sizeof(struct page_cgroup) * nr_pages;
- order = get_order(table_size);
- page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order);
- if (!page)
- page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order);
- if (!page)
+
+ base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (!base)
return -ENOMEM;
- base = page_address(page);
for (index = 0; index < nr_pages; index++) {
pc = base + index;
__init_page_cgroup(pc, start_pfn + index);
@@ -73,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid)
return 0;
}
-void __init page_cgroup_init(void)
+void __init page_cgroup_init_flatmem(void)
{
int nid, fail;
@@ -117,16 +113,11 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
if (!section->page_cgroup) {
nid = page_to_nid(pfn_to_page(pfn));
table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
- if (slab_is_available()) {
- base = kmalloc_node(table_size,
- GFP_KERNEL | __GFP_NOWARN, nid);
- if (!base)
- base = vmalloc_node(table_size, nid);
- } else {
- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
- }
+ VM_BUG_ON(!slab_is_available());
+ base = kmalloc_node(table_size,
+ GFP_KERNEL | __GFP_NOWARN, nid);
+ if (!base)
+ base = vmalloc_node(table_size, nid);
} else {
/*
* We don't have to allocate page_cgroup again, but
diff --git a/mm/slab.c b/mm/slab.c
index f46b65d..84368b8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -304,6 +304,12 @@ struct kmem_list3 {
};
/*
+ * The slab allocator is initialized with interrupts disabled. Therefore, make
+ * sure early boot allocations don't accidentally enable interrupts.
+ */
+static gfp_t slab_gfp_mask __read_mostly = __GFP_BITS_MASK & ~__GFP_WAIT;
+
+/*
* Need this for bootstrapping a per node allocator.
*/
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
@@ -753,6 +759,7 @@ static enum {
NONE,
PARTIAL_AC,
PARTIAL_L3,
+ EARLY,
FULL
} g_cpucache_up;
@@ -761,7 +768,7 @@ static enum {
*/
int slab_is_available(void)
{
- return g_cpucache_up == FULL;
+ return g_cpucache_up >= EARLY;
}
static DEFINE_PER_CPU(struct delayed_work, reap_work);
@@ -1625,19 +1632,27 @@ void __init kmem_cache_init(void)
}
}
- /* 6) resize the head arrays to their final sizes */
- {
- struct kmem_cache *cachep;
- mutex_lock(&cache_chain_mutex);
- list_for_each_entry(cachep, &cache_chain, next)
- if (enable_cpucache(cachep, GFP_NOWAIT))
- BUG();
- mutex_unlock(&cache_chain_mutex);
- }
+ g_cpucache_up = EARLY;
/* Annotate slab for lockdep -- annotate the malloc caches */
init_lock_keys();
+}
+
+void __init kmem_cache_init_late(void)
+{
+ struct kmem_cache *cachep;
+
+ /*
+ * Interrupts are enabled now so all GFP allocations are safe.
+ */
+ slab_gfp_mask = __GFP_BITS_MASK;
+ /* 6) resize the head arrays to their final sizes */
+ mutex_lock(&cache_chain_mutex);
+ list_for_each_entry(cachep, &cache_chain, next)
+ if (enable_cpucache(cachep, GFP_NOWAIT))
+ BUG();
+ mutex_unlock(&cache_chain_mutex);
/* Done! */
g_cpucache_up = FULL;
@@ -2102,7 +2117,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
for_each_online_node(node) {
cachep->nodelists[node] =
kmalloc_node(sizeof(struct kmem_list3),
- GFP_KERNEL, node);
+ gfp, node);
BUG_ON(!cachep->nodelists[node]);
kmem_list3_init(cachep->nodelists[node]);
}
@@ -2812,6 +2827,10 @@ static int cache_grow(struct kmem_cache *cachep,
offset *= cachep->colour_off;
+ /* Lets avoid crashing in early boot code. */
+ if (WARN_ON_ONCE((local_flags & ~slab_gfp_mask) != 0))
+ local_flags &= slab_gfp_mask;
+
if (local_flags & __GFP_WAIT)
local_irq_enable();
@@ -3237,6 +3256,10 @@ retry:
}
if (!obj) {
+ /* Lets avoid crashing in early boot code. */
+ if (WARN_ON_ONCE((local_flags & ~slab_gfp_mask) != 0))
+ local_flags &= slab_gfp_mask;
+
/*
* This allocation will be performed within the constraints
* of the current cpuset / memory policy requirements.
diff --git a/mm/slub.c b/mm/slub.c
index 3964d3c..651bb34 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -178,6 +178,12 @@ static enum {
SYSFS /* Sysfs up */
} slab_state = DOWN;
+/*
+ * The slab allocator is initialized with interrupts disabled. Therefore, make
+ * sure early boot allocations don't accidentally enable interrupts.
+ */
+static gfp_t slab_gfp_mask __read_mostly = __GFP_BITS_MASK & ~__GFP_WAIT;
+
/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);
@@ -1548,6 +1554,10 @@ new_slab:
goto load_freelist;
}
+ /* Lets avoid crashing in early boot code. */
+ if (WARN_ON_ONCE((gfpflags & ~slab_gfp_mask) != 0))
+ gfpflags &= slab_gfp_mask;
+
if (gfpflags & __GFP_WAIT)
local_irq_enable();
@@ -3104,6 +3114,14 @@ void __init kmem_cache_init(void)
nr_cpu_ids, nr_node_ids);
}
+void __init kmem_cache_init_late(void)
+{
+ /*
+ * Interrupts are enabled now so all GFP allocations are safe.
+ */
+ slab_gfp_mask = __GFP_BITS_MASK;
+}
+
/*
* Find a mergeable slab cache
*/
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a13ea64..9df6d99 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -49,7 +49,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
/* If the main allocator is up use that, fallback to bootmem. */
if (slab_is_available()) {
struct page *page = alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO, get_order(size));
+ GFP_NOWAIT | __GFP_ZERO, get_order(size));
if (page)
return page_address(page);
return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index da432d9..dd558d2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -63,7 +63,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
sizeof(struct mem_section);
if (slab_is_available())
- section = kmalloc_node(array_size, GFP_KERNEL, nid);
+ section = kmalloc_node(array_size, GFP_NOWAIT, nid);
else
section = alloc_bootmem_node(NODE_DATA(nid), array_size);
On Fri, 2009-06-12 at 16:25 +0300, Pekka J Enberg wrote:
> + /* Lets avoid crashing in early boot code. */
> + if (WARN_ON_ONCE((local_flags & ~slab_gfp_mask) != 0))
> + local_flags &= slab_gfp_mask;
> +
I still object to the WARN_ON, see all the previous discussions on that
subject...
Cheers,
Ben.
Hi Ben,
On Fri, 2009-06-12 at 16:25 +0300, Pekka J Enberg wrote:
> > + /* Lets avoid crashing in early boot code. */
> > + if (WARN_ON_ONCE((local_flags & ~slab_gfp_mask) != 0))
> > + local_flags &= slab_gfp_mask;
> > +
On Fri, 2009-06-12 at 23:38 +1000, Benjamin Herrenschmidt wrote:
> I still object to the WARN_ON, see all the previous discussions on that
> subject...
Yes, I know that!
I am not proposing this as the final fix to powerpc problems. But if we
were to do the masking you suggested, this is not the proper way to do
it at all. We need to mask the flags much earlier than this which is why
I am keeping the WARN_ON() there.
The patch is a stop-gap measure until we've reached an agreement on how to
fix things properly. And I do think my patch is the right thing to do
here while we wait for Linus' opinion on this.
Pekka
Having to use GFP_NOWAIT instead of GFP_KERNEL in code that may perhaps be
used in early boot defeats the purpose of the whole endeavor. Some of
these functions may also be used after boot and thus we will have to add
special casing so that GFP_KERNEL is used if boot is complete. We are
replacing one special casing with another.
Hi Linus,
I dropped the GFP_WAIT conversion patch and added the gfp masking patch
you liked. I tested this on x86-64 with both SLAB and SLUB.
Pekka
The following changes since commit 8ebf975608aaebd7feb33d77f07ba21a6380e086:
Randy Dunlap (1):
block: fix kernel-doc in recent block/ changes
are available in the git repository at:
ssh://master.kernel.org/pub/scm/linux/kernel/git/penberg/slab-2.6 topic/slab/earlyboot-v2
KAMEZAWA Hiroyuki (1):
memcg: fix page_cgroup fatal error in FLATMEM
Pekka Enberg (3):
slab: fix gfp flag in setup_cpu_cache()
slab,slub: don't enable interrupts during early boot
slab: setup cpu caches later on when interrupts are enabled
Yinghai Lu (2):
irq: slab alloc for default irq_affinity
x86: make zap_low_mapping could be used early
arch/x86/include/asm/tlbflush.h | 2 +-
arch/x86/kernel/smpboot.c | 2 +-
arch/x86/mm/init_32.c | 10 ++++++--
include/linux/gfp.h | 3 ++
include/linux/page_cgroup.h | 18 ++++++++++++++++-
include/linux/slab.h | 2 +
include/linux/slob_def.h | 5 ++++
include/linux/slub_def.h | 2 +
init/main.c | 6 +++++
kernel/irq/handle.c | 2 +-
mm/page_cgroup.c | 29 +++++++++------------------
mm/slab.c | 41 ++++++++++++++++++++++++++++----------
mm/slub.c | 16 +++++++++++++++
13 files changed, 101 insertions(+), 37 deletions(-)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index a5ecc9c..7f3eba0 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_all();
}
-extern void zap_low_mappings(void);
+extern void zap_low_mappings(bool early);
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7c80007..2fecda6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -873,7 +873,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
err = do_boot_cpu(apicid, cpu);
- zap_low_mappings();
+ zap_low_mappings(false);
low_mappings = 0;
#else
err = do_boot_cpu(apicid, cpu);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 949708d..9ff3c08 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -564,7 +564,7 @@ static inline void save_pg_dir(void)
}
#endif /* !CONFIG_ACPI_SLEEP */
-void zap_low_mappings(void)
+void zap_low_mappings(bool early)
{
int i;
@@ -581,7 +581,11 @@ void zap_low_mappings(void)
set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
}
- flush_tlb_all();
+
+ if (early)
+ __flush_tlb();
+ else
+ flush_tlb_all();
}
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
@@ -956,7 +960,7 @@ void __init mem_init(void)
test_wp_bit();
save_pg_dir();
- zap_low_mappings();
+ zap_low_mappings(true);
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0bbc15f..3760e7c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -85,6 +85,9 @@ struct vm_area_struct;
__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_NOMEMALLOC)
+/* Control slab gfp mask during early boot */
+#define SLAB_GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)
+
/* Control allocation constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 7339c7b..13f126c 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -18,7 +18,19 @@ struct page_cgroup {
};
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
-void __init page_cgroup_init(void);
+
+#ifdef CONFIG_SPARSEMEM
+static inline void __init page_cgroup_init_flatmem(void)
+{
+}
+extern void __init page_cgroup_init(void);
+#else
+void __init page_cgroup_init_flatmem(void);
+static inline void __init page_cgroup_init(void)
+{
+}
+#endif
+
struct page_cgroup *lookup_page_cgroup(struct page *page);
enum {
@@ -87,6 +99,10 @@ static inline void page_cgroup_init(void)
{
}
+static inline void __init page_cgroup_init_flatmem(void)
+{
+}
+
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 4880306..219b8fb 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -319,4 +319,6 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
return kmalloc_node(size, flags | __GFP_ZERO, node);
}
+void __init kmem_cache_init_late(void);
+
#endif /* _LINUX_SLAB_H */
diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h
index 0ec00b3..bb5368d 100644
--- a/include/linux/slob_def.h
+++ b/include/linux/slob_def.h
@@ -34,4 +34,9 @@ static __always_inline void *__kmalloc(size_t size, gfp_t flags)
return kmalloc(size, flags);
}
+static inline void kmem_cache_init_late(void)
+{
+ /* Nothing to do */
+}
+
#endif /* __LINUX_SLOB_DEF_H */
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index be5d40c..4dcbc2c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -302,4 +302,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
}
#endif
+void __init kmem_cache_init_late(void);
+
#endif /* _LINUX_SLUB_DEF_H */
diff --git a/init/main.c b/init/main.c
index 5616661..f6204f7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -539,6 +539,11 @@ void __init __weak thread_info_cache_init(void)
*/
static void __init mm_init(void)
{
+ /*
+ * page_cgroup requires countinous pages as memmap
+ * and it's bigger than MAX_ORDER unless SPARSEMEM.
+ */
+ page_cgroup_init_flatmem();
mem_init();
kmem_cache_init();
vmalloc_init();
@@ -635,6 +640,7 @@ asmlinkage void __init start_kernel(void)
"enabled early\n");
early_boot_irqs_on();
local_irq_enable();
+ kmem_cache_init_late();
/*
* HACK ALERT! This is early. We're enabling the console before
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 1045785..065205b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -45,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
static void __init init_irq_default_affinity(void)
{
- alloc_bootmem_cpumask_var(&irq_default_affinity);
+ alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
cpumask_setall(irq_default_affinity);
}
#else
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3dd4a90..11a8a10 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -47,8 +47,6 @@ static int __init alloc_node_page_cgroup(int nid)
struct page_cgroup *base, *pc;
unsigned long table_size;
unsigned long start_pfn, nr_pages, index;
- struct page *page;
- unsigned int order;
start_pfn = NODE_DATA(nid)->node_start_pfn;
nr_pages = NODE_DATA(nid)->node_spanned_pages;
@@ -57,13 +55,11 @@ static int __init alloc_node_page_cgroup(int nid)
return 0;
table_size = sizeof(struct page_cgroup) * nr_pages;
- order = get_order(table_size);
- page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order);
- if (!page)
- page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order);
- if (!page)
+
+ base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (!base)
return -ENOMEM;
- base = page_address(page);
for (index = 0; index < nr_pages; index++) {
pc = base + index;
__init_page_cgroup(pc, start_pfn + index);
@@ -73,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid)
return 0;
}
-void __init page_cgroup_init(void)
+void __init page_cgroup_init_flatmem(void)
{
int nid, fail;
@@ -117,16 +113,11 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
if (!section->page_cgroup) {
nid = page_to_nid(pfn_to_page(pfn));
table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
- if (slab_is_available()) {
- base = kmalloc_node(table_size,
- GFP_KERNEL | __GFP_NOWARN, nid);
- if (!base)
- base = vmalloc_node(table_size, nid);
- } else {
- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
- }
+ VM_BUG_ON(!slab_is_available());
+ base = kmalloc_node(table_size,
+ GFP_KERNEL | __GFP_NOWARN, nid);
+ if (!base)
+ base = vmalloc_node(table_size, nid);
} else {
/*
* We don't have to allocate page_cgroup again, but
diff --git a/mm/slab.c b/mm/slab.c
index f46b65d..18e3164 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -304,6 +304,12 @@ struct kmem_list3 {
};
/*
+ * The slab allocator is initialized with interrupts disabled. Therefore, make
+ * sure early boot allocations don't accidentally enable interrupts.
+ */
+static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK;
+
+/*
* Need this for bootstrapping a per node allocator.
*/
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
@@ -753,6 +759,7 @@ static enum {
NONE,
PARTIAL_AC,
PARTIAL_L3,
+ EARLY,
FULL
} g_cpucache_up;
@@ -761,7 +768,7 @@ static enum {
*/
int slab_is_available(void)
{
- return g_cpucache_up == FULL;
+ return g_cpucache_up >= EARLY;
}
static DEFINE_PER_CPU(struct delayed_work, reap_work);
@@ -1625,19 +1632,27 @@ void __init kmem_cache_init(void)
}
}
- /* 6) resize the head arrays to their final sizes */
- {
- struct kmem_cache *cachep;
- mutex_lock(&cache_chain_mutex);
- list_for_each_entry(cachep, &cache_chain, next)
- if (enable_cpucache(cachep, GFP_NOWAIT))
- BUG();
- mutex_unlock(&cache_chain_mutex);
- }
+ g_cpucache_up = EARLY;
/* Annotate slab for lockdep -- annotate the malloc caches */
init_lock_keys();
+}
+
+void __init kmem_cache_init_late(void)
+{
+ struct kmem_cache *cachep;
+
+ /*
+ * Interrupts are enabled now so all GFP allocations are safe.
+ */
+ slab_gfp_mask = __GFP_BITS_MASK;
+ /* 6) resize the head arrays to their final sizes */
+ mutex_lock(&cache_chain_mutex);
+ list_for_each_entry(cachep, &cache_chain, next)
+ if (enable_cpucache(cachep, GFP_NOWAIT))
+ BUG();
+ mutex_unlock(&cache_chain_mutex);
/* Done! */
g_cpucache_up = FULL;
@@ -2102,7 +2117,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
for_each_online_node(node) {
cachep->nodelists[node] =
kmalloc_node(sizeof(struct kmem_list3),
- GFP_KERNEL, node);
+ gfp, node);
BUG_ON(!cachep->nodelists[node]);
kmem_list3_init(cachep->nodelists[node]);
}
@@ -3354,6 +3369,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
unsigned long save_flags;
void *ptr;
+ flags &= slab_gfp_mask;
+
lockdep_trace_alloc(flags);
if (slab_should_failslab(cachep, flags))
@@ -3434,6 +3451,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
unsigned long save_flags;
void *objp;
+ flags &= slab_gfp_mask;
+
lockdep_trace_alloc(flags);
if (slab_should_failslab(cachep, flags))
diff --git a/mm/slub.c b/mm/slub.c
index 3964d3c..30354bf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -178,6 +178,12 @@ static enum {
SYSFS /* Sysfs up */
} slab_state = DOWN;
+/*
+ * The slab allocator is initialized with interrupts disabled. Therefore, make
+ * sure early boot allocations don't accidentally enable interrupts.
+ */
+static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK;
+
/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);
@@ -1595,6 +1601,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
unsigned long flags;
unsigned int objsize;
+ gfpflags &= slab_gfp_mask;
+
lockdep_trace_alloc(gfpflags);
might_sleep_if(gfpflags & __GFP_WAIT);
@@ -3104,6 +3112,14 @@ void __init kmem_cache_init(void)
nr_cpu_ids, nr_node_ids);
}
+void __init kmem_cache_init_late(void)
+{
+ /*
+ * Interrupts are enabled now so all GFP allocations are safe.
+ */
+ slab_gfp_mask = __GFP_BITS_MASK;
+}
+
/*
* Find a mergeable slab cache
*/
On Fri, 12 Jun 2009, Pekka J Enberg wrote:
> @@ -3434,6 +3451,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
> unsigned long save_flags;
> void *objp;
>
> + flags &= slab_gfp_mask;
> +
> lockdep_trace_alloc(flags);
>
> if (slab_should_failslab(cachep, flags))
Adds code to hot code path.
> @@ -1595,6 +1601,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
> unsigned long flags;
> unsigned int objsize;
>
> + gfpflags &= slab_gfp_mask;
> +
> lockdep_trace_alloc(gfpflags);
> might_sleep_if(gfpflags & __GFP_WAIT);
>
Adds code to hot code path.
The allocators mask flags passed to the page allocator through
GFP_RECLAIM_MASK|GFP_CONSTRAINT_MASK. This is done outside of the critical
code paths.
The might_sleep issues may be fixed by adding another check to
__might_sleep().
> Adds code to hot code path.
A single mask which should be mostly in the noise...
> The allocators mask flags passed to the page allocator through
> GFP_RECLAIM_MASK|GFP_CONSTRAINT_MASK. This is done outside of the critical
> code paths.
>
> The might_sleep issues may be fixed by adding another check to
> __might_sleep().
That is not enough because slab and slub explicitly re-enable
interrupts when __GFP_WAIT is set, which is the problem we need to solve
in the first place, so we need to do the masking there.
Cheers,
Ben.
On Fri, 12 Jun 2009, Christoph Lameter wrote:
> On Fri, 12 Jun 2009, Pekka J Enberg wrote:
>
> > @@ -3434,6 +3451,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
> > unsigned long save_flags;
> > void *objp;
> >
> > + flags &= slab_gfp_mask;
> > +
> > lockdep_trace_alloc(flags);
> >
> > if (slab_should_failslab(cachep, flags))
>
> Adds code to hot code path.
>
> > @@ -1595,6 +1601,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
> > unsigned long flags;
> > unsigned int objsize;
> >
> > + gfpflags &= slab_gfp_mask;
> > +
> > lockdep_trace_alloc(gfpflags);
> > might_sleep_if(gfpflags & __GFP_WAIT);
> >
>
> Adds code to hot code path.
>
> The allocators mask flags passed to the page allocator through
> GFP_RECLAIM_MASK|GFP_CONSTRAINT_MASK. This is done outside of the critical
> code paths.
>
> The might_sleep issues may be fixed by adding another check to
> __might_sleep().
How about something like this? There should be no extra code in fastpaths
for production configs with this one.
Pekka
diff --git a/mm/slub.c b/mm/slub.c
index 4d6004c..5e8cea1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1613,6 +1613,8 @@ another_slab:
deactivate_slab(s, c);
new_slab:
+ gfpflags &= slab_gfp_mask;
+
new = get_partial(s, gfpflags, node);
if (new) {
c->page = new;
@@ -1668,13 +1670,14 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long flags;
unsigned int objsize;
+ gfp_t real_gfp;
- gfpflags &= slab_gfp_mask;
+ real_gfp = gfpflags & slab_gfp_mask;
- lockdep_trace_alloc(gfpflags);
- might_sleep_if(gfpflags & __GFP_WAIT);
+ lockdep_trace_alloc(real_gfp);
+ might_sleep_if(real_gfp & __GFP_WAIT);
- if (should_failslab(s->objsize, gfpflags))
+ if (should_failslab(s->objsize, real_gfp))
return NULL;
local_irq_save(flags);
On Sat, Jun 13, 2009 at 07:46:03AM +1000, Benjamin Herrenschmidt wrote:
>
> > Adds code to hot code path.
>
> A single mask which should be mostly in the noise...
Cacheline load. Yes it will be in the noise. Like all other slowdowns
that get merged ;)
> > The allocators mask flags passed to the page allocator through
> > GFP_RECLAIM_MASK|GFP_CONSTRAINT_MASK. This is done outside of the critical
> > code paths.
> >
> > The might_sleep issues may be fixed by adding another check to
> > __might_sleep().
>
> That is not enough because slab and slub explicitly re-enable
> interrupts when __GFP_WAIT is set, which is the problem we need to solve
> in the first place, so we need to do the masking there.
>
> Cheers,
> Ben.
>
On Fri, Jun 12, 2009 at 07:16:30PM +0300, Pekka J Enberg wrote:
> Hi Linus,
>
> I dropped the GFP_WAIT conversion patch and added the gfp masking patch
> you liked. I tested this on x86-64 with both SLAB and SLUB.
Hi Pekka,
I tried to convert some of the early allocations on s390. Some callsites
however need to have the GFP_DMA flag, since we need to allocate memory below
2GB. Passing GFP_DMA causes this crash:
<1>Unable to handle kernel pointer dereference at virtual kernel address fffffffffffff000
<4>Oops: 0038 [#1] PREEMPT SMP
<4>Modules linked in:
<4>CPU: 0 Not tainted 2.6.30-03984-g45e3e19-dirty #233
<4>Process swapper (pid: 0, task: 00000000006a2ef0, ksp: 0000000000718000)
<4>Krnl PSW : 0700100180000000 00000000000808ee (queue_work_on+0x8e/0xe0)
<4> R:0 T:1 IO:1 EX:1 Key:0 M:0 W:0 P:0 AS:0 CC:1 PM:0 EA:3
<4>Krnl GPRS: 0000000000000000 ffffffffffffffff 0000000000000000 00000000006b8b88
<4> 00000000006b8b88 0000000000000001 0000000000000008 0000000000000200
<4> 000000003fe28000 0000000000008001 00000000011da730 0000000000717ca0
<4> 00000000006b8b88 0000000000488650 0000000000717cd0 0000000000717ca0
<4>Krnl Code: 00000000000808de: e310d0000082 xg %r1,0(%r13)
<4> 00000000000808e4: eb220003000d sllg %r2,%r2,3
<4> 00000000000808ea: b9040034 lgr %r3,%r4
<4> >00000000000808ee: e32210000004 lg %r2,0(%r2,%r1)
<4> 00000000000808f4: c0e5ffffff28 brasl %r14,80744
<4> 00000000000808fa: a7280001 lhi %r2,1
<4> 00000000000808fe: e340b0b80004 lg %r4,184(%r11)
<4> 0000000000080904: b9140022 lgfr %r2,%r2
<4>Call Trace:
<4>([<000000003fe28000>] 0x3fe28000)
<4> [<0000000000080e96>] queue_work+0x62/0xa4
<4> [<0000000000080f26>] schedule_work+0x4e/0x60
<4> [<0000000000132f7e>] dma_kmalloc_cache+0x1ca/0x1d0
<4> [<00000000001330ae>] get_slab+0x12a/0x130
<4> [<00000000001337b6>] __kmalloc+0x5e/0x364
<4> [<0000000000739132>] con3215_init+0x1c2/0x2e4
<4> [<00000000007333ea>] console_init+0x42/0x5c
<4> [<0000000000718e50>] start_kernel+0x53c/0x6b8
<4> [<0000000000012020>] _ehead+0x20/0x80
I didn't look any deeper into this, but looks to me like doing something like
schedule_work() this early isn't ok.
This is the conversion that leads to the crash:
- alloc_bootmem_low(sizeof(struct raw3215_info));
+ kzalloc(sizeof(struct raw3215_info), GFP_NOWAIT | GFP_DMA);
Might be that I missed something. Maybe some special flag?
On Mon, Jun 15, 2009 at 10:18:31AM +0200, Heiko Carstens wrote:
> On Fri, Jun 12, 2009 at 07:16:30PM +0300, Pekka J Enberg wrote:
> > Hi Linus,
> >
> > I dropped the GFP_WAIT conversion patch and added the gfp masking patch
> > you liked. I tested this on x86-64 with both SLAB and SLUB.
>
> Hi Pekka,
>
> I tried to convert some of the early allocations on s390. Some callsites
> however need to have the GFP_DMA flag, since we need to allocate memory below
> 2GB. Passing GFP_DMA causes this crash:
>
> <1>Unable to handle kernel pointer dereference at virtual kernel address fffffffffffff000
> <4>Oops: 0038 [#1] PREEMPT SMP
> <4>Modules linked in:
> <4>CPU: 0 Not tainted 2.6.30-03984-g45e3e19-dirty #233
> <4>Process swapper (pid: 0, task: 00000000006a2ef0, ksp: 0000000000718000)
> <4>Krnl PSW : 0700100180000000 00000000000808ee (queue_work_on+0x8e/0xe0)
> <4> R:0 T:1 IO:1 EX:1 Key:0 M:0 W:0 P:0 AS:0 CC:1 PM:0 EA:3
> <4>Krnl GPRS: 0000000000000000 ffffffffffffffff 0000000000000000 00000000006b8b88
> <4> 00000000006b8b88 0000000000000001 0000000000000008 0000000000000200
> <4> 000000003fe28000 0000000000008001 00000000011da730 0000000000717ca0
> <4> 00000000006b8b88 0000000000488650 0000000000717cd0 0000000000717ca0
> <4>Krnl Code: 00000000000808de: e310d0000082 xg %r1,0(%r13)
> <4> 00000000000808e4: eb220003000d sllg %r2,%r2,3
> <4> 00000000000808ea: b9040034 lgr %r3,%r4
> <4> >00000000000808ee: e32210000004 lg %r2,0(%r2,%r1)
> <4> 00000000000808f4: c0e5ffffff28 brasl %r14,80744
> <4> 00000000000808fa: a7280001 lhi %r2,1
> <4> 00000000000808fe: e340b0b80004 lg %r4,184(%r11)
> <4> 0000000000080904: b9140022 lgfr %r2,%r2
> <4>Call Trace:
> <4>([<000000003fe28000>] 0x3fe28000)
> <4> [<0000000000080e96>] queue_work+0x62/0xa4
> <4> [<0000000000080f26>] schedule_work+0x4e/0x60
> <4> [<0000000000132f7e>] dma_kmalloc_cache+0x1ca/0x1d0
> <4> [<00000000001330ae>] get_slab+0x12a/0x130
> <4> [<00000000001337b6>] __kmalloc+0x5e/0x364
> <4> [<0000000000739132>] con3215_init+0x1c2/0x2e4
> <4> [<00000000007333ea>] console_init+0x42/0x5c
> <4> [<0000000000718e50>] start_kernel+0x53c/0x6b8
> <4> [<0000000000012020>] _ehead+0x20/0x80
>
> I didn't look any deeper into this, but looks to me like doing something like
> schedule_work() this early isn't ok.
>
> This is the conversion that leads to the crash:
>
> - alloc_bootmem_low(sizeof(struct raw3215_info));
> + kzalloc(sizeof(struct raw3215_info), GFP_NOWAIT | GFP_DMA);
>
> Might be that I missed something. Maybe some special flag?
No, just a bug in the conversion.
If you predicate the schedule_work call on slab_state == SYSFS, then
it should work (when sysfs comes up later in init, previously added
slabs will be registered with sysfs).
Oh, and you'd need to also not pass __SYSFS_ADD_DEFERRED into
kmem_cache_create in that case too.
Hi Nick,
On Mon, 2009-06-15 at 10:26 +0200, Nick Piggin wrote:
> On Mon, Jun 15, 2009 at 10:18:31AM +0200, Heiko Carstens wrote:
> > On Fri, Jun 12, 2009 at 07:16:30PM +0300, Pekka J Enberg wrote:
> > > Hi Linus,
> > >
> > > I dropped the GFP_WAIT conversion patch and added the gfp masking patch
> > > you liked. I tested this on x86-64 with both SLAB and SLUB.
> >
> > Hi Pekka,
> >
> > I tried to convert some of the early allocations on s390. Some callsites
> > however need to have the GFP_DMA flag, since we need to allocate memory below
> > 2GB. Passing GFP_DMA causes this crash:
> >
> > <1>Unable to handle kernel pointer dereference at virtual kernel address fffffffffffff000
> > <4>Oops: 0038 [#1] PREEMPT SMP
> > <4>Modules linked in:
> > <4>CPU: 0 Not tainted 2.6.30-03984-g45e3e19-dirty #233
> > <4>Process swapper (pid: 0, task: 00000000006a2ef0, ksp: 0000000000718000)
> > <4>Krnl PSW : 0700100180000000 00000000000808ee (queue_work_on+0x8e/0xe0)
> > <4> R:0 T:1 IO:1 EX:1 Key:0 M:0 W:0 P:0 AS:0 CC:1 PM:0 EA:3
> > <4>Krnl GPRS: 0000000000000000 ffffffffffffffff 0000000000000000 00000000006b8b88
> > <4> 00000000006b8b88 0000000000000001 0000000000000008 0000000000000200
> > <4> 000000003fe28000 0000000000008001 00000000011da730 0000000000717ca0
> > <4> 00000000006b8b88 0000000000488650 0000000000717cd0 0000000000717ca0
> > <4>Krnl Code: 00000000000808de: e310d0000082 xg %r1,0(%r13)
> > <4> 00000000000808e4: eb220003000d sllg %r2,%r2,3
> > <4> 00000000000808ea: b9040034 lgr %r3,%r4
> > <4> >00000000000808ee: e32210000004 lg %r2,0(%r2,%r1)
> > <4> 00000000000808f4: c0e5ffffff28 brasl %r14,80744
> > <4> 00000000000808fa: a7280001 lhi %r2,1
> > <4> 00000000000808fe: e340b0b80004 lg %r4,184(%r11)
> > <4> 0000000000080904: b9140022 lgfr %r2,%r2
> > <4>Call Trace:
> > <4>([<000000003fe28000>] 0x3fe28000)
> > <4> [<0000000000080e96>] queue_work+0x62/0xa4
> > <4> [<0000000000080f26>] schedule_work+0x4e/0x60
> > <4> [<0000000000132f7e>] dma_kmalloc_cache+0x1ca/0x1d0
> > <4> [<00000000001330ae>] get_slab+0x12a/0x130
> > <4> [<00000000001337b6>] __kmalloc+0x5e/0x364
> > <4> [<0000000000739132>] con3215_init+0x1c2/0x2e4
> > <4> [<00000000007333ea>] console_init+0x42/0x5c
> > <4> [<0000000000718e50>] start_kernel+0x53c/0x6b8
> > <4> [<0000000000012020>] _ehead+0x20/0x80
> >
> > I didn't look any deeper into this, but looks to me like doing something like
> > schedule_work() this early isn't ok.
> >
> > This is the conversion that leads to the crash:
> >
> > - alloc_bootmem_low(sizeof(struct raw3215_info));
> > + kzalloc(sizeof(struct raw3215_info), GFP_NOWAIT | GFP_DMA);
> >
> > Might be that I missed something. Maybe some special flag?
>
> No, just a bug in the conversion.
>
> If you predicate the schedule_work call on slab_state == SYSFS, then
> it should work (when sysfs comes up later in init, previously added
> slabs will be registered with sysfs).
>
> Oh, and you'd need to also not pass __SYSFS_ADD_DEFERRED into
> kmem_cache_create in that case too.
I am not sure I follow you here. We are setting up slab so early that we
absolutely _must_ defer sysfs setup. But we're also setting up slab much
earlier than workqueues, so we shouldn't really do schedule_work() at
that point. Furthermore, early boot cache sysfs setup is explicitly
handled in slab_sysfs_init() so I think we need something like the patch
below?
Pekka
diff --git a/mm/slub.c b/mm/slub.c
index 30354bf..4c12138 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2642,7 +2642,13 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
list_add(&s->list, &slab_caches);
kmalloc_caches_dma[index] = s;
- schedule_work(&sysfs_add_work);
+ /*
+ * The slab allocator is set up much earlier than workqueues. As early
+ * boot caches are handled by slab_sysfs_init(), avoid calling
+ * schedule_work() until keventd is up.
+ */
+ if (keventd_up())
+ schedule_work(&sysfs_add_work);
unlock_out:
up_write(&slub_lock);
Hi Pekka,
On Mon, Jun 15, 2009 at 11:32:59AM +0300, Pekka Enberg wrote:
> Hi Nick,
>
> On Mon, 2009-06-15 at 10:26 +0200, Nick Piggin wrote:
> > >
> > > Might be that I missed something. Maybe some special flag?
> >
> > No, just a bug in the conversion.
> >
> > If you predicate the schedule_work call on slab_state == SYSFS, then
> > it should work (when sysfs comes up later in init, previously added
> > slabs will be registered with sysfs).
> >
> > Oh, and you'd need to also not pass __SYSFS_ADD_DEFERRED into
> > kmem_cache_create in that case too.
>
> I am not sure I follow you here. We are setting up slab so early that we
> absolutely _must_ defer sysfs setup. But we're also setting up slab much
> earlier than workqueues, so we shouldn't really do schedule_work() at
> that point.
There's 2 types of deferring going on here. In early boot we
obviously don't touch sysfs at all. All other times, we still
need to defer because we don't know the context it is being
called from.
> Furthermore, early boot cache sysfs setup is explicitly
> handled in slab_sysfs_init() so I think we need something like the patch
> below?
I don't think so because keventd probably comes up before sysfs
still is ready (in this respect the existing pre-early-slab code
is probably still broken but just never did such early DMA
allocations). So it just makes sense to do it all via the slab_state
flag.
Also, if you keep the deferred flag on there for early created
slabs, then the next sysfs_add_work will try to add it again
(after slab_sysfs_init already added it).
Here is a patch:
---
mm/slub.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c
+++ linux-2.6/mm/slub.c
@@ -2596,6 +2596,7 @@ static noinline struct kmem_cache *dma_k
struct kmem_cache *s;
char *text;
size_t realsize;
+ unsigned long slabflags;
s = kmalloc_caches_dma[index];
if (s)
@@ -2617,9 +2618,18 @@ static noinline struct kmem_cache *dma_k
(unsigned int)realsize);
s = kmalloc(kmem_size, flags & ~SLUB_DMA);
+ /*
+ * Must defer sysfs creation to a workqueue because we don't know
+ * what context we are called from. Before sysfs comes up, we don't
+ * need to do anything because slab_sysfs_init will start by
+ * adding all existing slabs to sysfs.
+ */
+ slabflags = SLAB_CACHE_DMA;
+ if (slab_state >= SYSFS)
+ slabflags |= __SYSFS_ADD_DEFERRED;
+
if (!s || !text || !kmem_cache_open(s, flags, text,
- realsize, ARCH_KMALLOC_MINALIGN,
- SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
+ realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
kfree(s);
kfree(text);
goto unlock_out;
@@ -2628,7 +2638,8 @@ static noinline struct kmem_cache *dma_k
list_add(&s->list, &slab_caches);
kmalloc_caches_dma[index] = s;
- schedule_work(&sysfs_add_work);
+ if (slab_state >= SYSFS)
+ schedule_work(&sysfs_add_work);
unlock_out:
up_write(&slub_lock);
On Mon, 2009-06-15 at 10:52 +0200, Nick Piggin wrote:
> > Furthermore, early boot cache sysfs setup is explicitly
> > handled in slab_sysfs_init() so I think we need something like the patch
> > below?
>
> I don't think so because keventd probably comes up before sysfs
> still is ready (in this respect the existing pre-early-slab code
> is probably still broken but just never done such early DMA
> allocations). So it just makes sense to do it all via the slab_state
> flag.
>
> Also, if you keep the deferred flag on there for early created
> slabs, then the next sysfs_add_work will try to add it again
> (after slab_sysfs_init already added it).
>
> Here
>
> ---
> mm/slub.c | 17 ++++++++++++++---
> 1 file changed, 14 insertions(+), 3 deletions(-)
>
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c
> +++ linux-2.6/mm/slub.c
> @@ -2596,6 +2596,7 @@ static noinline struct kmem_cache *dma_k
> struct kmem_cache *s;
> char *text;
> size_t realsize;
> + unsigned long slabflags;
>
> s = kmalloc_caches_dma[index];
> if (s)
> @@ -2617,9 +2618,18 @@ static noinline struct kmem_cache *dma_k
> (unsigned int)realsize);
> s = kmalloc(kmem_size, flags & ~SLUB_DMA);
>
> + /*
> + * Must defer sysfs creation to a workqueue because we don't know
> + * what context we are called from. Before sysfs comes up, we don't
> + * need to do anything because slab_sysfs_init will start by
> + * adding all existing slabs to sysfs.
> + */
> + slabflags = SLAB_CACHE_DMA;
> + if (slab_state >= SYSFS)
> + slabflags |= __SYSFS_ADD_DEFERRED;
> +
> if (!s || !text || !kmem_cache_open(s, flags, text,
> - realsize, ARCH_KMALLOC_MINALIGN,
> - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
> + realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
> kfree(s);
> kfree(text);
> goto unlock_out;
> @@ -2628,7 +2638,8 @@ static noinline struct kmem_cache *dma_k
> list_add(&s->list, &slab_caches);
> kmalloc_caches_dma[index] = s;
>
> - schedule_work(&sysfs_add_work);
> + if (slab_state >= SYSFS)
> + schedule_work(&sysfs_add_work);
>
> unlock_out:
> up_write(&slub_lock);
Looks good to me. Heiko, does it fix your case?
Pekka
On Mon, Jun 15, 2009 at 11:18 AM, Heiko
Carstens<[email protected]> wrote:
> I didn't look any deeper into this, but looks to me like doing something like
> schedule_work() this early isn't ok.
>
> This is the conversion that leads to the crash:
>
> -               alloc_bootmem_low(sizeof(struct raw3215_info));
> +               kzalloc(sizeof(struct raw3215_info), GFP_NOWAIT | GFP_DMA);
>
> Might be that I missed something. Maybe some special flag?
Btw, you should not need to use GFP_NOWAIT anymore and GFP_KERNEL
should be fine even during early boot.
Pekka
On Mon, 2009-06-15 at 08:46 +0200, Nick Piggin wrote:
> On Sat, Jun 13, 2009 at 07:46:03AM +1000, Benjamin Herrenschmidt wrote:
> >
> > > Adds code to hot code path.
> >
> > A single mask which should be mostly in the noise...
>
> Cacheline load. Yes it will be in the noise. Like all other slowdowns
> that get merged ;)
Yes, agreed. Did you see the patch I suggested to move all of this
outside of fastpaths for production configs?
On Mon, Jun 15, 2009 at 12:10:52PM +0300, Pekka Enberg wrote:
> On Mon, 2009-06-15 at 08:46 +0200, Nick Piggin wrote:
> > On Sat, Jun 13, 2009 at 07:46:03AM +1000, Benjamin Herrenschmidt wrote:
> > >
> > > > Adds code to hot code path.
> > >
> > > A single mask which should be mostly in the noise...
> >
> > Cacheline load. Yes it will be in the noise. Like all other slowdowns
> > that get merged ;)
>
> Yes, agreed. Did you see the patch I suggested to move all of this
> outside of fastpaths for production configs?
Yes I did see it go by. I'm always in favour of cutting
even a few cycles and definitely a cacheline from mm fastpaths.
On Mon, Jun 15, 2009 at 12:10:01PM +0300, Pekka Enberg wrote:
> On Mon, Jun 15, 2009 at 11:18 AM, Heiko
> Carstens<[email protected]> wrote:
> > I didn't look any deeper into this, but looks to me like doing something like
> > schedule_work() this early isn't ok.
> >
> > This is the conversion that leads to the crash:
> >
> > -               alloc_bootmem_low(sizeof(struct raw3215_info));
> > +               kzalloc(sizeof(struct raw3215_info), GFP_NOWAIT | GFP_DMA);
> >
> > Might be that I missed something. Maybe some special flag?
>
> Btw, you should not need to use GFP_NOWAIT anymore and GFP_KERNEL
> should be fine even during early boot.
Is this the agreed way forward? I would like to maybe continue to
try having early allocations pass in special flags where possible
(it could even be a GFP_BOOT or something). It can make it easier
to perhaps reduce branches in core code in future and things can
be flagged in warnings....
I just like the idea of keeping such annotations.
Hi Nick,
On Mon, Jun 15, 2009 at 11:18 AM, Heiko Carstens<[email protected]> wrote:
> > > I didn't look any deeper into this, but looks to me like doing something like
> > > schedule_work() this early isn't ok.
> > >
> > > This is the conversion that leads to the crash:
> > >
> > > - alloc_bootmem_low(sizeof(struct raw3215_info));
> > > + kzalloc(sizeof(struct raw3215_info), GFP_NOWAIT | GFP_DMA);
> > >
> > > Might be that I missed something. Maybe some special flag?
On Mon, Jun 15, 2009 at 12:10:01PM +0300, Pekka Enberg wrote:
> > Btw, you should not need to use GFP_NOWAIT anymore and GFP_KERNEL
> > should be fine even during early boot.
On Mon, 2009-06-15 at 11:41 +0200, Nick Piggin wrote:
> Is this the agreed way forward? I would like to maybe continue to
> try having early allocations pass in special flags where possible
> (it could even be a GFP_BOOT or something). It can make it easier
> to perhaps reduce branches in core code in future and things can
> be flagged in warnings....
>
> I just like the idea of keeping such annotations.
I don't know if we agreed or not but Linus expressed his liking to the
masking patch (that is merged now).
I was more on the GFP_BOOT side also but I am beginning to like the fact
that we can just do GFP_KERNEL and expect that to work in a sane way
during boot (and perhaps later on during suspend). We can probably shave
off even more cycles for production configs if we push the masking down
to the page allocator so I am not sure if extra cycles are going to be a
real issue.
Pekka
On Mon, 2009-06-15 at 11:41 +0200, Nick Piggin wrote:
>
> > Btw, you should not need to use GFP_NOWAIT anymore and GFP_KERNEL
> > should be fine even during early boot.
>
> Is this the agreed way forward?
Yes.
> I would like to maybe continue to
> try having early allocations pass in special flags where possible
> (it could even be a GFP_BOOT or something). It can make it easier
> to perhaps reduce branches in core code in future and things can
> be flagged in warnings....
The whole point of the exercise in removing the need for alloc_bootmem
in a whole bunch of code is defeated if you now also want specific flags
passed. I think we can cope reasonably easily.
> I just like the idea of keeping such annotations.
I think the boot order is too likely to change to make it a sane thing
to have all call sites "know" at what point they are in the boot
process. In your example, what would GFP_BOOT mean? Before the
scheduler is initialized? Before interrupts are on?
There's just too much stuff involved and we don't want random
allocations in various subsystem or arch code to be done with that
special knowledge of where specifically in that process they are done.
Especially since it may change.
Additionally, I believe the flag test/masking can be moved easily enough
out of the fast path... slub shouldn't need it there afaik and if it's
pushed down into the allocation of new slab's then it shouldn't be a big
deal.
Cheers,
Ben.
On Mon, 2009-06-15 at 19:51 +1000, Benjamin Herrenschmidt wrote:
> I think the boot order is too likely to change to make it a sane thing
> to have all call sites "know" at what point they are in the boot
> process. In your example, what does GFP_BOOT would mean ? Before
> scheduler is initialized ? before interrupts are on ?
Btw, I think this is a pretty important point. Linus suggested trying to
make slab initialization even earlier than what we now have. If we do
require GFP_BOOT annotations, then we'd need to sprinkle those all over
the place when we do that.
So from code shuffling point of view, it's better to support GFP_KERNEL
(almost) everywhere rather than require special annotations.
Pekka
On Mon, Jun 15, 2009 at 12:48:44PM +0300, Pekka Enberg wrote:
> Hi Nick,
>
> On Mon, Jun 15, 2009 at 11:18 AM, Heiko Carstens<[email protected]> wrote:
> > > > I didn't look any deeper into this, but looks to me like doing something like
> > > > schedule_work() this early isn't ok.
> > > >
> > > > This is the conversion that leads to the crash:
> > > >
> > > > - alloc_bootmem_low(sizeof(struct raw3215_info));
> > > > + kzalloc(sizeof(struct raw3215_info), GFP_NOWAIT | GFP_DMA);
> > > >
> > > > Might be that I missed something. Maybe some special flag?
>
> On Mon, Jun 15, 2009 at 12:10:01PM +0300, Pekka Enberg wrote:
> > > Btw, you should not need to use GFP_NOWAIT anymore and GFP_KERNEL
> > > should be fine even during early boot.
>
> On Mon, 2009-06-15 at 11:41 +0200, Nick Piggin wrote:
> > Is this the agreed way forward? I would like to maybe continue to
> > try having early allocations pass in special flags where possible
> > (it could even be a GFP_BOOT or something). It can make it easier
> > to perhaps reduce branches in core code in future and things can
> > be flagged in warnings....
> >
> > I just like the idea of keeping such annotations.
>
> I don't know if we agreed or not but Linus expressed his liking to the
> masking patch (that is merged now).
Yes I do like your patch, but more as the implementation of the
workaround for slab rather than being able to use it to mask
of random bits.
> I was more on the GFP_BOOT side also but I am beginning to like the fact
> that we can just do GFP_KERNEL and expect that to work in a sane way
> during boot (and perhaps later on during suspend). We can probably shave
I'd just not like to give up on it right away. I'm not convinced
that it is a "too hard" problem to have callers figure out the
right context (they do it for all other problems like whether
they can sleep and whether they hold specific types of locks
or whether they can be called in reclaim path etc etc etc).
And it is pretty trivial to add warnings to catch any of these
conditions. So while auditing might not be trivial, we would
likely be able to mop up remainders with some runtime warnings.
On Mon, Jun 15, 2009 at 07:51:16PM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2009-06-15 at 11:41 +0200, Nick Piggin wrote:
> >
> > > Btw, you should not need to use GFP_NOWAIT anymore and GFP_KERNEL
> > > should be fine even during early boot.
> >
> > Is this the agreed way forward?
>
> Yes.
>
> > I would like to maybe continue to
> > try having early allocations pass in special flags where possible
> > (it could even be a GFP_BOOT or something). It can make it easier
> > to perhaps reduce branches in core code in future and things can
> > be flagged in warnings....
>
> The whole point of the exercise in removing the need for alloc_bootmem
> in a whole bunch of code is defeated if you now also want specific flags
> passed. I think we can cope reasonably easily.
Why? The best reason to use slab allocator is that the allocations
are much more efficient and also can be freed later.
> > I just like the idea of keeping such annotations.
>
> I think the boot order is too likely to change to make it a sane thing
> to have all call sites "know" at what point they are in the boot
> process.
I disagree.
> In your example, what does GFP_BOOT would mean ? Before
> scheduler is initialized ? before interrupts are on ?
Before initcalls is probably easiest. But it really does not
matter that much. Why? Because if we run out of memory before
then, then there is not going to be anything to reclaim
anyway.
> There's just too much stuff involved and we don't want random
> allocations in various subsystem or arch code to be done with that
> special knowledge of where specifically in that process they are done.
If they're done that early, of course they have to know where
they are because they only get to use a subset of kernel
services depending exactly on what has already been done.
> Especially since it may change.
"it" meaning the ability to reclaim memory? Not really. Not a
significant amount of memory may be reclaimed really until
after init process starts running.
> Additionally, I believe the flag test/masking can be moved easily enough
> out of the fast path... slub shouldn't need it there afaik and if it's
> pushed down into the allocation of new slab's then it shouldn't be a big
> deal.
Given that things have been apparently coping fine so far, I
think it will be a backward step to just give up now and say
it is too hard simply because slab is available to use slightly
earlier.
It's not that the world is going to come to an end if we
can't remove the masking, but just maybe the information
can be used in future to avoid adding more overhead, or
maybe some other debugging features can be added or something.
I just think it is cleaner to go that way if possible, and
claiming that callers can't be expected to know what context
they call the slab allocator from just sounds like a
contradiction to me.
> > ---
> > mm/slub.c | 17 ++++++++++++++---
> > 1 file changed, 14 insertions(+), 3 deletions(-)
> >
> > Index: linux-2.6/mm/slub.c
> > ===================================================================
> > --- linux-2.6.orig/mm/slub.c
> > +++ linux-2.6/mm/slub.c
> > @@ -2596,6 +2596,7 @@ static noinline struct kmem_cache *dma_k
> > struct kmem_cache *s;
> > char *text;
> > size_t realsize;
> > + unsigned long slabflags;
> >
> > s = kmalloc_caches_dma[index];
> > if (s)
> > @@ -2617,9 +2618,18 @@ static noinline struct kmem_cache *dma_k
> > (unsigned int)realsize);
> > s = kmalloc(kmem_size, flags & ~SLUB_DMA);
> >
> > + /*
> > + * Must defer sysfs creation to a workqueue because we don't know
> > + * what context we are called from. Before sysfs comes up, we don't
> > + * need to do anything because slab_sysfs_init will start by
> > + * adding all existing slabs to sysfs.
> > + */
> > + slabflags = SLAB_CACHE_DMA;
> > + if (slab_state >= SYSFS)
> > + slabflags |= __SYSFS_ADD_DEFERRED;
> > +
> > if (!s || !text || !kmem_cache_open(s, flags, text,
> > - realsize, ARCH_KMALLOC_MINALIGN,
> > - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
> > + realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
> > kfree(s);
> > kfree(text);
> > goto unlock_out;
> > @@ -2628,7 +2638,8 @@ static noinline struct kmem_cache *dma_k
> > list_add(&s->list, &slab_caches);
> > kmalloc_caches_dma[index] = s;
> >
> > - schedule_work(&sysfs_add_work);
> > + if (slab_state >= SYSFS)
> > + schedule_work(&sysfs_add_work);
> >
> > unlock_out:
> > up_write(&slub_lock);
>
> Looks good to me. Heiko, does it fix your case?
Yes, works fine. Thanks!
Tested-by: Heiko Carstens <[email protected]>
On Mon, 2009-06-15 at 12:20 +0200, Heiko Carstens wrote:
> > > ---
> > > mm/slub.c | 17 ++++++++++++++---
> > > 1 file changed, 14 insertions(+), 3 deletions(-)
> > >
> > > Index: linux-2.6/mm/slub.c
> > > ===================================================================
> > > --- linux-2.6.orig/mm/slub.c
> > > +++ linux-2.6/mm/slub.c
> > > @@ -2596,6 +2596,7 @@ static noinline struct kmem_cache *dma_k
> > > struct kmem_cache *s;
> > > char *text;
> > > size_t realsize;
> > > + unsigned long slabflags;
> > >
> > > s = kmalloc_caches_dma[index];
> > > if (s)
> > > @@ -2617,9 +2618,18 @@ static noinline struct kmem_cache *dma_k
> > > (unsigned int)realsize);
> > > s = kmalloc(kmem_size, flags & ~SLUB_DMA);
> > >
> > > + /*
> > > + * Must defer sysfs creation to a workqueue because we don't know
> > > + * what context we are called from. Before sysfs comes up, we don't
> > > + * need to do anything because slab_sysfs_init will start by
> > > + * adding all existing slabs to sysfs.
> > > + */
> > > + slabflags = SLAB_CACHE_DMA;
> > > + if (slab_state >= SYSFS)
> > > + slabflags |= __SYSFS_ADD_DEFERRED;
> > > +
> > > if (!s || !text || !kmem_cache_open(s, flags, text,
> > > - realsize, ARCH_KMALLOC_MINALIGN,
> > > - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
> > > + realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
> > > kfree(s);
> > > kfree(text);
> > > goto unlock_out;
> > > @@ -2628,7 +2638,8 @@ static noinline struct kmem_cache *dma_k
> > > list_add(&s->list, &slab_caches);
> > > kmalloc_caches_dma[index] = s;
> > >
> > > - schedule_work(&sysfs_add_work);
> > > + if (slab_state >= SYSFS)
> > > + schedule_work(&sysfs_add_work);
> > >
> > > unlock_out:
> > > up_write(&slub_lock);
> >
> > Looks good to me. Heiko, does it fix your case?
>
> Yes, works fine. Thanks!
>
> Tested-by: Heiko Carstens <[email protected]>
Thanks! Nick, care to send a patch with your sign-off? Does SLQB need
something like this too (I didn't check)?
Pekka
On Mon, Jun 15, 2009 at 12:57:39PM +0300, Pekka Enberg wrote:
> On Mon, 2009-06-15 at 19:51 +1000, Benjamin Herrenschmidt wrote:
> > I think the boot order is too likely to change to make it a sane thing
> > to have all call sites "know" at what point they are in the boot
> > process. In your example, what does GFP_BOOT would mean ? Before
> > scheduler is initialized ? before interrupts are on ?
>
> Btw, I think this is a pretty important point. Linus suggested trying to
> make slab initialization even earlier than what we now have. If we do
> require GFP_BOOT annotations, then we'd need to sprinkle those all over
> the place when we do that.
I don't understand. You'd be converting all these from bootmem
anyway, so where's the problem?
> So from code shuffling point of view, it's better to support GFP_KERNEL
> (almost) everywhere rather than require special annotations.
Nor this. We require special allocation annotations *everywhere*
according to context. Why is using GFP_KERNEL for unsleeping
allocations a good thing if we have GFP_ATOMIC etc?
We could also mask off __GFP_WAIT from allocations when we take
a spinlock or enter an interrupt, right?
Init code doesn't deserve to be more lazy than anybody else, and
part of the reason why such a core piece of code is so crufty
is exactly because people have been lazy there.
On Mon, Jun 15, 2009 at 01:21:44PM +0300, Pekka Enberg wrote:
> On Mon, 2009-06-15 at 12:20 +0200, Heiko Carstens wrote:
> > > > ---
> > > > mm/slub.c | 17 ++++++++++++++---
> > > > 1 file changed, 14 insertions(+), 3 deletions(-)
> > > >
> > > > Index: linux-2.6/mm/slub.c
> > > > ===================================================================
> > > > --- linux-2.6.orig/mm/slub.c
> > > > +++ linux-2.6/mm/slub.c
> > > > @@ -2596,6 +2596,7 @@ static noinline struct kmem_cache *dma_k
> > > > struct kmem_cache *s;
> > > > char *text;
> > > > size_t realsize;
> > > > + unsigned long slabflags;
> > > >
> > > > s = kmalloc_caches_dma[index];
> > > > if (s)
> > > > @@ -2617,9 +2618,18 @@ static noinline struct kmem_cache *dma_k
> > > > (unsigned int)realsize);
> > > > s = kmalloc(kmem_size, flags & ~SLUB_DMA);
> > > >
> > > > + /*
> > > > + * Must defer sysfs creation to a workqueue because we don't know
> > > > + * what context we are called from. Before sysfs comes up, we don't
> > > > + * need to do anything because slab_sysfs_init will start by
> > > > + * adding all existing slabs to sysfs.
> > > > + */
> > > > + slabflags = SLAB_CACHE_DMA;
> > > > + if (slab_state >= SYSFS)
> > > > + slabflags |= __SYSFS_ADD_DEFERRED;
> > > > +
> > > > if (!s || !text || !kmem_cache_open(s, flags, text,
> > > > - realsize, ARCH_KMALLOC_MINALIGN,
> > > > - SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
> > > > + realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
> > > > kfree(s);
> > > > kfree(text);
> > > > goto unlock_out;
> > > > @@ -2628,7 +2638,8 @@ static noinline struct kmem_cache *dma_k
> > > > list_add(&s->list, &slab_caches);
> > > > kmalloc_caches_dma[index] = s;
> > > >
> > > > - schedule_work(&sysfs_add_work);
> > > > + if (slab_state >= SYSFS)
> > > > + schedule_work(&sysfs_add_work);
> > > >
> > > > unlock_out:
> > > > up_write(&slub_lock);
> > >
> > > Looks good to me. Heiko, does it fix your case?
> >
> > Yes, works fine. Thanks!
> >
> > Tested-by: Heiko Carstens <[email protected]>
>
> Thanks! Nick, care to send a patch with your sign-off? Does SLQB need
> something like this too (I didn't check)?
No, it does not defer DMA slab creation.
--
Recent change to use slab allocations earlier exposed a bug where
SLUB can call schedule_work and try to call sysfs before it is
safe to do so.
Reported-by: Heiko Carstens <[email protected]>
Tested-by: Heiko Carstens <[email protected]>
Signed-off-by: Nick Piggin <[email protected]>
---
mm/slub.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c
+++ linux-2.6/mm/slub.c
@@ -2596,6 +2596,7 @@ static noinline struct kmem_cache *dma_k
struct kmem_cache *s;
char *text;
size_t realsize;
+ unsigned long slabflags;
s = kmalloc_caches_dma[index];
if (s)
@@ -2617,9 +2618,18 @@ static noinline struct kmem_cache *dma_k
(unsigned int)realsize);
s = kmalloc(kmem_size, flags & ~SLUB_DMA);
+ /*
+ * Must defer sysfs creation to a workqueue because we don't know
+ * what context we are called from. Before sysfs comes up, we don't
+ * need to do anything because our sysfs initcall will start by
+ * adding all existing slabs to sysfs.
+ */
+ slabflags = SLAB_CACHE_DMA;
+ if (slab_state >= SYSFS)
+ slabflags |= __SYSFS_ADD_DEFERRED;
+
if (!s || !text || !kmem_cache_open(s, flags, text,
- realsize, ARCH_KMALLOC_MINALIGN,
- SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) {
+ realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
kfree(s);
kfree(text);
goto unlock_out;
@@ -2628,7 +2638,8 @@ static noinline struct kmem_cache *dma_k
list_add(&s->list, &slab_caches);
kmalloc_caches_dma[index] = s;
- schedule_work(&sysfs_add_work);
+ if (slab_state >= SYSFS)
+ schedule_work(&sysfs_add_work);
unlock_out:
up_write(&slub_lock);
On Mon, Jun 15, 2009 at 1:31 PM, Nick Piggin<[email protected]> wrote:
> Recent change to use slab allocations earlier exposed a bug where
> SLUB can call schedule_work and try to call sysfs before it is
> safe to do so.
>
> Reported-by: Heiko Carstens <[email protected]>
> Tested-by: Heiko Carstens <[email protected]>
> Signed-off-by: Nick Piggin <[email protected]>
Applied, thanks!
> Why? The best reason to use slab allocator is that the allocations
> are much more efficient and also can be freed later.
I think you are making the mistake of reasoning too much in term of
implementation of the allocator itself, and not enough in term of the
consistency of the API exposed to the rest of the kernel.
I think the current approach is a good compromise. If you can make it
more optimal (by pushing the masking in slow path for example), then go
for it, but from an API standpoint, I don't like having anybody who
needs to allocate memory have to know about seemingly unrelated things
such as whether interrupts have been enabled globally yet, scheduler
have been initialized, or whatever else we might stumble upon.
> > I think the boot order is too likely to change to make it a sane thing
> > to have all call sites "know" at what point they are in the boot
> > process.
>
> I disagree.
How so ? IE. We are changing things in the boot order today, and I'm
expecting things to continue to move in that area. I believe it's going
to be endless headaches and breakage if we have to get the right set of
flags on every caller.
In addition, there's the constant issue of code that can be called both
at boot and non-boot time and shouldn't have to know where it has been
called from, while wanting to make allocations, such as get_vm_area().
I don't think it will make anybody's life better to push out the "boot
state" up into those APIs, duplicating them, etc...
> > In your example, what does GFP_BOOT would mean ? Before
> > scheduler is initialized ? before interrupts are on ?
>
> Before initcalls is probably easiest. But it really does not
> matter that much. Why? Because if we run out of memory before
> then, then there is not going to be anything to reclaim
> anyway.
Precisely. It -doesn't matter- (to the caller). So why make it matter in
term of API ? There's a whole bunch of things in arch code or subsystems
that really don't have any business knowing in what context or at what
time they have been called.
> > There's just too much stuff involved and we don't want random
> > allocations in various subsystem or arch code to be done with that
> > special knowledge of where specifically in that process they are done.
>
> If they're done that early, of course they have to know where
> they are because they only get to use a subset of kernel
> services depending exactly on what has already been done.
To a certain extent, yes. But not -that- much, especially when it comes
to a very basic service such as allocating memory.
> > Especially since it may change.
>
> "it" meaning the ability to reclaim memory? Not really. Not a
> significant amount of memory may be reclaimed really until
> after init process starts running.
How much stuff allocated during boot needs to be reclaimed ?
>
> > Additionally, I believe the flag test/masking can be moved easily enough
> > out of the fast path... slub shouldn't need it there afaik and if it's
> > pushed down into the allocation of new slab's then it shouldn't be a big
> > deal.
>
> Given that things have been apparently coping fine so far, I
> think it will be a backward step to just give up now and say
> it is too hard simply because slab is available to use slightly
> earlier.
Things have been coping thanks to horrors such as
if (slab_is_available())
kmalloc
else
alloc_bootmem.
Now you are proposing to change that into
if (whatever_are_we_talking_about())
kmalloc(... GFP_KERNEL)
else
kmalloc(... GFP_BOOT)
Not a very big improvement in my book :-)
> It's not that the world is going to come to an end if we
> can't remove the masking, but just maybe the information
> can be used in future to avoid adding more overhead, or
> maybe some other debugging features can be added or something.
> I just think it is cleaner to go that way if possible, and
> claiming that callers can't be expected to know what context
> they call the slab allocator from just sounds like a
> contradiction to me.
I agree with the general principle of pushing state information out to
the caller as much as possible. But like all principles, there are
meaningful exceptions and I believe this is a good example of one.
Cheers,
Ben.
On Mon, 2009-06-15 at 12:27 +0200, Nick Piggin wrote:
>
> > So from code shuffling point of view, it's better to support
> GFP_KERNEL
> > (almost) everywhere rather than require special annotations.
>
> Nor this. We require special allocation annotations *everywhere*
> according to context. Why is using GFP_KERNEL for unsleeping
> allocations a good thing if we have GFP_ATOMIC etc?
GFP_ATOMIC is a good point in fact. We -could- make it unnecessary since
we can pretty much tell nowadays whether we are in atomic context or
not.
So where do we put the line ? The subtle differences here are, imho:
- It's good to discourage allocations in atomic contexts. So requiring
GFP_ATOMIC to some extent helps make people think twice about what they are
doing.
- Code that requires GFP_ATOMIC is generally code that knows pretty
well in which context it's running, because most of the time, it's the
same piece of code or subsystem that is -causing- the context to be
atomic (because it's an interrupt handler or because it used a
spinlock). The case of boot time is subtly different. We have a whole
bunch of things that will be called both at boot time and later, without
necessarily knowing where they have been called from. IE. The call
traces are longer if you want in the typical boot context than in the
typical atomic context.
It all boils down to a fine line to be found here between the two
options, and my gut feeling is that we should keep GFP_ATOMIC explicit,
but avoid the need to provide an explicit API for boot time allocations.
> We could also mask off __GFP_WAIT from allocations when we take
> a spinlock or enter an interrupt, right?
We -could-. Whether we want to is to be debated. Also the overhead of
doing so would be higher I believe.
> Init code doesn't deserve to be more lazy than anybody else, and
> part of the reason why such a core piece of code is so crufty
> is exactly because people have been lazy there.
I think the main problem isn't necessarily init code per se, but the
pile of -common- code that can be called both at init time and later.
Cheers,
Ben.
On Mon, Jun 15, 2009 at 08:39:48PM +1000, Benjamin Herrenschmidt wrote:
>
> > Why? The best reason to use slab allocator is that the allocations
> > are much more efficient and also can be freed later.
>
> I think you are making the mistake of reasoning too much in term of
> implementation of the allocator itself, and not enough in term of the
> consistency of the API exposed to the rest of the kernel.
No I just think giving accurate context information is better
than giving inaccurate and then fixing it post facto by testing
some global flag.
> I think the current approach is a good compromise. If you can make it
> more optimal (by pushing the masking in slow path for example), then go
> for it, but from an API standpoint, I don't like having anybody who
> needs to allocate memory have to know about seemingly unrelated things
> such as whether interrupts have been enabled globally yet, scheduler
> have been initialized, or whatever else we might stumble upon.
"pre initcalls" is pretty fair. Before then you are talking about
pretty specialised boot code that has to know a lot of context
about what other subsystems are up anyway.
> > > I think the boot order is too likely to change to make it a sane thing
> > > to have all call sites "know" at what point they are in the boot
> > > process.
> >
> > I disagree.
>
> How so ? IE. We are changing things in the boot order today, and I'm
> expecting things to continue to move in that area. I believe it's going
> to be endless headaches and breakage if we have to get the right set of
> flags on every caller.
If you replace alloc_bootmem with kmalloc(GFP_KERNEL), then what do
you expect to happen? What would be the problem with using GFP_ATOMIC
like everybody else does?
It's not like it's constantly changing or something that cannot be
detected and warned about. How many times has it moved in the entire
2.5/2.6 series I wonder? Once would be my guess.
> In addition, there's the constant issue of code that can be called both
> at boot and non-boot time and shouldn't have to know where it has been
> called from, while wanting to make allocations, such as get_vm_area().
>
> I don't think it will make anybody's life better to push out the "boot
> state" up into those APIs, duplicating them, etc...
I don't see that one example being a significant or common problem.
> > > In your example, what does GFP_BOOT would mean ? Before
> > > scheduler is initialized ? before interrupts are on ?
> >
> > Before initcalls is probably easiest. But it really does not
> > matter that much. Why? Because if we run out of memory before
> > then, then there is not going to be anything to reclaim
> > anyway.
>
> Precisely. It -doesn't matter- (to the caller). So why make it matter in
> term of API ? There's a whole bunch of things in arch code or subsystems
> that really don't have any business knowing in what context or at what
> time they have been called.
Wait, what? The context does of course still matter because we don't
want to, for example, reenable interrupts.
"it does not matter" when I said it means that it is not critical
the exact where we would decide to mandate the use of GFP_BOOT.
> > > There's just too much stuff involved and we don't want random
> > > allocations in various subsystem or arch code to be done with that
> > > special knowledge of where specifically in that process they are done.
> >
> > If they're done that early, of course they have to know where
> > they are because they only get to use a subset of kernel
> > services depending exactly on what has already been done.
>
> To a certain extent, yes. But not -that- much, especially when it comes
> to a very basic service such as allocating memory.
Disagree. See bootmem.
> > > Especially since it may change.
> >
> > "it" meaning the ability to reclaim memory? Not really. Not a
> > significant amount of memory may be reclaimed really until
> > after init process starts running.
>
> How much stuff allocated during boot needs to be reclaimed ?
IIRC the last one I wrote may have been sched-domains or something.
And not just reclaimed but also efficiency of allocation. And it
actually does matter a few hundred bytes even on tiny systems.
> > > Additionally, I believe the flag test/masking can be moved easily enough
> > > out of the fast path... slub shouldn't need it there afaik and if it's
> > > pushed down into the allocation of new slab's then it shouldn't be a big
> > > deal.
> >
> > Given that things have been apparently coping fine so far, I
> > think it will be a backward step to just give up now and say
> > it is too hard simply because slab is available to use slightly
> > earlier.
>
> Things have been coping thanks to horrors such as
>
> if (slab_is_available())
> kmalloc
> else
> alloc_bootmem.
>
> Now you are proposing to change that into
>
> if (whatever_are_we_talking_about())
> kmalloc(... GFP_KERNEL)
> else
> kmalloc(... GFP_BOOT)
>
> Not a very big improvement in my book :-)
Of core code, we have one in kernel/params.c that cannot go away,
one in kernel/profile.c that goes away regardless, one in lib/cpumask.c
that can probably just go away, ones in mm/page_alloc.c and page_cgroup.c
that cannot go away, and some in the memory model code that cannot
go away.
None of them would transform from your sequence A to B.
The rest of the callers are in s390, which probably tells you something
about the state of their init code. Don't know enough to know whether
your concerns hold there or not though.
But I think this is just scare mongering ;) If we actually look at
the code on a case by case basis then I think it won't be too much
problem.
> > It's not that the world is going to come to an end if we
> > can't remove the masking, but just maybe the information
> > can be used in future to avoid adding more overhead, or
> > maybe some other debugging features can be added or something.
> > I just think it is cleaner to go that way if possible, and
> > claiming that callers can't be expected to know what context
> > they call the slab allocator from just sounds like a
> > contradiction to me.
>
> I agree with the general principle of pushing state information out to
> the caller as much as possible. But like all principles, there are
> meaningful exceptions and I believe this is a good example of one.
And I'm willing to accept some burden on core services to
relieve callers of hard work, but I don't see the big problem
with boot code.
Code that is called from both sleeping and non-sleeping context
for example is *far far* more common than code that is called
from both early boot and normal running situation.
The preferred solution we take is not to test if (in_interrupt())
or other hacks like that, but simply to have the caller pass in
that information. Really, why does that not work other than
laziness?
I believe the principle of having hacks in the boot code like
that which is different to our normal idea of clean code is
a big part of the problem of why it is so ugly there. I can
live with that because it is well tested and not performance
critical code (and I don't usually have to ever look at it).
But I won't live with having it shit in our nice core code...
Well, at least I won't throw up my hands and give up this
early.
If we are talking about something that makes life of regular
driver/fs/subsystem/etc writer easier then I would give it much
more consideration. But we're talking about adding hacks in
the core allocators because we're too lazy to write early boot
code that even knows what context it is in...
On Mon, Jun 15, 2009 at 08:45:27PM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2009-06-15 at 12:27 +0200, Nick Piggin wrote:
> > Init code doesn't deserve to be more lazy than anybody else, and
> > part of the reason why such a core piece of code is so crufty
> > is exactly because people have been lazy there.
>
> I think the main problem isn't necessarily init code per se, but the
> pile of -common- code that can be called both at init time and later.
Just seems a bogus argument. Everywhere else that does this (ie.
allocations that are called from multiple allocation contexts)
passes correct gfp flags down.
On Mon, Jun 15, 2009 at 01:22:05PM +0200, Nick Piggin wrote:
> On Mon, Jun 15, 2009 at 08:39:48PM +1000, Benjamin Herrenschmidt wrote:
> But I won't live with having it shit in our nice core code...
> Well, at least I won't throw up my hands and give up this
> early.
Just the principle, btw. The cost of this one load and
branch (when moved out to the slowpath -- in the fastpath
then adding a single cycle is like a 1% slowdown you know)
is not prohibitive, but it's a slippery slope.
If a couple of specialised early boot code places have
to test slab_is_available() to prevent crap leaking into
our main allocators, then that's quite fine by me.
On Mon, Jun 15, 2009 at 01:28:28PM +0200, Nick Piggin wrote:
> On Mon, Jun 15, 2009 at 01:22:05PM +0200, Nick Piggin wrote:
> > On Mon, Jun 15, 2009 at 08:39:48PM +1000, Benjamin Herrenschmidt wrote:
> > But I won't live with having it shit in our nice core code...
> > Well, at least I won't throw up my hands and give up this
> > early.
>
> Just the principle, btw.
I have the same opinion for suspend/resume too, although
in that case I know less about the issues and if we
found that it indeed does make a random driver writers
life easier[*] then it might be a reason to do this. But
I still don't think that would give boot code a license to
just revert back to "I don't know or care, GFP_KERNEL please"
[*] and note that being unaware of your context I don't
think is making life easier automatically.
On Mon, 15 Jun 2009, Nick Piggin wrote:
> On Mon, Jun 15, 2009 at 08:45:27PM +1000, Benjamin Herrenschmidt wrote:
> > On Mon, 2009-06-15 at 12:27 +0200, Nick Piggin wrote:
> > > Init code doesn't deserve to be more lazy than anybody else, and
> > > part of the reason why such a core piece of code is so crufty
> > > is exactly because people have been lazy there.
> >
> > I think the main problem isn't necessarily init code per se, but the
> > pile of -common- code that can be called both at init time and later.
>
> Just seems a bogus argument. Everywhere else that does this (ie.
> allocations that are called from multiple allocation contexts)
> passes correct gfp flags down.
Fair enough that you jealously defend SL?B code from onslaught, but
FWIW I strongly agree with Ben on all this. I cannot see the point
of the pain of moving around SL?B versus bootmem, if we immediately
force such a distinction (differently dressed) upon their users again.
I fully agree with Ben that it's the job of the allocator to provide
a service, and part of that job to understand its own limitations at
different stages of running.
Hugh
Hi Hugh,
On Mon, 2009-06-15 at 13:38 +0100, Hugh Dickins wrote:
> Fair enough that you jealously defend SL?B code from onslaught, but
> FWIW I strongly agree with Ben on all this. I cannot see the point
> of the pain of moving around SL?B versus bootmem, if we immediately
> force such a distinction (differently dressed) upon their users again.
I'm fine with the current approach but I don't think this is completely
accurate. Passing a GFP flag from top to bottom (like we do in existing
code) is pretty natural compared to passing a "boot" boolean or using
system_state checks to switch between kmalloc() and bootmem_alloc().
So even with a GFP_BOOT flag, I do see advantages in being able to use
kmalloc() et al almost universally.
Pekka
On Sat, 13 Jun 2009, Benjamin Herrenschmidt wrote:
>
> > Adds code to hot code path.
>
> A single mask which should be mostly in the noise...
>
> > The allocators mask flags passed to the page allocator through
> > GFP_RECLAIM_MASK|GFP_CONSTRAINT_MASK. This is done outside of the critical
> > code paths.
> >
> > The might_sleep issues may be fixed by adding another check to
> > __might_sleep().
>
> That is not enough because slab and slub explicitely re-enable
> interrupts when __GFP_WAIT is set, which is the problem we need to solve
> in the first place, so we need to do the masking there.
If you mask the bit then both wont reenable interrupts.
On Sun, 14 Jun 2009, Pekka J Enberg wrote:
> How about something like this? There should be no extra code in fastpaths
> for production configs with this one.
Yes something like that would be good. More comments below.
> index 4d6004c..5e8cea1 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1613,6 +1613,8 @@ another_slab:
> deactivate_slab(s, c);
>
> new_slab:
> + gfpflags &= slab_gfp_mask;
> +
Move the processing of GFP_RECLAIM_MASK etc up to here from new_slab? Then
the flow is also more logical. The flags handling is concentrated in one
spot in the allocator and its more obvious how we handle gfp flags.
> @@ -1668,13 +1670,14 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
> struct kmem_cache_cpu *c;
> unsigned long flags;
> unsigned int objsize;
> + gfp_t real_gfp;
>
> - gfpflags &= slab_gfp_mask;
> + real_gfp = gfpflags & slab_gfp_mask;
>
> - lockdep_trace_alloc(gfpflags);
> - might_sleep_if(gfpflags & __GFP_WAIT);
> + lockdep_trace_alloc(real_gfp);
> + might_sleep_if(real_gfp & __GFP_WAIT);
>
> - if (should_failslab(s->objsize, gfpflags))
> + if (should_failslab(s->objsize, real_gfp))
> return NULL;
>
> local_irq_save(flags);
Dont do it there. Only modify the slow path.
Look at __might_sleep(). It already has an exception for system_state !=
RUNNING. If it still triggers then add to the condition there.
On Mon, 2009-06-15 at 10:55 -0400, Christoph Lameter wrote:
> On Sun, 14 Jun 2009, Pekka J Enberg wrote:
>
> > How about something like this? There should be no extra code in fastpaths
> > for production configs with this one.
>
> Yes something like that would be good. More comments below.
>
> > index 4d6004c..5e8cea1 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -1613,6 +1613,8 @@ another_slab:
> > deactivate_slab(s, c);
> >
> > new_slab:
> > + gfpflags &= slab_gfp_mask;
> > +
>
> Move the processing of GFP_RECLAIM_MASK etc up to here from new_slab? Then
> the flow is also more logical. The flags handling is concentrated in one
> spot in the allocator and its more obvious how we handle gfp flags.
Sure. Will fix.
> > @@ -1668,13 +1670,14 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
> > struct kmem_cache_cpu *c;
> > unsigned long flags;
> > unsigned int objsize;
> > + gfp_t real_gfp;
> >
> > - gfpflags &= slab_gfp_mask;
> > + real_gfp = gfpflags & slab_gfp_mask;
> >
> > - lockdep_trace_alloc(gfpflags);
> > - might_sleep_if(gfpflags & __GFP_WAIT);
> > + lockdep_trace_alloc(real_gfp);
> > + might_sleep_if(real_gfp & __GFP_WAIT);
> >
> > - if (should_failslab(s->objsize, gfpflags))
> > + if (should_failslab(s->objsize, real_gfp))
> > return NULL;
> >
> > local_irq_save(flags);
>
> Dont do it there. Only modify the slow path.
>
> Look at __might_sleep(). It already has an exception for system_state !=
> RUNNING. If it still triggers then add to the condition there.
But does this matter? When the debugging options are turned off, there
are no users for "real_gfp" and thus GCC optimizes everything away. For
debugging configs, the extra cacheline load doesn't matter, does it?
On Mon, 15 Jun 2009, Pekka Enberg wrote:
> > Dont do it there. Only modify the slow path.
> >
> > Look at __might_sleep(). It already has an exception for system_state !=
> > RUNNING. If it still triggers then add to the condition there.
>
> But does this matter? When the debugging options are turned off, there
> are no users for "real_gfp" and thus GCC optimizes everything away. For
> debugging configs, the extra cacheline load doesn't matter, does it?
It's cleaner to have the fastpath as small as possible. Having unused
variables in there is a bit confusing.
Also the path is performance critical. That may not matter for the debug
case in some situations. But there are people that keep the debugging
options on. Better to limit the impact as much as possible.
And the "extra cacheline load does not matter" reasoning can only be
applied so many times. An extra cacheline load increases the cache foot
print of the fast path after all.
Hi Christoph,
On Mon, 2009-06-15 at 11:05 -0400, Christoph Lameter wrote:
> On Mon, 15 Jun 2009, Pekka Enberg wrote:
>
> > > Dont do it there. Only modify the slow path.
> > >
> > > Look at __might_sleep(). It already has an exception for system_state !=
> > > RUNNING. If it still triggers then add to the condition there.
> >
> > But does this matter? When the debugging options are turned off, there
> > are no users for "real_gfp" and thus GCC optimizes everything away. For
> > debugging configs, the extra cacheline load doesn't matter, does it?
>
> It's cleaner to have the fastpath as small as possible. Having unused
> variables in there is a bit confusing.
OK, I can clean this up, no problem.
On Mon, 2009-06-15 at 11:05 -0400, Christoph Lameter wrote:
> Also the path is performance critical. That may not matter for the debug
> case in some sitiations. But there are people that keep the debugging
> options on. Better to limit the impact as much as possible.
>
> And the "extra cacheline load does not matter" reasoning can only be
> applied so many times. An extra cacheline load increases the cache foot
> print of the fast path after all.
The point here is that the actual debugging hooks do a whole lot more
than cacheline load + alu op even in the "fastpath" case (if such a
thing exists for those).
Pekka
Hi Christoph,
On Mon, Jun 15, 2009 at 6:11 PM, Pekka Enberg<[email protected]> wrote:
>> > But does this matter? When the debugging options are turned off, there
>> > are no users for "real_gfp" and thus GCC optimizes everything away. For
>> > debugging configs, the extra cacheline load doesn't matter, does it?
>>
>> It's cleaner to have the fastpath as small as possible. Having unused
>> variables in there is a bit confusing.
>
> OK, I can clean this up, no problem.
Actually, there's a slight complication here. If I push gfp mask to
__might_sleep(), lockdep_trace_alloc() and so on, the mask is
effective _everywhere_ even outside of slab. Yes, it makes sense if we
push the masking right down to the page allocator but I wonder if
that's something we want to do at this point?
Pekka
On Mon, 15 Jun 2009, Pekka Enberg wrote:
> The point here is that the actual debugging hooks do a whole lot more
> than cacheline load + alu op even in the "fastpath" case (if such a
> thing exists for those).
Then it is best to confine additional boot time checking to
those debugging function.
On Mon, 15 Jun 2009, Pekka Enberg wrote:
> > OK, I can clean this up, no problem.
>
> Actually, there's a slight complication here. If I push gfp mask to
> __might_sleep(), lockdep_trace_alloc() and so on, the mask is
> effective _everywhere_ even outside of slab. Yes, it makes sense if we
> push the masking right down to the page allocator but I wonder if
> that's something we want to do at this point?
__might_sleep just should not trigger right? The mask does not need to be
passed. __might_sleep may be called uselessly during bootup if __GFP_WAIT
is set. But it should not trigger any output. Look at the initial
statements of __might_sleep: They are already prepared to simply return in
the early boot case.
On Mon, 2009-06-15 at 11:51 -0400, Christoph Lameter wrote:
> On Mon, 15 Jun 2009, Pekka Enberg wrote:
>
> > > OK, I can clean this up, no problem.
> >
> > Actually, there's a slight complication here. If I push gfp mask to
> > __might_sleep(), lockdep_trace_alloc() and so on, the mask is
> > effective _everywhere_ even outside of slab. Yes, it makes sense if we
> > push the masking right down to the page allocator but I wonder if
> > that's something we want to do at this point?
>
> __might_sleep just should not trigger right? The mask does not need to be
> passed. __might_sleep may be called uselessly during bootup if __GFP_WAIT
> is set. But it should not trigger any output. Look at the initial
> statements of __might_sleep: They are already prepared to simply return in
> the early boot case.
Oh, yeah, you're right about __might_sleep() so we can just ignore that.
But we still have bit of a problem for should_failslab() and
__lockdep_trace_alloc(). We might be able to deal with the former by
adding system_state check but for the latter, we need to mask the gfp
flags. Hmm.
Pekka
On Mon, 15 Jun 2009, Pekka Enberg wrote:
> But we still have bit of a problem for should_failslab() and
> __lockdep_trace_alloc(). We might be able to deal with the former by
> adding system_state check but for the latter, we need to mask the gfp
> flags. Hmm.
These issues will appear in multiple places in the core code. Its best to
fix them within these functions.
On Mon, 15 Jun 2009, Pekka Enberg wrote:
>
> Actually, there's a slight complication here. If I push gfp mask to
> __might_sleep(), lockdep_trace_alloc() and so on, the mask is
> effective _everywhere_ even outside of slab. Yes, it makes sense if we
> push the masking right down to the page allocator but I wonder if
> that's something we want to do at this point?
This actually doesn't sound like a complication to me, but a potential
cleanup.
Right now we already have that magic "system_state" test in __might_sleep.
Maybe we could get rid of that, and replace it with that test for gpf
bits. So we'd have just _one_ magic special case, and it's directly
related to memory allocation (which is really the reason for that system
state thing too).
But maybe we have other reasons for that system_state special case, that
are independent. I have not checked.
Linus
Hi Linus,
On Mon, 15 Jun 2009, Pekka Enberg wrote:
>> Actually, there's a slight complication here. If I push gfp mask to
>> __might_sleep(), lockdep_trace_alloc() and so on, the mask is
>> effective _everywhere_ even outside of slab. Yes, it makes sense if we
>> push the masking right down to the page allocator but I wonder if
>> that's something we want to do at this point?
On Mon, Jun 15, 2009 at 8:15 PM, Linus
Torvalds<[email protected]> wrote:
> This actually doesn't sound like a complication to me, but a potential
> cleanup.
>
> Right now we already have that magic "system_state" test in __might_sleep.
Maybe we could get rid of that, and replace it with that test for gfp
> bits. So we'd have just _one_ magic special case, and it's directly
> related to memory allocation (which is really the reason for that system
> state thing too).
>
> But maybe we have other reasons for that system_state special case, that
> are independent. I have not checked.
I'll double-check this but I think we can do that. The only
problematic one is __lockdep_trace_alloc() which is used by the page
allocator and the slab allocator but we only want to do masking in the
latter one. But I guess we can just introduce a
lockdep_trace_slab_alloc() or something that does it before calling
the internal function.
Pekka
On Mon, 2009-06-15 at 13:23 +0200, Nick Piggin wrote:
> > I think the main problem isn't necessarily init code per se, but the
> > pile of -common- code that can be called both at init time and
> later.
>
> Just seems bogus argument. Everwhere else that does this (ie.
> allocations that are called from multiple allocation contexts)
> passes correct gfp flags down.
So you say we should create new variants of all these APIs that take gfp
flags as arguments just because they might be called early during boot :
- All the vmalloc interfaces (__get_vm_area() and it's 5 or 6 variants)
- Allocation of PCI host bridges data structures in the powerpc code
- Allocation of interrupt controller domains in the powerpc code
- Page table allocations (oops ... can't change that arch specific,
would have to be a generic change)
- ioremap (which call both __get_vm_area() and page table allocs)
- ...
Are you just insane ? :-)
Cheers,
Ben.
On Mon, 2009-06-15 at 13:38 +0200, Nick Piggin wrote:
> On Mon, Jun 15, 2009 at 01:28:28PM +0200, Nick Piggin wrote:
> > On Mon, Jun 15, 2009 at 01:22:05PM +0200, Nick Piggin wrote:
> > > On Mon, Jun 15, 2009 at 08:39:48PM +1000, Benjamin Herrenschmidt wrote:
> > > But I won't live with having it shit in our nice core code...
> > > Well, at least I won't throw up my hands and give up this
> > > early.
> >
> > Just the principle, btw.
>
> I have the same opinion for suspend/resume too, although
> in that case I know less about the issues and if we
> found that it indeed does make a random driver writers
> life easier[*] then it might be a reason to do this. But
> I still don't think that would give boot code a license to
> just revert back to "I don't know or care, GFP_KERNEL please"
>
> [*] and note that being unaware of your context I don't
> think is making life easier automatically.
The suspend/resume case is even worse ... because drivers don't know,
and don't have to.
IE. We are talking here about pretty much -any- kmalloc in the kernel,
you don't seem to understand that.
The problem here is that driver A has suspended and happen to be on the
swapout path. driver B hasn't been suspended yet, and potentially
doesn't even know there's a suspend/resume cycle in progress.
Now, driver B, while holding for example one of its internal mutexes,
calls something that calls something that does a kmalloc(GFP_KERNEL) ...
The later will potentially block forever (or at least until resume)
because the allocator may try to swap something out to devices driven by
driver A while it's suspended.
Now, driver B suspend() is called, which tries to take the above
mutex... kaboom.
Yes, we -could- probably try to invent some scheme for block devices to
"teach" upper layers that they are being suspended. That would cover
some of the cases and would probably not be done properly for 10 kernel
versions to come... Or we can make all kmalloc() degrade automatically
to GFP_NOIO when suspend is started.
Which one is more likely to actually work ? :-)
Cheers,
Ben.
On Mon, 2009-06-15 at 13:28 +0200, Nick Piggin wrote:
> If a couple of specialised early boot code places have
> to test slab_is_available() to prevent crap leaking into
> our main allocators, then that's quite fine by me.
slab_is_available() will not fix the problem in that case. In fact,
existing tests of slab_is_available() have been broken because they
assumed that when it was true, then GFP_KERNEL would work.
Ben.
On Tue, Jun 16, 2009 at 07:37:22AM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2009-06-15 at 13:38 +0200, Nick Piggin wrote:
> > On Mon, Jun 15, 2009 at 01:28:28PM +0200, Nick Piggin wrote:
> > > On Mon, Jun 15, 2009 at 01:22:05PM +0200, Nick Piggin wrote:
> > > > On Mon, Jun 15, 2009 at 08:39:48PM +1000, Benjamin Herrenschmidt wrote:
> > > > But I won't live with having it shit in our nice core code...
> > > > Well, at least I won't throw up my hands and give up this
> > > > early.
> > >
> > > Just the principle, btw.
> >
> > I have the same opinion for suspend/resume too, although
> > in that case I know less about the issues and if we
> > found that it indeed does make a random driver writers
> > life easier[*] then it might be a reason to do this. But
> > I still don't think that would give boot code a license to
> > just revert back to "I don't know or care, GFP_KERNEL pelase"
> >
> > [*] and note that being unaware of your context I don't
> > think is making life easier automatically.
>
> The suspend/resume case is even worse ... because drivers don't know,
> and don't have to.
I said suspend/resume I have the same opinion. The difference
there is that these kind of allocator hacks _could_ possibly
help regular driver writers and such. Which I said is a much
better case than helping early boot code.
On Tue, Jun 16, 2009 at 07:31:38AM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2009-06-15 at 13:23 +0200, Nick Piggin wrote:
> > > I think the main problem isn't necessarily init code per se, but the
> > > pile of -common- code that can be called both at init time and
> > later.
> >
> > Just seems bogus argument. Everwhere else that does this (ie.
> > allocations that are called from multiple allocation contexts)
> > passes correct gfp flags down.
>
> So you say we should create new variants of all these APIs that take gfp
> flags as arguments just because they might be called early during boot :
No, just create the ones that actually are called in early boot.
> - All the vmalloc interfaces (__get_vm_area() and it's 5 or 6 variants)
> - Allocation of PCI host bridges data structures in the powerpc code
> - Allocation of interrupt controller domains in the powerpc code
> - Page table allocations (oops ... can't change that arch specific,
> would have to be a generic change)
No it would not. If an arch (eg s390) does this in early boot, then
it can create its own allocation function which takes a gfp mask, and
define the generic one to just pass it a GFP_KERNEL. generic code does
not call this in early boot of course.
> - ioremap (which call both __get_vm_area() and page table allocs)
> - ...
>
> Are you just insane ? :-)
I think so, but that's besides the point ;)
On Mon, Jun 15, 2009 at 01:38:12PM +0100, Hugh Dickins wrote:
> On Mon, 15 Jun 2009, Nick Piggin wrote:
> > On Mon, Jun 15, 2009 at 08:45:27PM +1000, Benjamin Herrenschmidt wrote:
> > > On Mon, 2009-06-15 at 12:27 +0200, Nick Piggin wrote:
> > > > Init code doesn't deserve to be more lazy than anybody else, and
> > > > part of the reason why such a core piece of code is so crufty
> > > > is exactly because people have been lazy there.
> > >
> > > I think the main problem isn't necessarily init code per se, but the
> > > pile of -common- code that can be called both at init time and later.
> >
> > Just seems bogus argument. Everwhere else that does this (ie.
> > allocations that are called from multiple allocation contexts)
> > passes correct gfp flags down.
>
> Fair enough that you jealously defend SL?B code from onslaught, but
> FWIW I strongly agree with Ben on all this. I cannot see the point
> of the pain of moving around SL?B versus bootmem, if we immediately
> force such a distinction (differently dressed) upon their users again.
Well, it is a good idea for other reasons too, not just to
relieve the author of the distinction.
But the distinction now is much smaller. It is not a case of
allocfn()
{
if (system_state == something crazy)
alloc_bootmem
else
kmalloc
}
freefn()
{
if (object was kmalloced)
kfree
}
It is now this:
allocfn(gfp)
{
kmalloc(gfp)
}
> I fully agree with Ben that it's the job of the allocator to provide
> a service, and part of that job to understand its own limitations at
> different stages of running.
Yes but that's heavily qualified. As I said, we already require
a lot of knowledge of context passed in to it. I have no interest
in adding code to make *early boot* code not have to care about
that, especially because everybody else certainly has to know
whether they are calling the allocator with interrupts disabled
or a lock held etc.
To be clear about this: the allocator is fully serviceable and
no different to normal system running at this point. The
difference for example is that code runs with interrupts off
but incorrectly uses GFP_KERNEL for allocations. This is a
blatant bug in any other kernel code, I don't know why boot
code is OK to be horrible and wrong and work around it with
the equally horrible system_state (and this gfp mask which is
just system_state under another name).
I just don't want to use this slab fix in question to be a
license to throw away and forget all about any context information
in the early boot code because someone asserts "it will make the
code better". I'm fine with the slab change for now, but let's
try to retain context information as well.
If somebody comes up with a patch to remove thousands of lines
of boot code by ignoring context, then I might concede the
point and we could remove the context annotations.
On Tue, 2009-06-16 at 06:46 +0200, Nick Piggin wrote:
> On Tue, Jun 16, 2009 at 07:31:38AM +1000, Benjamin Herrenschmidt wrote:
> > On Mon, 2009-06-15 at 13:23 +0200, Nick Piggin wrote:
> > > > I think the main problem isn't necessarily init code per se, but the
> > > > pile of -common- code that can be called both at init time and
> > > later.
> > >
> > > Just seems bogus argument. Everwhere else that does this (ie.
> > > allocations that are called from multiple allocation contexts)
> > > passes correct gfp flags down.
> >
> > So you say we should create new variants of all these APIs that take gfp
> > flags as arguments just because they might be called early during boot :
>
> No, just create the ones that actually are called in early boot.
About all of the ones below and more :-) It all depends what you define
by "early" though :-)
> > - All the vmalloc interfaces (__get_vm_area() and it's 5 or 6 variants)
> > - Allocation of PCI host bridges data structures in the powerpc code
> > - Allocation of interrupt controller domains in the powerpc code
> > - Page table allocations (oops ... can't change that arch specific,
> > would have to be a generic change)
>
> No it would not. If an arch (eg s390) does this in early boot, then
> it can create its own allocation function which takes a gfp mask, and
> define the generic one to just pass it a GFP_KERNEL. generic code does
> not call this in early boot of course.
Right, which means that ioremap needs a special code path early boot
since it uses this etc...
We are basically adding special-cases in a whole lot of places, which
could -ALL- be removed just by having the allocator do the "right
thing" :-)
> > - ioremap (which call both __get_vm_area() and page table allocs)
> > - ...
> >
> > Are you just insane ? :-)
>
> I think so, but that's besides the point ;)
Allright, I concede that :-)
Cheers,
Ben.
On Tue, 2009-06-16 at 06:57 +0200, Nick Piggin wrote:
>
> Yes but that's heavily qualified. As I said, we already require
> a lot of knowledge of context passed in to it. I have no interest
> in adding code to make *early boot* code not have to care about
> that, especially because everybody else certainly has to know
> whether they are calling the allocator with interrupts disabled
> or a lock held etc.
You seem to totally ignore the argument I made to answer this specific
point in one of my previous emails. Right, they are to some extent
subjective, but I believe they have some standing. The main one is
probably that it's a lot less obvious to a lot of code where in the boot
process it's going to be called, or even if it's going to be called at
boot, later, both. This is especially true nowadays with all the talks
about shuffling more of the boot process around.
> To be clear about this: the allocator is fully servicable and
> no different to normal system running at this point. The
> difference for example is that code runs with interrupts off
> but incorrectly uses GFP_KERNEL for allocations. This is a
> blatent bug in any other kernel code, I don't know why boot
> code is OK to be horrible and wrong and work around it with
> the equally horrible system_state (and this gfp mask which is
> just system_state under another name).
Because it would be extremely impractical to have to explicitly pass
the gfp_flags around for anything that can be called at boot time. This
is as simple as that. A -lot- more impractical than requiring atomic
call sites to know what they are doing.
> I just don't want to use this slab fix in question to be a
> license to throw away and forget all about any context information
> in the early boot code because someone asserts "it will make the
> code better". I'm fine with the slab change for now, but let's
> try to retain context information as well.
But in many case it's meaningless. Again, what do you define as "boot"
is a very obscure thing here.
> If somebody comes up with a patch to remove thousands of lines
> of boot code by ignoring context, then I might concede the
> point and we could remove the context annotations.
No, we don't want to -add- thousands of lines of code :-) And we can
indeed remove a bunch of the old slab_is_available() tests too.
And no, they should not -all- be converted to NOWAIT. See vmalloc() as a
good example, I have a few more like that.
Cheers,
Ben.
On Tue, Jun 16, 2009 at 03:18:06PM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2009-06-16 at 06:46 +0200, Nick Piggin wrote:
> > On Tue, Jun 16, 2009 at 07:31:38AM +1000, Benjamin Herrenschmidt wrote:
> > > On Mon, 2009-06-15 at 13:23 +0200, Nick Piggin wrote:
> > > > > I think the main problem isn't necessarily init code per se, but the
> > > > > pile of -common- code that can be called both at init time and
> > > > later.
> > > >
> > > > Just seems bogus argument. Everwhere else that does this (ie.
> > > > allocations that are called from multiple allocation contexts)
> > > > passes correct gfp flags down.
> > >
> > > So you say we should create new variants of all these APIs that take gfp
> > > flags as arguments just because they might be called early during boot :
> >
> > No, just create the ones that actually are called in early boot.
>
> About all of the ones below and more :-) It all depends what you define
> by "early" though :-)
>
> > > - All the vmalloc interfaces (__get_vm_area() and it's 5 or 6 variants)
> > > - Allocation of PCI host bridges data structures in the powerpc code
> > > - Allocation of interrupt controller domains in the powerpc code
> > > - Page table allocations (oops ... can't change that arch specific,
> > > would have to be a generic change)
> >
> > No it would not. If an arch (eg s390) does this in early boot, then
> > it can create its own allocation function which takes a gfp mask, and
> > define the generic one to just pass it a GFP_KERNEL. generic code does
> > not call this in early boot of course.
>
> Right, which means that ioremap needs a special code path early boot
> since it uses this etc...
You could do it in parts. Having vmalloc layer test for early boot
would capture a lot of these.
> We are basically adding special-cases in a whole lot of places, which
> could -ALL- be removed just by having the allocator do the "right
> thing" :-)
I don't know that we're adding special cases. Simply setting up
slab earlier does not require anything more from anyone. In
fact it allows less special case code.
I'm just not convinced that teaching the slab allocator about
early boot context is the right thing to do, so I ask that we
not throw away this context information we already have, if
possible.
On Tue, Jun 16, 2009 at 03:28:07PM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2009-06-16 at 06:57 +0200, Nick Piggin wrote:
> >
> > Yes but that's heavily qualified. As I said, we already require
> > a lot of knowledge of context passed in to it. I have no interest
> > in adding code to make *early boot* code not have to care about
> > that, especially because everybody else certainly has to know
> > whether they are calling the allocator with interrupts disabled
> > or a lock held etc.
>
> You seem to totally ignore the argument I made to answer this specific
> point in one of my previous emails. Right, they are to some extent
> subjective, but I believe they have some standing. The main one is
> probably that it's a lot less obvious to a lot of code where in the boot
> process it's going to be called, or even if it's going to be called at
> boot, later, both. This is especially true nowadays with all the talks
> about shuffling more of the boot process around.
I didn't ignore that argument, I just don't agree. It only does
not "know" the context it is called from if it does not get that
information passed to it from its caller who does know.
> > To be clear about this: the allocator is fully servicable and
> > no different to normal system running at this point. The
> > difference for example is that code runs with interrupts off
> > but incorrectly uses GFP_KERNEL for allocations. This is a
> > blatent bug in any other kernel code, I don't know why boot
> > code is OK to be horrible and wrong and work around it with
> > the equally horrible system_state (and this gfp mask which is
> > just system_state under another name).
>
> Because it would be extremely impractical to have to explicitely pass
> the gfp_flags around for anything that can be called at boot time. This
> is as simple as that. A -lot- more impractical than requiring atomic
> call sites to know what they are doing.
We'll see.
> > I just don't want to use this slab fix in question to be a
> > license to throw away and forget all about any context information
> > in the early boot code because someone asserts "it will make the
> > code better". I'm fine with the slab change for now, but let's
> > try to retain context information as well.
>
> But in many case it's meaningless. Again, what do you define as "boot"
> is a very obscure thing here.
It's not obscure. I'm vague because it doesn't matter *all that much*.
> > If somebody comes up with a patch to remove thousands of lines
> > of boot code by ignoring context, then I might concede the
> > point and we could remove the context annotations.
>
> No, we don't want to -add- thousands of lines of code :-) And we can
I don't understand. Where would you be adding thousands of lines
of code?
> indeed remove a bunch of the old slab_is_available() tests too indeed.
> And no, they should not -all- be converted to NOWAIT. See vmalloc() as a
> good example, I have a few more like that.
There aren't too many significant code simplifications AFAIKS.
On Tue, 16 Jun 2009, Benjamin Herrenschmidt wrote:
> On Mon, 2009-06-15 at 13:28 +0200, Nick Piggin wrote:
> > If a couple of specialised early boot code places have
> > to test slab_is_available() to prevent crap leaking into
> > our main allocators, then that's quite fine by me.
>
> slab_is_available() will not fix the problem in that case. In fact,
> existing tests of slab_is_available() have been broken because they
> assumed that when it was true, then GFP_KERNEL would work.
Can we get rid of slab_is_available for that purpose? AFAICT: There needs
to be something that tells us that the page allocator is available or that
the mm subsystem is fully operational.
Could we add more bootup states to enum system_states?
Something like
SYSTEM_BOOTMEM for the period where only boot memory allocations are possible
SYSTEM_ALLOC_AVAIL Page / slab allocators available
SYSTEM_PERCPU_AVAIL Percpu allocator functional
SYSTEM_MEM Memory subsystem operational.
?
On Tue, 16 Jun 2009, Nick Piggin wrote:
> There aren't too many significant code simplifications AFAIKS.
The code simplification comes from the ability to run the same code during
boot that is also running when the full system is operational. I thought
the intent of this whole exercise was to avoid special casing as much as
possible and reduce the amount of code specifically duplicated for boot
situations?
On Tue, Jun 16, 2009 at 11:12:58AM -0400, Christoph Lameter wrote:
> On Tue, 16 Jun 2009, Nick Piggin wrote:
>
> > There aren't too many significant code simplifications AFAIKS.
>
> The code simplification comes from the ability to run the same code during
> boot that is also running when the full system is operational. I thought
> the intend of this whole exercise was to avoid special casing as much as
> possible and reduce the amount of code specifically duplicated for boot
> situations?
That is one of the advantages, yes. As far as I can see though,
there isn't like *huge* amounts of things you can simplify. And
if it is just a matter of removing a few branches from a few
cold paths, then it isn't necessarily a good reason to push
branches into the slab and page allocators.
I'm not saying we definitely should go one way or the other, I
would just like to wait and see.
On Tue, 16 Jun 2009, Nick Piggin wrote:
> On Tue, Jun 16, 2009 at 07:31:38AM +1000, Benjamin Herrenschmidt wrote:
> > On Mon, 2009-06-15 at 13:23 +0200, Nick Piggin wrote:
> > > > I think the main problem isn't necessarily init code per se, but the
> > > > pile of -common- code that can be called both at init time and
> > > later.
> > >
> > > Just seems bogus argument. Everwhere else that does this (ie.
> > > allocations that are called from multiple allocation contexts)
> > > passes correct gfp flags down.
> >
> > So you say we should create new variants of all these APIs that take gfp
> > flags as arguments just because they might be called early during boot :
>
> No, just create the ones that actually are called in early boot.
No.
Nick, I don't think you've followed the problems.
The thing is, we do end up wanting to do a lot of allocations, and it's
not even very "early" - we've already initialized all the allocators. It's
just that WE HAVE NOT ENABLED INTERRUPTS YET!
So the "hack" is to let everybody act as if everything is normal. Which it
pretty much is. Just use kmalloc/kfree etc, and use _all_ the regular
functions. Setting up the core layers so that we _can_ enable interrupts
involves quite a lot of random crud, they should be able to use regular
code.
And the hack is there because we really are in a magic stage. The memory
management works, but it just can't do certain things yet. It's not the
callers that need to be changed, because the callers are usually regular
routines that work perfectly normally long after boot, and having to add a
magic "I'm now doing this during early boot" argument to the whole stack
is just _stupid_, when the stack itself doesn't actually care - only the
allocators do.
Linus
On Tue, 16 Jun 2009, Christoph Lameter wrote:
> On Tue, 16 Jun 2009, Benjamin Herrenschmidt wrote:
>
> > On Mon, 2009-06-15 at 13:28 +0200, Nick Piggin wrote:
> > > If a couple of specialised early boot code places have
> > > to test slab_is_available() to prevent crap leaking into
> > > our main allocators, then that's quite fine by me.
> >
> > slab_is_available() will not fix the problem in that case. In fact,
> > existing tests of slab_is_available() have been broken because they
> > assumed that when it was true, then GFP_KERNEL would work.
>
> Can we get rid of slab_is_available for that purpose? AFAICT: There needs
> to be something that tells us that the page allocator is available or that
> the mm subsystem is fully operational.
>
> Could we add more bootup states to enum system_states?
I'd rather get rid of the whole slab_is_available() thing entirely.
All the recent init ordering changes should mean that the slab allocator
is available _much_ earlier - to the point that hopefully any code that
runs before slab is initialized should know very deep down that it's
special, and uses the bootmem allocator without doing any conditionals
what-so-ever.
If that isn't true yet, then we should make it more true.
IOW, we should strive for not having any code that is at all confused
about whether it can do slab allocations or not.
Linus
On Tue, 16 Jun 2009, Linus Torvalds wrote:
> > Could we add more bootup states to enum system_states?
>
> I'd rather get rid of the whole slab_is_available() thing entirely.
slab_is_available is necessary for slab bootstrap but if we use
system_state for this then its not necessary anymore.
> If that isn't true yet, then we should make it more true.
Right.
> IOW, we should strive for not having any code that is at all confused
> about whether it can do slab allocations or not.
So encode the stage of boot up in system_state? Maybe add documentation to
explain what is possible at these states and what is not?
On Tue, 16 Jun 2009, Christoph Lameter wrote:
> >
> > I'd rather get rid of the whole slab_is_available() thing entirely.
>
> slab_is_available is necessary for slab bootstrap but if we use
> system_state for this then its not necessary anymore.
Well, if it's purely internal to slab itself, then I don't care. It should
be made static inside mm/slab.c
> > IOW, we should strive for not having any code that is at all confused
> > about whether it can do slab allocations or not.
>
> So encode the stage of boot up in system_state? Maybe add documentation to
> explan what is possible at these states and what is not?
No.
I mean that there should never be any _need_ for dynamically querying
whether slab is available or not. Nothing should ever do it, because we
should strive for slab being available so early that anything that happens
before it _knows_ that it is special case code (eg "set up initial page
tables") and would never have any reason what-so-ever for even
conditionally asking "should I use slab".
So I literally mean "get rid of slab_is_available()" - and the fact that
SLAB itself _internally_ has some internal state is irrelevant (SLUB and
SLOB do not, for example).
Linus
On Tue, 16 Jun 2009, Linus Torvalds wrote:
> So I literally mean "get rid of slab_is_available()" - and the fact that
> SLAB itself _internally_ has some internal state is irrelevant (SLUB and
> SLOB do not, for example).
SLUB has slab_state which is necessary for bootstrap of NUMA and sysfs
support.
On Tue, 2009-06-16 at 12:10 -0700, Linus Torvalds wrote:
>
> All the recent init ordering changes should mean that the slab allocator
> is available _much_ earlier - to the point that hopefully any code that
> runs before slab is initialized should know very deep down that it's
> special, and uses the bootmem allocator without doing any conditionals
> what-so-ever.
I would normally agree ... but we still have some stuff in setup_arch()
that will need that :-( IE, we need to allow ioremap very early on
powerpc (and possibly various other archs that don't have magic PIO to
whack their chipset without the need for a mapping), so we have tricks
to allocate page tables using LMB, bootmem or slab depending on what's
available that still rely on slab_is_available().
Same goes with some PCI PHB related data structures that we are
allocating in setup_arch() as well, though that's something I do intend
to move to normal initcalls asap (there's a few skeletons hiding in some
corners there so it's not totally trivial).
In fact, I would love to be able to call the SLAB init myself from
somewhere in setup_arch() ... basically instead of init_bootmem() and be
done with it :-) I know it's a chicken-and-egg problem with allocating
the memmap etc... but heh. Which brings me back to my point, we do have
some kind of very very very early allocator on powerpc and sparc
(lib/lmb.c), and I think x86 has one too (e820 or whatever it's called)
right ? Maybe we can wrap a common API on top of these things and avoid
bootmem altogether ? Or that's just not realistic ?
> If that isn't true yet, then we should make it more true.
I agree.
> IOW, we should strive for not having any code that is at all confused
> about whether it can do slab allocations or not.
Cheers,
Ben.
On Wed, 17 Jun 2009, Benjamin Herrenschmidt wrote:
>
> I would normally agree ... but we still have some stuff in setup_arch()
> that will need that :-( IE, we need to allow ioremap very early on
> powerpc (and possibly various other archs that don't have magic PIO to
> whack their chipset without the need for a mapping), so we have tricks
> to allocate page tables using LMB, bootmem or slab depending on what's
> available that still rely on slab_is_available().
>
> Same goes with some PCI PHB related data structures that we are
> allocating in setup_arch() as well, though that's something I do intend
> to move to normal initcalls asap (there's a few skeletons hiding in some
> corners there so it's not totally trivial).
Both of these sound like they should be moved. Perhaps not to "normal
initcalls" (those do happen very late), but maybe to some slightly later
phase in init/main.c.
Why do you need ioremap so early that it has to happen before even the
basic MM data is up? That sounds bogus, and like just a random
implementation issue for historical reasons.
Linus
On Tue, 2009-06-16 at 15:06 -0700, Linus Torvalds wrote:
> Why do you need ioremap so early that it has to happen before even the
> basic MM data is up? That sounds bogus, and like just a random
> implementation issue for historical reasons.
ioremap is probably the -one- thing we constantly try to move
earlier ... sadly :-) But again, I agree and it's mostly historical
reason, coupled with the lack of good "generic" (rather than dedicated
like mem_init(), time_init(), init_IRQ() ...) archs hooks after
setup_arch() and before full blown initcalls.
The main user of it is debugging. We have debug options that can
initialize some kind of console output pretty much even before
setup_arch() is called :-) Most of that stuff isn't normally compiled in
a production kernel though.
Now, if we could do less things in setup_arch() we would have less need
to debug stuff in there, that's true :-) The worst offender is probably
ppc64 which currently does shitloads of stuff even before we call
start_kernel() and that is just plain wrong.
This stuff has been on my to-fix list for a long time, I want to move a
lot of it to later during the boot process and make it more common
between 32 and 64 bit etc... Just give me some time :-)
A few other users of that sort of early ioremap are some bits that
access chipset registers to fixup crap, most of it is in setup_arch()
because we don't actually have a good arch callback that comes later and
is still reasonably before anything important takes place. Maybe we
should introduce one just after the initialization of the allocators.
On ppc32, we also ioremap IO space from setup_arch() and on ppc64 we do
that but only for the legacy part of it. That also could be moved as we
move the PHB init later.
Oh, one big one too is access to the system controller registers (aka
Cuda or PMU on powermacs) which we also do very early.
None of that is unfixable, and in some case even hard to fix. But just
don't remove slab_is_available() -just-yet- :-)
Cheers,
Ben.
Hi Linus,
On Tue, 2009-06-16 at 12:33 -0700, Linus Torvalds wrote:
> I mean that there should never be any _need_ for dynamically querying
> whether slab is available or not. Nothing should ever do it, because we
> should strive for slab being available so early that anything that happens
> before it _knows_ that it is special case code (eg "set up initial page
> tables") and would never have any reason what-so-ever for even
> conditionally asking "should I use slab".
So how does the page allocator fit in this new scheme of things? I have
been looking at doing the cleanups you and Christoph suggested and it
seems to me we need a 'system_gfp_mask' that's respected by basically
everyone, including __might_sleep() and other debugging functions.
If we want to keep the masking within the slab allocator, we need to
make sure there are two sets of APIs: one for slab and one for the page
allocator. Otherwise we end up masking in the debugging checks but not
in page_alloc().
Pekka
On Tue, Jun 16, 2009 at 11:45:24AM -0700, Linus Torvalds wrote:
>
>
> On Tue, 16 Jun 2009, Nick Piggin wrote:
>
> > On Tue, Jun 16, 2009 at 07:31:38AM +1000, Benjamin Herrenschmidt wrote:
> > > On Mon, 2009-06-15 at 13:23 +0200, Nick Piggin wrote:
> > > > > I think the main problem isn't necessarily init code per se, but the
> > > > > pile of -common- code that can be called both at init time and
> > > > later.
> > > >
> > > > Just seems bogus argument. Everwhere else that does this (ie.
> > > > allocations that are called from multiple allocation contexts)
> > > > passes correct gfp flags down.
> > >
> > > So you say we should create new variants of all these APIs that take gfp
> > > flags as arguments just because they might be called early during boot :
> >
> > No, just create the ones that actually are called in early boot.
>
> No.
>
> Nick, I don't think you've follow the problems.
>
> The thing is, we do end up wanting to do a lot of allocations, and it's
> not even very "early" - we've already initialized all the allocators. It's
> just that WE HAVE NOT ENABLED INTERRUPTS YET!
Right, I know.
> So the "hack" is to let everybody act as if everything is normal. Which it
> pretty much is. Just use kmalloc/kfree etc, and use _all_ the regular
> functions. Setting up the core layers so that we _can_ enable interrupts
> involves quite a lot of random crud, they should be able to use regular
> code.
I just don't quite see why the problem got bigger though. Doing
earlier slab allocations than we had previously is going to be
replacing even more specialised code using bootmem right? I know
there are a few hacks for this, but I don't see anywhere getting
*worse* and I don't see anywhere that is all that bad today.
> And the hack is there because we really are in a magic stage. The memory
> management works, but it just can't do certain things yet. It's not the
> callers that need to be changed, because the callers are usually regular
> routines that work perfectly normally long after boot, and having to add a
> magic "I'm now doing this during early boot" argument to the whole stack
> is just _stupid_, when the stack itself doesn't actually care - only the
> allocators do.
In some cases perhaps it is difficult. In others it should be
pretty natural. Lots of memory allocating paths pass gfp
a long way down the stack.
On Wed, 17 Jun 2009, Nick Piggin wrote:
>
> In some cases perhaps it is difficult. In others it should be
> pretty natural. Lots of memory allocating paths pass gfp
> a long way down the stack.
I agree that some cases would be pretty natural. In fact, that's what we
started out doing. On the x86 side, we didn't have a lot of issues in
testing, and we fixed them up by using GFP_NOWAIT (see for example
cpupri_init() in kernel/sched_cpupri.c, or init_irq_default_affinity() in
kernel/irq/handle.c - both of which were fixed up in that phase).
Those paths, in fact, in general already had "bootmem" flags etc. And x86
doesn't need to initialize a lot of state at bootup.
Then Ben happened, and crazy PPC ioremap crap.
So the problem is exactly that it was perfectly natural to pass down a gfp
or other flag in _some_ cases. And then in a few cases it's much more
painful.
Linus
On Wed, Jun 17, 2009 at 09:01:16AM -0700, Linus Torvalds wrote:
>
>
> On Wed, 17 Jun 2009, Nick Piggin wrote:
> >
> > In some cases perhaps it is difficult. In others it should be
> > pretty natural. Lots of memory allocating paths pass gfp
> > a long way down the stack.
>
> I agree that some cases would be pretty natural. In fact, that's what we
> started out doing. On the x86 side, we didn't have a lot of issues in
> testing, and we fixed them up by using GFP_NOWAIT (see for example
> cpupri_init() in kernel/sched_cpupri.c, or init_irq_default_affinity() in
> kernel/irq/handle.c - both of which were fixed up in that phase).
>
> Those paths, in fact, in general already had "bootmem" flags etc. And x86
> doesn't need to initialize a lot of state at bootup.
Yes.
> Then Ben happened, and crazy PPC ioremap crap.
OK. I agree ioremap probably would get painful. Are they doing that
too early anyway? From your email it sounds like maybe they are.
> So the problem is exactly that it was perfectly natural to pass down a gfp
> or other flag in _some_ cases. And then in a few cases it's much more
> painful.
Sure... I understand, and I don't want to force people to add
more nasty code to work around these. I am fine with the fix
to slab for now... I was just hoping maybe we don't put the
rule in place that all early boot allocations shall use GFP_KERNEL.
I'd like to see the allocation context passed down (including via
code that knows when interrupts are off in early boot) if possible
without too much ugliness.
On Wed, 17 Jun 2009, Pekka Enberg wrote:
>
> So how does the page allocator fit in this new scheme of things? I have
> been looking at doing the cleanups you and Christoph suggested and it
> seems to me we need a 'system_gfp_mask' that's respected by basically
> everyone, including __might_sleep() and other debugging functions.
So I'm very much ok with the whole "use magic gfp_mask to indicate what
works at what stage". And yes, I think it makes sense to extend it to the
page allocator and might_sleep too, because GFP_KERNEL has all the same
issues regardless of whether it's about page allocation or about slab
allocators. And any "might_sleep" suppression really does tend to be about
the exact same thing.
So the only thing I was arguing against wrt Christoph was really that I
think this thing should be an "internal" thing, and never ever be used as
a flag for others to decide what to do. We do _not_ want drivers or other
crazy people using it to decide what state they are running in.
Keep it simple, and keep it minimal, in other words.
Linus
On Wed, 2009-06-17 at 09:01 -0700, Linus Torvalds wrote:
>
> On Wed, 17 Jun 2009, Nick Piggin wrote:
> >
> > In some cases perhaps it is difficult. In others it should be
> > pretty natural. Lots of memory allocating paths pass gfp
> > a long way down the stack.
>
> I agree that some cases would be pretty natural. In fact, that's what we
> started out doing. On the x86 side, we didn't have a lot of issues in
> testing, and we fixed them up by using GFP_NOWAIT (see for example
> cpupri_init() in kernel/sched_cpupri.c, or init_irq_default_affinity() in
> kernel/irq/handle.c - both of which were fixed up in that phase).
>
> Those paths, in fact, in general already had "bootmem" flags etc. And x86
> doesn't need to initialize a lot of state at bootup.
>
> Then Ben happened, and crazy PPC ioremap crap.
:-)
Are you sure that none of the kmalloc's you turned into NOWAIT on x86
can be called after boot by things like CPU hotplug etc... ? In which
case, they just become much more prone to failure in the later case ...
How do you get away without ioremap in time_init(), init_IRQ() etc... ?
It's all IO ports and magic registers on x86 ?
So yeah, early ioremap is a problem, but it's also very useful to have
and will be hard to get rid of. I suspect most non-x86 platforms will
need something like that. Even if I end up managing to kill the callers
in setup_arch(), I will probably never get rid of ioremaps after the new
mm_init() and before interrupts are on.
There's a few other things in PPC land, like PCI stuffs, data structures
related to interrupt routing, etc... that can be allocated both at boot
and non boot, some of them we could probably push out to later but not
all and not without great care.
Cheers,
Ben.
> So I'm very much ok with the whole "use magic gfp_mask to indicate what
> works at what stage". And yes, I think it makes sense to extend it to the
> page allocator and might_sleep too, because GFP_KERNEL has all the same
> issues regardless of whether it's about page allocation or about slab
> allocators. And any "might_sleep" suppression really does tend to be about
> the exact same thing.
Argh... still broken.
In fact, my initial patch added it to the page allocator, which worked
for me. Pekka's patch removed that and made it slab-only. So I'm blowing
up at boot in lockdep or so because I'm allocating page tables on
ppc32 with __get_free_pages() and GFP_KERNEL.
I'll cook up a patch.
Cheers,
Ben.
On Thu, 2009-06-18 at 12:00 +1000, Benjamin Herrenschmidt wrote:
> > So I'm very much ok with the whole "use magic gfp_mask to indicate what
> > works at what stage". And yes, I think it makes sense to extend it to the
> > page allocator and might_sleep too, because GFP_KERNEL has all the same
> > issues regardless of whether it's about page allocation or about slab
> > allocators. And any "might_sleep" suppression really does tend to be about
> > the exact same thing.
>
> Argh... still broken.
>
> In fact, my initial patch added it to the page allocator, which worked
> for me. Pekka's patch removed that and made it slab-only. So I'm blowing
> up at boot in lockdep or so because I'm allocating page tables on
> ppc32 with __get_free_pages() and GFP_KERNEL.
>
> I'll cook up a patch.
Here it is:
mm: Extend gfp masking to the page allocator
The page allocator also needs the masking of gfp flags during boot,
so this moves it out of slab/slub and uses it with the page allocator
as well.
Signed-off-by: Benjamin Herrenschmidt <[email protected]>
---
This will also make it easier to use it for limiting allocations that
can block during suspend/resume, though doing this really fool-proof
will require some kind of synchronization in set_gfp_allowed_mask()
vs. allocations that have already started sleeping waiting for IOs.
Index: linux-work/include/linux/gfp.h
===================================================================
--- linux-work.orig/include/linux/gfp.h 2009-06-18 12:03:14.000000000 +1000
+++ linux-work/include/linux/gfp.h 2009-06-18 12:08:21.000000000 +1000
@@ -99,7 +99,7 @@ struct vm_area_struct;
__GFP_NORETRY|__GFP_NOMEMALLOC)
/* Control slab gfp mask during early boot */
-#define SLAB_GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)
+#define GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)
/* Control allocation constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
@@ -348,4 +348,11 @@ static inline void oom_killer_enable(voi
oom_killer_disabled = false;
}
+extern gfp_t gfp_allowed_mask;
+
+static inline void set_gfp_allowed_mask(gfp_t mask)
+{
+ gfp_allowed_mask = mask;
+}
+
#endif /* __LINUX_GFP_H */
Index: linux-work/init/main.c
===================================================================
--- linux-work.orig/init/main.c 2009-06-18 12:06:49.000000000 +1000
+++ linux-work/init/main.c 2009-06-18 12:08:35.000000000 +1000
@@ -642,6 +642,10 @@ asmlinkage void __init start_kernel(void
"enabled early\n");
early_boot_irqs_on();
local_irq_enable();
+
+ /* Interrupts are enabled now so all GFP allocations are safe. */
+ set_gfp_allowed_mask(__GFP_BITS_MASK);
+
kmem_cache_init_late();
/*
Index: linux-work/mm/page_alloc.c
===================================================================
--- linux-work.orig/mm/page_alloc.c 2009-06-18 12:04:58.000000000 +1000
+++ linux-work/mm/page_alloc.c 2009-06-18 12:09:27.000000000 +1000
@@ -73,6 +73,7 @@ unsigned long totalram_pages __read_most
unsigned long totalreserve_pages __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;
int percpu_pagelist_fraction;
+gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
@@ -1863,6 +1864,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, u
struct page *page;
int migratetype = allocflags_to_migratetype(gfp_mask);
+ gfp_mask &= gfp_allowed_mask;
+
lockdep_trace_alloc(gfp_mask);
might_sleep_if(gfp_mask & __GFP_WAIT);
Index: linux-work/mm/slab.c
===================================================================
--- linux-work.orig/mm/slab.c 2009-06-18 12:05:47.000000000 +1000
+++ linux-work/mm/slab.c 2009-06-18 12:06:19.000000000 +1000
@@ -305,12 +305,6 @@ struct kmem_list3 {
};
/*
- * The slab allocator is initialized with interrupts disabled. Therefore, make
- * sure early boot allocations don't accidentally enable interrupts.
- */
-static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK;
-
-/*
* Need this for bootstrapping a per node allocator.
*/
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
@@ -1559,11 +1553,6 @@ void __init kmem_cache_init_late(void)
{
struct kmem_cache *cachep;
- /*
- * Interrupts are enabled now so all GFP allocations are safe.
- */
- slab_gfp_mask = __GFP_BITS_MASK;
-
/* 6) resize the head arrays to their final sizes */
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next)
@@ -3307,7 +3296,7 @@ __cache_alloc_node(struct kmem_cache *ca
unsigned long save_flags;
void *ptr;
- flags &= slab_gfp_mask;
+ flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
@@ -3392,7 +3381,7 @@ __cache_alloc(struct kmem_cache *cachep,
unsigned long save_flags;
void *objp;
- flags &= slab_gfp_mask;
+ flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
Index: linux-work/mm/slub.c
===================================================================
--- linux-work.orig/mm/slub.c 2009-06-18 12:02:46.000000000 +1000
+++ linux-work/mm/slub.c 2009-06-18 12:06:35.000000000 +1000
@@ -179,12 +179,6 @@ static enum {
SYSFS /* Sysfs up */
} slab_state = DOWN;
-/*
- * The slab allocator is initialized with interrupts disabled. Therefore, make
- * sure early boot allocations don't accidentally enable interrupts.
- */
-static gfp_t slab_gfp_mask __read_mostly = SLAB_GFP_BOOT_MASK;
-
/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);
@@ -1692,7 +1686,7 @@ static __always_inline void *slab_alloc(
unsigned long flags;
unsigned int objsize;
- gfpflags &= slab_gfp_mask;
+ gfpflags &= gfp_allowed_mask;
lockdep_trace_alloc(gfpflags);
might_sleep_if(gfpflags & __GFP_WAIT);
@@ -3220,10 +3214,6 @@ void __init kmem_cache_init(void)
void __init kmem_cache_init_late(void)
{
- /*
- * Interrupts are enabled now so all GFP allocations are safe.
- */
- slab_gfp_mask = __GFP_BITS_MASK;
}
/*
Hi Ben,
On Thu, 2009-06-18 at 13:24 +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2009-06-18 at 12:00 +1000, Benjamin Herrenschmidt wrote:
> > > So I'm very much ok with the whole "use magic gfp_mask to indicate what
> > > works at what stage". And yes, I think it makes sense to extend it to the
> > > page allocator and might_sleep too, because GFP_KERNEL has all the same
> > > issues regardless of whether it's about page allocation or about slab
> > > allocators. And any "might_sleep" suppression really does tend to be about
> > > the exact same thing.
> >
> > Argh... still broken.
> >
> > In fact, my initial patch added it to the page allocator, which worked
> > for me. Pekka's patch removed that and made it slab-only. So I'm blowing
> > up at boot in lockdep or so because I'm allocating page tables on
> > ppc32 with __get_free_pages() and GFP_KERNEL.
> >
> > I'll cook up a patch.
>
> Here it is:
>
> mm: Extend gfp masking to the page allocator
>
> The page allocator also needs the masking of gfp flags during boot,
> so this moves it out of slab/slub and uses it with the page allocator
> as well.
>
> Signed-off-by: Benjamin Herrenschmidt <[email protected]>
> ---
>
> This will also make it easier to use it for limiting allocations that
> can block during suspend/resume, though doing this really fool-proof
> will require some kind of synchronization in set_gfp_allowed_mask()
> vs. allocations that have already started sleeping waiting for IOs.
>
> Index: linux-work/include/linux/gfp.h
> ===================================================================
> --- linux-work.orig/include/linux/gfp.h 2009-06-18 12:03:14.000000000 +1000
> +++ linux-work/include/linux/gfp.h 2009-06-18 12:08:21.000000000 +1000
> @@ -99,7 +99,7 @@ struct vm_area_struct;
> __GFP_NORETRY|__GFP_NOMEMALLOC)
>
> /* Control slab gfp mask during early boot */
> -#define SLAB_GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)
> +#define GFP_BOOT_MASK __GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)
>
> /* Control allocation constraints */
> #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
> @@ -348,4 +348,11 @@ static inline void oom_killer_enable(voi
> oom_killer_disabled = false;
> }
>
> +extern gfp_t gfp_allowed_mask;
> +
> +static inline void set_gfp_allowed_mask(gfp_t mask)
> +{
> + gfp_allowed_mask = mask;
> +}
The only thing I don't like about this patch is that the caller gets to
decide which bits should be masked. I really think this should be a
mm_late_init() function that sets the mask _internally_ in the page
allocator.
But anyway, I am about to go completely off-line until Sunday or so
consider the approach:
Acked-by: Pekka Enberg <[email protected]>
Pekka
On Thu, 2009-06-18 at 09:01 +0300, Pekka Enberg wrote:
>
> The only thing I don't like about this patch is that the caller gets
> to decide which bits should be masked. I really think this should be a
> mm_late_init() function that sets the mask _internally_ in the page
> allocator.
Well, the bits are common to slab, slub and page_allocator, so it
doesn't hurt -that- much to have init/main.c use a pre-cooked constant
from gfp.h for that :-)
Also, I still have in mind to use it to clear out some bits during
suspend resume.
But if you really want to be pedantic, we could turn that into inline
functions like gfp_boot_finished() to keep the bits local to gfp.h
completely...
> But anyway, I am about to go completely off-line until Sunday or so
> consider the approach:
Thanks for the ack.
Cheers,
Ben.