This fixes various problems in slab found during memory hotadd testing.
All straightforward bug fixes, so they could still be .33 candidates.
-Andi
Register a memory hotplug notifier that prepares a node's l3 lists when the
node comes online, so kmalloc_node() works even if no CPU is up yet on the
new node.
Requires previous refactor patch.
Signed-off-by: Andi Kleen <[email protected]>
---
mm/slab.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
Index: linux-2.6.33-rc3-ak/mm/slab.c
===================================================================
--- linux-2.6.33-rc3-ak.orig/mm/slab.c
+++ linux-2.6.33-rc3-ak/mm/slab.c
@@ -115,6 +115,7 @@
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
#include <linux/kmemcheck.h>
+#include <linux/memory.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -1560,6 +1561,20 @@ void __init kmem_cache_init(void)
g_cpucache_up = EARLY;
}
+static int slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = (struct memory_notify *)arg;
+
+ /*
+ * When a node goes online allocate l3s early. This way
+ * kmalloc_node() works for it.
+ */
+ if (action == MEM_ONLINE && mn->status_change_nid >= 0)
+ slab_node_prepare(mn->status_change_nid);
+ return NOTIFY_OK;
+}
+
void __init kmem_cache_init_late(void)
{
struct kmem_cache *cachep;
@@ -1583,6 +1598,8 @@ void __init kmem_cache_init_late(void)
*/
register_cpu_notifier(&cpucache_notifier);
+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+
/*
* The reap timers are started later, with a module init call: That part
* of the kernel is not yet operational.
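(A brief usage illustration, not part of the posted patch: new_nid and size
below are hypothetical stand-ins for the node id reported in the MEM_ONLINE
notification and an arbitrary allocation size.)

	/* Once the MEM_ONLINE notifier has run slab_node_prepare(new_nid),
	 * the cache's l3 lists for that node exist, so a node-local
	 * allocation works even though no CPU on the node is up yet. */
	void *p = kmalloc_node(size, GFP_KERNEL, new_nid);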
When fallback_alloc() runs, the node of the CPU might not be initialized yet.
Handle this case by allocating in another node.
Signed-off-by: Andi Kleen <[email protected]>
---
mm/slab.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
Index: linux-2.6.33-rc3-ak/mm/slab.c
===================================================================
--- linux-2.6.33-rc3-ak.orig/mm/slab.c
+++ linux-2.6.33-rc3-ak/mm/slab.c
@@ -3210,7 +3210,24 @@ retry:
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, local_flags, numa_node_id());
+
+ /*
+ * Node not set up yet? Try one that the cache has been set up
+ * for.
+ */
+ nid = numa_node_id();
+ if (cache->nodelists[nid] == NULL) {
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ nid = zone_to_nid(zone);
+ if (cache->nodelists[nid])
+ break;
+ }
+ if (!cache->nodelists[nid])
+ return NULL;
+ }
+
+
+ obj = kmem_getpages(cache, local_flags, nid);
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {
No functional changes.
Needed for next patch.
Signed-off-by: Andi Kleen <[email protected]>
---
mm/slab.c | 34 +++++++++++++++++++++-------------
1 file changed, 21 insertions(+), 13 deletions(-)
Index: linux-2.6.33-rc3-ak/mm/slab.c
===================================================================
--- linux-2.6.33-rc3-ak.orig/mm/slab.c
+++ linux-2.6.33-rc3-ak/mm/slab.c
@@ -1171,19 +1171,9 @@ free_array_cache:
}
}
-static int __cpuinit cpuup_prepare(long cpu)
+static int slab_node_prepare(int node)
{
struct kmem_cache *cachep;
- struct kmem_list3 *l3 = NULL;
- int node = cpu_to_node(cpu);
- const int memsize = sizeof(struct kmem_list3);
-
- /*
- * We need to do this right in the beginning since
- * alloc_arraycache's are going to use this list.
- * kmalloc_node allows us to add the slab to the right
- * kmem_list3 and not this cpu's kmem_list3
- */
list_for_each_entry(cachep, &cache_chain, next) {
/*
@@ -1192,9 +1182,10 @@ static int __cpuinit cpuup_prepare(long
* node has not already allocated this
*/
if (!cachep->nodelists[node]) {
- l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+ struct kmem_list3 *l3;
+ l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
if (!l3)
- goto bad;
+ return -1;
kmem_list3_init(l3);
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
@@ -1213,6 +1204,23 @@ static int __cpuinit cpuup_prepare(long
cachep->batchcount + cachep->num;
spin_unlock_irq(&cachep->nodelists[node]->list_lock);
}
+ return 0;
+}
+
+static int __cpuinit cpuup_prepare(long cpu)
+{
+ struct kmem_cache *cachep;
+ struct kmem_list3 *l3 = NULL;
+ int node = cpu_to_node(cpu);
+
+ /*
+ * We need to do this right in the beginning since
+ * alloc_arraycache's are going to use this list.
+ * kmalloc_node allows us to add the slab to the right
+ * kmem_list3 and not this cpu's kmem_list3
+ */
+ if (slab_node_prepare(node) < 0)
+ goto bad;
/*
* Now we can go ahead with allocating the shared arrays and
cache_reap can run before the node is set up and then reference a NULL
l3 list. Check for this explicitly and just continue. The node
will eventually be set up.
Signed-off-by: Andi Kleen <[email protected]>
---
mm/slab.c | 3 +++
1 file changed, 3 insertions(+)
Index: linux-2.6.33-rc3-ak/mm/slab.c
===================================================================
--- linux-2.6.33-rc3-ak.orig/mm/slab.c
+++ linux-2.6.33-rc3-ak/mm/slab.c
@@ -4112,6 +4112,9 @@ static void cache_reap(struct work_struc
* we can do some work if the lock was obtained.
*/
l3 = searchp->nodelists[node];
+ /* Node not yet set up */
+ if (!l3)
+ break;
reap_alien(searchp, l3);
Hi Andi,
On Wed, Feb 3, 2010 at 11:39 PM, Andi Kleen <[email protected]> wrote:
> This fixes various problems in slab found during memory hotadd testing.
>
> All straightforward bug fixes, so they could still be .33 candidates.
AFAICT, they aren't regression fixes so they should wait for .34, no?
The patches look good to me. Nick, Christoph, you might want to take a
peek at them before I shove them in linux-next.
Pekka
On Wed, 3 Feb 2010, Andi Kleen wrote:
> Requires previous refactor patch.
Not in this series?
> + if (action == MEM_ONLINE && mn->status_change_nid >= 0)
> + slab_node_prepare(mn->status_change_nid);
> + return NOTIFY_OK;
Never seen a slab_node_prepare function before.
Out of sequence. Last patch already uses a function introduced by this
one.
Acked-by: Christoph Lameter <[email protected]>
Acked-by: Christoph Lameter <[email protected]>
On Wed, 3 Feb 2010, Andi Kleen wrote:
> This fixes various problems in slab found during memory hotadd testing.
It changes the bootstrap semantics. The requirement was so far that slab
initialization must be complete before slab operations can be used.
This patchset allows such use before bootstrap on a node is complete and
also allows the running of cache reaper before bootstrap is done.
I have a bad feeling that this could be the result of Pekka's changes to
the bootstrap.
Christoph Lameter <[email protected]> writes:
> On Wed, 3 Feb 2010, Andi Kleen wrote:
>
>> This fixes various problems in slab found during memory hotadd testing.
>
> It changes the bootstrap semantics. The requirement was so far that slab
> initialization must be complete before slab operations can be used.
The problem is that slab itself uses slab to initialize itself.
> This patchset allows such use before bootstrap on a node is complete and
> also allows the running of cache reaper before bootstrap is done.
>
> I have a bad feeling that this could be the result of Pekka's changes to
> the bootstrap.
Not sure I fully follow what you're saying.
Are you saying this is a regression fix after all?
-Andi
--
[email protected] -- Speaking for myself only.
On Fri, 5 Feb 2010, Andi Kleen wrote:
> > It changes the bootstrap semantics. The requirement was so far that slab
> > initialization must be complete before slab operations can be used.
>
> The problem is that slab itself uses slab to initialize itself.
slab uses itself and also the page allocator to bootstrap itself. The
sequence was always a bit fragile. The page allocator also needs to use
the slab allocator in turn to bootstrap itself.
> > This patchset allows such use before bootstrap on a node is complete and
> > also allows the running of cache reaper before bootstrap is done.
> >
> > I have a bad feeling that this could be the result of Pekka's changes to
> > the bootstrap.
>
> Not sure I fully follow what you're saying.
>
> Are you saying this is a regression fix after all?
I am saying that we may have more trouble lurking here.
If we fix it this way, then the bootstrap of a node is different from
system bootstrap, which so far does not need these special measures.
On Wed, 3 Feb 2010, Andi Kleen wrote:
> When fallback_alloc() runs, the node of the CPU might not be initialized yet.
> Handle this case by allocating in another node.
>
That other node must be allowed by current's cpuset, otherwise
kmem_getpages() will fail when get_page_from_freelist() iterates only over
unallowed nodes.
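One way to fold that constraint into the new fallback loop, sketched here
under the assumption that the cpuset_zone_allowed_hardwall() check used by
the existing per-zone loop in this era's mm/slab.c is available (this is
only an illustration; a different, nodemask-based suggestion follows later
in the thread):

	nid = numa_node_id();
	if (cache->nodelists[nid] == NULL) {
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
			/* skip zones the current task's cpuset forbids */
			if (!cpuset_zone_allowed_hardwall(zone, flags))
				continue;
			nid = zone_to_nid(zone);
			if (cache->nodelists[nid])
				break;
		}
		if (!cache->nodelists[nid])
			return NULL;
	}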
> Signed-off-by: Andi Kleen <[email protected]>
>
> ---
> mm/slab.c | 19 ++++++++++++++++++-
> 1 file changed, 18 insertions(+), 1 deletion(-)
>
> Index: linux-2.6.33-rc3-ak/mm/slab.c
> ===================================================================
> --- linux-2.6.33-rc3-ak.orig/mm/slab.c
> +++ linux-2.6.33-rc3-ak/mm/slab.c
> @@ -3210,7 +3210,24 @@ retry:
> if (local_flags & __GFP_WAIT)
> local_irq_enable();
> kmem_flagcheck(cache, flags);
> - obj = kmem_getpages(cache, local_flags, numa_node_id());
> +
> + /*
> + * Node not set up yet? Try one that the cache has been set up
> + * for.
> + */
> + nid = numa_node_id();
> + if (cache->nodelists[nid] == NULL) {
> + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> + nid = zone_to_nid(zone);
> + if (cache->nodelists[nid])
> + break;
If you set a bit in a nodemask_t every time ____cache_alloc_node() fails in
the previous for_each_zone_zonelist() iteration, you could just iterate
that nodemask here without duplicating the zone_to_nid() and
cache->nodelists[nid] != NULL check.
nid = numa_node_id();
if (!cache->nodelists[nid])
for_each_node_mask(nid, allowed_nodes) {
obj = kmem_getpages(cache, local_flags, nid);
if (obj)
break;
}
else
obj = kmem_getpages(cache, local_flags, nid);
This way you can try all allowed nodes for memory instead of just one when
cache->nodelists[numa_node_id()] == NULL.
> + }
> + if (!cache->nodelists[nid])
> + return NULL;
> + }
> +
> +
> + obj = kmem_getpages(cache, local_flags, nid);
> if (local_flags & __GFP_WAIT)
> local_irq_disable();
> if (obj) {
On Wed, 3 Feb 2010, Andi Kleen wrote:
> Index: linux-2.6.33-rc3-ak/mm/slab.c
> ===================================================================
> --- linux-2.6.33-rc3-ak.orig/mm/slab.c
> +++ linux-2.6.33-rc3-ak/mm/slab.c
> @@ -115,6 +115,7 @@
> #include <linux/reciprocal_div.h>
> #include <linux/debugobjects.h>
> #include <linux/kmemcheck.h>
> +#include <linux/memory.h>
>
> #include <asm/cacheflush.h>
> #include <asm/tlbflush.h>
> @@ -1560,6 +1561,20 @@ void __init kmem_cache_init(void)
> g_cpucache_up = EARLY;
> }
>
> +static int slab_memory_callback(struct notifier_block *self,
> + unsigned long action, void *arg)
> +{
> + struct memory_notify *mn = (struct memory_notify *)arg;
No cast necessary.
> +
> + /*
> + * When a node goes online allocate l3s early. This way
> + * kmalloc_node() works for it.
> + */
> + if (action == MEM_ONLINE && mn->status_change_nid >= 0)
> + slab_node_prepare(mn->status_change_nid);
> + return NOTIFY_OK;
> +}
> +
> void __init kmem_cache_init_late(void)
> {
> struct kmem_cache *cachep;
> @@ -1583,6 +1598,8 @@ void __init kmem_cache_init_late(void)
> */
> register_cpu_notifier(&cpucache_notifier);
>
> + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
Only needed for CONFIG_NUMA.
> +
> /*
> * The reap timers are started later, with a module init call: That part
> * of the kernel is not yet operational.
On Wed, 3 Feb 2010, Andi Kleen wrote:
>
> No functional changes.
>
> Needed for next patch.
>
> Signed-off-by: Andi Kleen <[email protected]>
>
> ---
> mm/slab.c | 34 +++++++++++++++++++++-------------
> 1 file changed, 21 insertions(+), 13 deletions(-)
>
> Index: linux-2.6.33-rc3-ak/mm/slab.c
> ===================================================================
> --- linux-2.6.33-rc3-ak.orig/mm/slab.c
> +++ linux-2.6.33-rc3-ak/mm/slab.c
> @@ -1171,19 +1171,9 @@ free_array_cache:
> }
> }
>
> -static int __cpuinit cpuup_prepare(long cpu)
> +static int slab_node_prepare(int node)
> {
> struct kmem_cache *cachep;
> - struct kmem_list3 *l3 = NULL;
> - int node = cpu_to_node(cpu);
> - const int memsize = sizeof(struct kmem_list3);
> -
> - /*
> - * We need to do this right in the beginning since
> - * alloc_arraycache's are going to use this list.
> - * kmalloc_node allows us to add the slab to the right
> - * kmem_list3 and not this cpu's kmem_list3
> - */
>
> list_for_each_entry(cachep, &cache_chain, next) {
> /*
As Christoph mentioned, this patch is out of order with the previous one
in the series; slab_node_prepare() is called in that previous patch by a
memory hotplug callback without holding cache_chain_mutex (it's taken by
the cpu hotplug callback prior to calling cpuup_prepare() currently). So
slab_node_prepare() should note that we require the mutex and the memory
hotplug callback should take it in the previous patch.
> @@ -1192,9 +1182,10 @@ static int __cpuinit cpuup_prepare(long
> * node has not already allocated this
> */
> if (!cachep->nodelists[node]) {
> - l3 = kmalloc_node(memsize, GFP_KERNEL, node);
> + struct kmem_list3 *l3;
> + l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
> if (!l3)
> - goto bad;
> + return -1;
> kmem_list3_init(l3);
> l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
> ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
> @@ -1213,6 +1204,23 @@ static int __cpuinit cpuup_prepare(long
> cachep->batchcount + cachep->num;
> spin_unlock_irq(&cachep->nodelists[node]->list_lock);
> }
> + return 0;
> +}
> +
> +static int __cpuinit cpuup_prepare(long cpu)
> +{
> + struct kmem_cache *cachep;
> + struct kmem_list3 *l3 = NULL;
> + int node = cpu_to_node(cpu);
> +
> + /*
> + * We need to do this right in the beginning since
> + * alloc_arraycache's are going to use this list.
> + * kmalloc_node allows us to add the slab to the right
> + * kmem_list3 and not this cpu's kmem_list3
> + */
> + if (slab_node_prepare(node) < 0)
> + goto bad;
>
> /*
> * Now we can go ahead with allocating the shared arrays and
On Fri, Feb 05, 2010 at 01:06:56PM -0800, David Rientjes wrote:
> On Wed, 3 Feb 2010, Andi Kleen wrote:
>
> > When fallback_alloc() runs, the node of the CPU might not be initialized yet.
> > Handle this case by allocating in another node.
> >
>
> That other node must be allowed by current's cpuset, otherwise
> kmem_getpages() will fail when get_page_from_freelist() iterates only over
> unallowed nodes.
All these cases are really only interesting in the memory hotplug path
itself (afterwards the slab is working anyways and memory is there)
and if someone sets funny cpusets for those he gets what he deserves ...
-Andi
On Fri, Feb 05, 2010 at 01:17:56PM -0800, David Rientjes wrote:
> > +static int slab_memory_callback(struct notifier_block *self,
> > + unsigned long action, void *arg)
> > +{
> > + struct memory_notify *mn = (struct memory_notify *)arg;
>
> No cast necessary.
It's standard practice to cast void *.
> > void __init kmem_cache_init_late(void)
> > {
> > struct kmem_cache *cachep;
> > @@ -1583,6 +1598,8 @@ void __init kmem_cache_init_late(void)
> > */
> > register_cpu_notifier(&cpucache_notifier);
> >
> > + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
>
> Only needed for CONFIG_NUMA.
Ok.
-Andi
--
[email protected] -- Speaking for myself only.
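For reference, a minimal sketch of the CONFIG_NUMA-only registration agreed
to above (an assumption about the respin, not the posted patch; the
slab_memory_callback() definition would take the same guard):

#ifdef CONFIG_NUMA
	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif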
> As Christoph mentioned, this patch is out of order with the previous one
Ok.
> in the series; slab_node_prepare() is called in that previous patch by a
> memory hotplug callback without holding cache_chain_mutex (it's taken by
> the cpu hotplug callback prior to calling cpuup_prepare() currently). So
> slab_node_prepare() should note that we require the mutex and the memory
> hotplug callback should take it in the previous patch.
AFAIK the code is correct. If you feel the need for additional
documentation feel free to send patches yourself.
-Andi
On Sat, 6 Feb 2010, Andi Kleen wrote:
> > > +static int slab_memory_callback(struct notifier_block *self,
> > > + unsigned long action, void *arg)
> > > +{
> > > + struct memory_notify *mn = (struct memory_notify *)arg;
> >
> > No cast necessary.
>
> It's standard practice to cast void *.
>
$ grep -r "struct memory_notify.*=" *
arch/powerpc/platforms/pseries/cmm.c: struct memory_notify *marg = arg;
drivers/base/node.c: struct memory_notify *mnb = arg;
drivers/net/ehea/ehea_main.c: struct memory_notify *arg = data;
mm/ksm.c: struct memory_notify *mn = arg;
mm/slub.c: struct memory_notify *marg = arg;
mm/slub.c: struct memory_notify *marg = arg;
mm/page_cgroup.c: struct memory_notify *mn = arg;
On Sat, 6 Feb 2010, Andi Kleen wrote:
> > That other node must be allowed by current's cpuset, otherwise
> > kmem_getpages() will fail when get_page_from_freelist() iterates only over
> > unallowed nodes.
>
> All these cases are really only interesting in the memory hotplug path
> itself (afterwards the slab is working anyways and memory is there)
> and if someone sets funny cpusets for those he gets what he deserves ...
>
If a hot-added node has not been initialized for the cache, your code is
picking an existing one in zonelist order which may be excluded by
current's cpuset. Thus, your code has a very real chance of having
kmem_getpages() return NULL because get_page_from_freelist() will reject
non-atomic ALLOC_CPUSET allocations for prohibited nodes. That isn't a
scenario that requires a "funny cpuset," it just has to not allow whatever
initialized node comes first in the zonelist.
My suggested alternative does not pick a single initialized node, rather
it tries all nodes that actually have a chance of having kmem_getpages()
succeed, which increases the probability that your patch actually has an
effect for cpuset users.
On Sat, 6 Feb 2010, Andi Kleen wrote:
> > in the series; slab_node_prepare() is called in that previous patch by a
> > memory hotplug callback without holding cache_chain_mutex (it's taken by
> > the cpu hotplug callback prior to calling cpuup_prepare() currently). So
> > slab_node_prepare() should note that we require the mutex and the memory
> > hotplug callback should take it in the previous patch.
>
> AFAIK the code is correct. If you feel the need for additional
> documentation feel free to send patches yourself.
>
Documentation? You're required to take cache_chain_mutex before calling
slab_node_prepare() in your memory hotplug notifier; it iterates
cache_chain. Please look again.
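A minimal sketch of the locking being asked for here, assuming the existing
cache_chain_mutex in mm/slab.c (an illustration only, not the posted patch):

static int slab_memory_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	/*
	 * slab_node_prepare() walks cache_chain, so take cache_chain_mutex
	 * around it, just as the CPU hotplug path does before cpuup_prepare().
	 */
	if (action == MEM_ONLINE && mn->status_change_nid >= 0) {
		mutex_lock(&cache_chain_mutex);
		slab_node_prepare(mn->status_change_nid);
		mutex_unlock(&cache_chain_mutex);
	}
	return NOTIFY_OK;
}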
> If a hot-added node has not been initialized for the cache, your code is
> picking an existing one in zonelist order which may be excluded by
> current's cpuset. Thus, your code has a very real chance of having
> kmem_getpages() return NULL because get_page_from_freelist() will reject
> non-atomic ALLOC_CPUSET allocations for prohibited nodes. That isn't a
> scenario that requires a "funny cpuset," it just has to not allow whatever
> initialized node comes first in the zonelist.
The point was that you would need to run whoever triggers the memory
hotadd in a cpuset with limitations. That would be a clear
don't do that if it hurts(tm)
> My suggested alternative does not pick a single initialized node, rather
> it tries all nodes that actually have a chance of having kmem_getpages()
> succeed which increases the probability that your patch actually has an
> effect for cpuset users.
cpuset users are unlikely to trigger memory hotadds from inside limiting
cpusets. Typically that's done from udev etc.
-Andi
--
[email protected] -- Speaking for myself only.
On Sat, 6 Feb 2010, Andi Kleen wrote:
> > If a hot-added node has not been initialized for the cache, your code is
> > picking an existing one in zonelist order which may be excluded by
> > current's cpuset. Thus, your code has a very real chance of having
> > kmem_getpages() return NULL because get_page_from_freelist() will reject
> > non-atomic ALLOC_CPUSET allocations for prohibited nodes. That isn't a
> > scenario that requires a "funny cpuset," it just has to not allow whatever
> > initialized node comes first in the zonelist.
>
> The point was that you would need to run whoever triggers the memory
> hotadd in a cpuset with limitations. That would be a clear
> don't do that if it hurts(tm)
>
With a subset of memory nodes, yes. What else prohibits that except for
your new code?
There's a second issue with this approach that I alluded to above: you're
picking the first initialized node for the cache based solely on whether
it is allocated or not. kmem_getpages() may still return NULL when it
would return new slab for any other initialized node, so you're better off
trying them all.
In other words, my earlier (untested) suggestion:
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3172,6 +3172,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
gfp_t local_flags;
struct zoneref *z;
struct zone *zone;
+ nodemask_t allowed_nodes = NODE_MASK_NONE;
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
@@ -3197,6 +3198,7 @@ retry:
flags | GFP_THISNODE, nid);
if (obj)
break;
+ node_set(nid, allowed_nodes);
}
}
@@ -3210,7 +3212,15 @@ retry:
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, local_flags, numa_node_id());
+ nid = numa_node_id();
+ if (cache->nodelists[nid])
+ obj = kmem_getpages(cache, local_flags, nid);
+ else
+ for_each_node_mask(nid, allowed_nodes) {
+ obj = kmem_getpages(cache, local_flags, nid);
+ if (obj)
+ break;
+ }
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {
Anyway, I'll leave these otherwise unnecessary limitations to Pekka.
Thanks.