Here's an attempt at doing away with lock_cpu_hotplug() in the slab
subsystem. This approach also fixes a bug which shows up when cpus are
being offlined/onlined and slab caches are being tuned simultaneously:
http://marc.theaimsgroup.com/?l=linux-kernel&m=116098888100481&w=2
The patch has been stress tested overnight on a 2-socket, 4-core AMD box with
repeated cpu online and offline, while dbench and kernbench processes were
running and slab caches were being tuned at the same time.
There were no lockdep warnings either. (This test was run on 2.6.18, as 2.6.19-rc
crashes at __drain_pages:
http://marc.theaimsgroup.com/?l=linux-kernel&m=116172164217678&w=2 )
The approach here is to hold cache_chain_mutex from CPU_UP_PREPARE until
CPU_ONLINE (similar in approach to workqueue_mutex). Slab code sensitive
to cpu_online_map (kmem_cache_create, kmem_cache_destroy, slabinfo_write,
__cache_shrink) is already serialized with cache_chain_mutex. (This patch
lengthens cache_chain_mutex hold time at kmem_cache_destroy to cover this).
This patch also takes the cache_chain_sem at kmem_cache_shrink to
protect sanity of cpu_online_map at __cache_shrink, as viewed by slab.
(kmem_cache_shrink->__cache_shrink->drain_cpu_caches). But, really,
kmem_cache_shrink is used at just one place in the acpi subsystem!
Do we really need to keep kmem_cache_shrink at all?
Another note. Looks like a cpu hotplug event can send CPU_UP_CANCELED to
a registered subsystem even if the subsystem did not receive CPU_UP_PREPARE.
This could be due to a subsystem registered for notification earlier than
the current subsystem crapping out with NOTIFY_BAD. Badness can occur within
the CPU_UP_CANCELED code path at slab if this happens (The same would
apply for workqueue.c as well). To overcome this, we might have to use either
a) a per subsystem flag and avoid handling of CPU_UP_CANCELED, or
b) Use special notifier events like LOCK_ACQUIRE/RELEASE as Gautham was
using in his experiments, or
c) Do not send CPU_UP_CANCELED to a subsystem which did not receive
CPU_UP_PREPARE.
I would prefer c).
Comments?
Signed-off-by: Ravikiran Thirumalai <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>
Index: linux-2.6.19-rc3/mm/slab.c
===================================================================
--- linux-2.6.19-rc3.orig/mm/slab.c 2006-10-24 10:26:44.672341000 -0700
+++ linux-2.6.19-rc3/mm/slab.c 2006-10-27 16:09:48.371537000 -0700
@@ -730,7 +730,10 @@ static inline void init_lock_keys(void)
}
#endif
-/* Guard access to the cache-chain. */
+/*
+ * 1. Guard access to the cache-chain.
+ * 2. Protect sanity of cpu_online_map against cpu hotplug events
+ */
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
@@ -1230,12 +1233,18 @@ static int __cpuinit cpuup_callback(stru
kfree(shared);
free_alien_cache(alien);
}
- mutex_unlock(&cache_chain_mutex);
break;
case CPU_ONLINE:
+ mutex_unlock(&cache_chain_mutex);
start_cpu_timer(cpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DOWN_PREPARE:
+ mutex_lock(&cache_chain_mutex);
+ break;
+ case CPU_DOWN_FAILED:
+ mutex_unlock(&cache_chain_mutex);
+ break;
case CPU_DEAD:
/*
* Even if all the cpus of a node are down, we don't free the
@@ -1246,8 +1255,8 @@ static int __cpuinit cpuup_callback(stru
* gets destroyed at kmem_cache_destroy().
*/
/* fall thru */
+#endif
case CPU_UP_CANCELED:
- mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
struct array_cache *shared;
@@ -1308,11 +1317,9 @@ free_array_cache:
}
mutex_unlock(&cache_chain_mutex);
break;
-#endif
}
return NOTIFY_OK;
bad:
- mutex_unlock(&cache_chain_mutex);
return NOTIFY_BAD;
}
@@ -2098,11 +2105,9 @@ kmem_cache_create (const char *name, siz
}
/*
- * Prevent CPUs from coming and going.
- * lock_cpu_hotplug() nests outside cache_chain_mutex
+ * We use cache_chain_mutex to ensure a consistent view of
+ * cpu_online_map as well. Please see cpuup_callback
*/
- lock_cpu_hotplug();
-
mutex_lock(&cache_chain_mutex);
list_for_each_entry(pc, &cache_chain, next) {
@@ -2326,7 +2331,6 @@ oops:
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
mutex_unlock(&cache_chain_mutex);
- unlock_cpu_hotplug();
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -2444,6 +2448,7 @@ out:
return nr_freed;
}
+/* Called with cache_chain_mutex held to protect against cpu hotplug */
static int __cache_shrink(struct kmem_cache *cachep)
{
int ret = 0, i = 0;
@@ -2474,9 +2479,13 @@ static int __cache_shrink(struct kmem_ca
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
+ int ret;
BUG_ON(!cachep || in_interrupt());
- return __cache_shrink(cachep);
+ mutex_lock(&cache_chain_mutex);
+ ret = __cache_shrink(cachep);
+ mutex_unlock(&cache_chain_mutex);
+ return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -2500,23 +2509,16 @@ void kmem_cache_destroy(struct kmem_cach
{
BUG_ON(!cachep || in_interrupt());
- /* Don't let CPUs to come and go */
- lock_cpu_hotplug();
-
/* Find the cache in the chain of caches. */
mutex_lock(&cache_chain_mutex);
/*
* the chain is never empty, cache_cache is never destroyed
*/
list_del(&cachep->next);
- mutex_unlock(&cache_chain_mutex);
-
if (__cache_shrink(cachep)) {
slab_error(cachep, "Can't free all objects");
- mutex_lock(&cache_chain_mutex);
list_add(&cachep->next, &cache_chain);
mutex_unlock(&cache_chain_mutex);
- unlock_cpu_hotplug();
return;
}
@@ -2524,7 +2526,7 @@ void kmem_cache_destroy(struct kmem_cach
synchronize_rcu();
__kmem_cache_destroy(cachep);
- unlock_cpu_hotplug();
+ mutex_unlock(&cache_chain_mutex);
}
EXPORT_SYMBOL(kmem_cache_destroy);
On Fri, 27 Oct 2006 18:19:19 -0700
Ravikiran G Thirumalai <[email protected]> wrote:
> Another note. Looks like a cpu hotplug event can send CPU_UP_CANCELED to
> a registered subsystem even if the subsystem did not receive CPU_UP_PREPARE.
> This could be due to a subsystem registered for notification earlier than
> the current subsystem crapping out with NOTIFY_BAD. Badness can occur within
> the CPU_UP_CANCELED code path at slab if this happens (The same would
> apply for workqueue.c as well).
yup, cancellation doesn't work at present.
> To overcome this, we might have to use either
> a) a per subsystem flag and avoid handling of CPU_UP_CANCELED, or
> b) Use special notifier events like LOCK_ACQUIRE/RELEASE as Gautham was
> using in his experiments, or
> c) Do not send CPU_UP_CANCELED to a subsystem which did not receive
> CPU_UP_PREPARE.
>
> I would prefer c).
c) would work. I guess we could do that by simply counting the number of
called handlers rather than having to record state within each one.
It would require changes to the notifier_chain API, but I think the changes
are needed - the problem is general. Something like:
int __raw_notifier_call_chain(struct raw_notifier_head *nh,
		unsigned long val, void *v, unsigned nr_to_call, int *nr_called);

int raw_notifier_call_chain(struct raw_notifier_head *nh,
		unsigned long val, void *v)
{
	return __raw_notifier_call_chain(nh, val, v, -1, NULL);
}
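For illustration, the CPU_UP_PREPARE caller could then cancel only the
handlers that actually ran. This is just a sketch against the API proposed
above (the cpu_chain head type and the error path here are assumptions, not
existing code):

	static int example_cpu_up_notify(unsigned int cpu)
	{
		void *hcpu = (void *)(long)cpu;
		int nr_called = 0;
		int ret;

		ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE,
						hcpu, -1, &nr_called);
		if (ret == NOTIFY_BAD) {
			/*
			 * Only the first nr_called handlers saw CPU_UP_PREPARE,
			 * so only they receive CPU_UP_CANCELED.
			 */
			__raw_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED,
						  hcpu, nr_called, NULL);
			return -EINVAL;
		}
		return 0;
	}

That way a subsystem never sees CPU_UP_CANCELED without having first seen
CPU_UP_PREPARE, and no per-subsystem state is needed.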
On Fri, Oct 27, 2006 at 06:19:19PM -0700, Ravikiran G Thirumalai wrote:
> This patch also takes the cache_chain_sem at kmem_cache_shrink to
> protect sanity of cpu_online_map at __cache_shrink, as viewed by slab.
> (kmem_cache_shrink->__cache_shrink->drain_cpu_caches). But, really,
drain_cpu_caches uses on_each_cpu(), which does a preempt_disable()
before using the cpu_online_map. That should be a safe enough access to the
bitmap?
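Roughly, on_each_cpu() at this point looks like the below (simplified from
memory, so treat it as a sketch rather than the exact source):

	int on_each_cpu(void (*func)(void *info), void *info, int retry, int wait)
	{
		int ret = 0;

		preempt_disable();
		/* smp_call_function() takes call_lock and walks cpu_online_map */
		ret = smp_call_function(func, info, retry, wait);
		local_irq_disable();
		func(info);
		local_irq_enable();
		preempt_enable();
		return ret;
	}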
> kmem_cache_shrink is used at just one place in the acpi subsystem!
> Do we really need to keep kmem_cache_shrink at all?
>
> Another note. Looks like a cpu hotplug event can send CPU_UP_CANCELED to
> a registered subsystem even if the subsystem did not receive CPU_UP_PREPARE.
> This could be due to a subsystem registered for notification earlier than
> the current subsystem crapping out with NOTIFY_BAD. Badness can occur within
> the CPU_UP_CANCELED code path at slab if this happens (The same would
> apply for workqueue.c as well). To overcome this, we might have to use either
> a) a per subsystem flag and avoid handling of CPU_UP_CANCELED, or
> b) Use special notifier events like LOCK_ACQUIRE/RELEASE as Gautham was
> using in his experiments, or
> c) Do not send CPU_UP_CANCELED to a subsystem which did not receive
> CPU_UP_PREPARE.
>
> I would prefer c).
I think we need both b) and c).
Let me explain.
The need for c) is straightforward.
The need for b) comes from the fact that _cpu_down messes with the
tsk->cpus_allowed mask (to possibly jump off the dying CPU). This would cause
sched_getaffinity() to potentially return a false value back to the user and
hence it was modified to take lock_cpu_hotplug() before reading
tsk->cpus_allowed.
If we are discarding this whole lock_cpu_hotplug(), then IMO, we should
use LOCK_ACQUIRE/RELEASE, where ACQUIRE notification is sent *before*
messing with tsk->cpus_allowed and RELEASE notification sent *after*
restoring tsk->cpus_allowed (something like below):
@@ -186,13 +186,14 @@ int cpu_down(unsigned int cpu)
{
int err = 0;
- mutex_lock(&cpu_add_remove_lock);
+ blocking_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE,
+ (void *)(long)cpu);
if (cpu_hotplug_disabled)
err = -EBUSY;
else
err = _cpu_down(cpu);
-
- mutex_unlock(&cpu_add_remove_lock);
+ blocking_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE,
+ (void *)(long)cpu);
return err;
}
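On the subsystem side, a notifier would then just pair the lock and unlock
with these events, e.g. in slab's cpuup_callback() (sketch only, assuming
the CPU_LOCK_ACQUIRE/CPU_LOCK_RELEASE events above):

	switch (action) {
	case CPU_LOCK_ACQUIRE:
		mutex_lock(&cache_chain_mutex);
		break;
	/* ... existing CPU_UP_PREPARE/CPU_ONLINE/CPU_DEAD handling ... */
	case CPU_LOCK_RELEASE:
		mutex_unlock(&cache_chain_mutex);
		break;
	}

which would also do away with the UP_PREPARE/ONLINE and
DOWN_PREPARE/DOWN_FAILED lock juggling in the slab patch.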
--
Regards,
vatsa
On Mon, Oct 30, 2006 at 03:52:06PM +0530, Srivatsa Vaddagiri wrote:
> On Fri, Oct 27, 2006 at 06:19:19PM -0700, Ravikiran G Thirumalai wrote:
> > This patch also takes the cache_chain_sem at kmem_cache_shrink to
> > protect sanity of cpu_online_map at __cache_shrink, as viewed by slab.
> > (kmem_cache_shrink->__cache_shrink->drain_cpu_caches). But, really,
>
> drain_cpu_caches uses on_each_cpu(), which does a preempt_disable()
> before using the cpu_online_map. That should be a safe enough access to the
> bitmap?
drain_cpu_caches has two call sites -- kmem_cache_destroy and
kmem_cache_shrink.
kmem_cache_shrink:
From what I can gather by looking at the cpu hotplug code, disabling
preemption before iterating over cpu_online_map ensures that a
cpu won't disappear from the bitmask (system). But it does not ensure that a
cpu won't come up right? (I see stop_machine usage in the cpu_down path, but
not in the cpu_up path). But then on closer look I see that on_each_cpu
uses call_lock to protect the cpu_online_map against cpu_online events.
So, yes we don't need to take the cache_chain_sem here.
kmem_cache_destroy:
We still need to stay serialized against cpu online here. I guess
you already know why :)
http://lkml.org/lkml/2004/3/23/80
> > Another note. Looks like a cpu hotplug event can send CPU_UP_CANCELED to
> > a registered subsystem even if the subsystem did not receive CPU_UP_PREPARE.
> > This could be due to a subsystem registered for notification earlier than
> > the current subsystem crapping out with NOTIFY_BAD. Badness can occur within
> > the CPU_UP_CANCELED code path at slab if this happens (The same would
> > apply for workqueue.c as well). To overcome this, we might have to use either
> > a) a per subsystem flag and avoid handling of CPU_UP_CANCELED, or
> > b) Use special notifier events like LOCK_ACQUIRE/RELEASE as Gautham was
> > using in his experiments, or
> > c) Do not send CPU_UP_CANCELED to a subsystem which did not receive
> > CPU_UP_PREPARE.
> >
> > I would prefer c).
>
> I think we need both b) and c).
>
> Let me explain.
>
> The need for c) is straightforward.
>
> The need for b) comes from the fact that _cpu_down messes with the
> tsk->cpus_allowed mask (to possibly jump off the dying CPU). This would cause
> sched_getaffinity() to potentially return a false value back to the user and
> hence it was modified to take lock_cpu_hotplug() before reading
> tsk->cpus_allowed.
Maybe I am missing something, but what prevents someone from reading the
wrong tsk->cpus_allowed at (A) below?
static int _cpu_down(unsigned int cpu)
{
	...
	...
	set_cpus_allowed(current, tmp);
				----- (A)
	mutex_lock(&cpu_bitmask_lock);
	p = __stop_machine_run(take_cpu_down, NULL, cpu);
	...
}
>
> If we are discarding this whole lock_cpu_hotplug(), then IMO, we should
> use LOCK_ACQUIRE/RELEASE, where ACQUIRE notification is sent *before*
> messing with tsk->cpus_allowed and RELEASE notification sent *after*
> restoring tsk->cpus_allowed (something like below):
>
> @@ -186,13 +186,14 @@ int cpu_down(unsigned int cpu)
> {
> int err = 0;
>
> - mutex_lock(&cpu_add_remove_lock);
> + blocking_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE,
> + (void *)(long)cpu);
> if (cpu_hotplug_disabled)
> err = -EBUSY;
> else
> err = _cpu_down(cpu);
> -
> - mutex_unlock(&cpu_add_remove_lock);
> + blocking_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE,
> + (void *)(long)cpu);
> return err;
> }
>
But, since we send CPU_DOWN_PREPARE at _cpu_down before set_cpus_allowed(),
is it not possible to take the per scheduler subsystem lock at DOWN_PREPARE
and serialize sched_getaffinity with the same per scheduler subsys lock?
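For illustration, something along these lines (sched_hotcpu_mutex is a
made-up name here, and the sched_getaffinity() body is only an approximation
of the current code):

	static DEFINE_MUTEX(sched_hotcpu_mutex);

	long sched_getaffinity(pid_t pid, cpumask_t *mask)
	{
		struct task_struct *p;
		int retval = -ESRCH;

		/* replaces lock_cpu_hotplug(); also taken at CPU_DOWN_PREPARE */
		mutex_lock(&sched_hotcpu_mutex);
		read_lock(&tasklist_lock);

		p = find_process_by_pid(pid);
		if (p) {
			retval = 0;
			cpus_and(*mask, p->cpus_allowed, cpu_online_map);
		}

		read_unlock(&tasklist_lock);
		mutex_unlock(&sched_hotcpu_mutex);
		return retval;
	}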
Thanks,
Kiran
On Mon, Oct 30, 2006 at 07:51:40PM -0800, Ravikiran G Thirumalai wrote:
> kmem_cache_shrink:
> From what I can gather by looking at the cpu hotplug code, disabling
> preemption before iterating over cpu_online_map ensures that a
> cpu won't disappear from the bitmask (system). But it does not ensure that a
> cpu won't come up right? (I see stop_machine usage in the cpu_down path, but
> not in the cpu_up path). But then on closer look I see that on_each_cpu
> uses call_lock to protect the cpu_online_map against cpu_online events.
> So, yes we don't need to take the cache_chain_sem here.
Yes, that's what I thought.
> kmem_cache_destroy:
> We still need to stay serialized against cpu online here. I guess
> you already know why :)
> http://lkml.org/lkml/2004/3/23/80
sure!
> Maybe I am missing something, but what prevents someone from reading the
> wrong tsk->cpus_allowed at (A) below?
>
> static int _cpu_down(unsigned int cpu)
> {
> ...
> ...
> set_cpus_allowed(current, tmp);
> ----- (A)
lock_cpu_hotplug() in sched_getaffinity was supposed to guard against
reading the wrong value that you point out. But with the recent churn of cpu
hotplug locking, this is broken now.
Ideally, lock_cpu_hotplug() should have taken something equivalent to
cpu_add_remove_lock (breaking cpufreq in the process :), but moving
mutex_lock(&cpu_bitmask_lock) a few lines up (before set_cpus_allowed) should
also work in this case.
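For illustration only (against the _cpu_down() excerpt in your mail, not a
tested patch), that reordering would be something like:

+	mutex_lock(&cpu_bitmask_lock);
 	set_cpus_allowed(current, tmp);
-	mutex_lock(&cpu_bitmask_lock);
 	p = __stop_machine_run(take_cpu_down, NULL, cpu);

i.e. take cpu_bitmask_lock before touching cpus_allowed, so nothing can read
it at (A).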
Alternatively, taking a per-subsystem lock in the DOWN_PREPARE/LOCK_ACQUIRE
notification, one that sched_getaffinity also uses, would work
(as you note below).
> mutex_lock(&cpu_bitmask_lock);
> p = __stop_machine_run(take_cpu_down, NULL, cpu);
> ...
> }
>
>
> >
> > If we are discarding this whole lock_cpu_hotplug(), then IMO, we should
> > use LOCK_ACQUIRE/RELEASE, where ACQUIRE notification is sent *before*
> > messing with tsk->cpus_allowed and RELEASE notification sent *after*
> > restoring tsk->cpus_allowed (something like below):
> >
> > @@ -186,13 +186,14 @@ int cpu_down(unsigned int cpu)
> > {
> > int err = 0;
> >
> > - mutex_lock(&cpu_add_remove_lock);
> > + blocking_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE,
> > + (void *)(long)cpu);
> > if (cpu_hotplug_disabled)
> > err = -EBUSY;
> > else
> > err = _cpu_down(cpu);
> > -
> > - mutex_unlock(&cpu_add_remove_lock);
> > + blocking_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE,
> > + (void *)(long)cpu);
> > return err;
> > }
> >
>
> But, since we send CPU_DOWN_PREPARE at _cpu_down before set_cpus_allowed(),
> is it not possible to take the per scheduler subsystem lock at DOWN_PREPARE
> and serialize sched_getaffinity with the same per scheduler subsys lock?
CPU_DOWN_PREPARE is a good enough time to acquire the (scheduler
subsystem) lock. But I am more concerned about when we release that lock.
Releasing at CPU_DEAD/CPU_DOWN_FAILED is too early, since the task's
cpus_allowed mask would not have been restored by then. That's why I thought
having a separate notification to release (and acquire) the lock would make
more sense.
--
Regards,
vatsa