Fix a cpu hotplug race condition while tuning slab cpu caches.
CPU online (cpu_up) and slab cache tuning (do_tune_cpucache) can
race, leaving a newly onlined cpu with no array cache allocated.
The race can be explained as follows:
cpu_up(3)                                 slabinfo_write
---------                                 --------------
cpu_online_map = 00000111
cpuup_callback CPU_UP_PREPARE:
  mutex_lock(&cache_chain_mutex);
  ...
  allocate array_cache for cpu 3
  ...
  mutex_unlock(&cache_chain_mutex);
                                          mutex_lock(&cache_chain_mutex);
                                          do_tune_cpucache
                                            allocate new array_cache for
                                            cpus 0,1,2; NULL for the rest
mutex_lock(&cpu_bitmask_lock);
cpu_online_map = 00001111
mutex_unlock(&cpu_bitmask_lock);
                                          on_each_cpu: swap the new
                                          array_cache with the old
                                          ^^^^
                                          CPU 3 gets assigned a NULL
                                          array_cache
Hence, when do_ccupdate_local is run on CPU 3, CPU 3 ends up with a NULL
array_cache, and badness follows.
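For reference, the window exists because do_tune_cpucache() sizes its
update against a snapshot of cpu_online_map (condensed sketch of the
slab.c logic; error handling and freeing of the old arrays trimmed):

	struct ccupdate_struct {
		struct kmem_cache *cachep;
		struct array_cache *new[NR_CPUS];
	};

	static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
				    int batchcount, int shared)
	{
		struct ccupdate_struct new;
		int i;

		memset(&new.new, 0, sizeof(new.new));
		/* snapshot of cpu_online_map: cpus not online yet keep NULL */
		for_each_online_cpu(i)
			new.new[i] = alloc_arraycache(cpu_to_node(i),
						      limit, batchcount);
		new.cachep = cachep;

		/*
		 * If a cpu came online after the loop above, it executes
		 * do_ccupdate_local() too and swaps in its NULL entry.
		 */
		on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);

		/* (error handling and freeing of the old arrays omitted) */
		return 0;
	}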
So don't allow cpus to come and go while in do_tune_cpucache.
The other path into do_tune_cpucache, via kmem_cache_create, is
already protected by lock_cpu_hotplug.
Signed-off-by: Alok N Kataria <[email protected]>
Signed-off-by: Ravikiran Thirumalai <[email protected]>
Signed-off-by: Shai Fultheim <[email protected]>
Index: linux-2.6.19-rc1slab/mm/slab.c
===================================================================
--- linux-2.6.19-rc1slab.orig/mm/slab.c 2006-10-13 12:35:02.578841000 -0700
+++ linux-2.6.19-rc1slab/mm/slab.c 2006-10-13 12:35:46.848841000 -0700
@@ -4072,6 +4072,7 @@ ssize_t slabinfo_write(struct file *file
return -EINVAL;
/* Find the cache in the chain of caches. */
+ lock_cpu_hotplug();
mutex_lock(&cache_chain_mutex);
res = -EINVAL;
list_for_each_entry(cachep, &cache_chain, next) {
@@ -4087,6 +4088,7 @@ ssize_t slabinfo_write(struct file *file
}
}
mutex_unlock(&cache_chain_mutex);
+ unlock_cpu_hotplug();
if (res >= 0)
res = count;
return res;
On Mon, 16 Oct 2006, Ravikiran G Thirumalai wrote:
> Fix a cpu hotplug race condition while tuning slab cpu caches.
Acked-by: Christoph Lameter <[email protected]>
We probably have a huge number of similar races. Basically any use of
the online cpu map has the potential of causing a race.
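The generic broken pattern is two uses of the map that must agree,
with nothing holding off hotplug in between (made-up example; 'foo'
and 'touch_counter' are hypothetical):

	cpumask_t mask = cpu_online_map;	/* snapshot at time T0 */
	int cpu;

	/* a cpu may come or go here */
	for_each_cpu_mask(cpu, mask)		/* consumed at time T1 */
		touch_counter(&per_cpu(foo, cpu));	/* cpu may be gone */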
On Mon, 16 Oct 2006 01:54:39 -0700
Ravikiran G Thirumalai <[email protected]> wrote:
> Fix a cpu hotplug race condition while tuning slab cpu caches.
>
> CPU online (cpu_up) and slab cache tuning (do_tune_cpucache) can
> race, leaving a newly onlined cpu with no array cache allocated.
lock_cpu_hotplug() is a noxious thing which should be removed.
> The race can be explained as follows:
>
> cpu_up(3)                                 slabinfo_write
> ---------                                 --------------
> cpu_online_map = 00000111
> cpuup_callback CPU_UP_PREPARE:
>   mutex_lock(&cache_chain_mutex);
>   ...
>   allocate array_cache for cpu 3
>   ...
>   mutex_unlock(&cache_chain_mutex);
>                                           mutex_lock(&cache_chain_mutex);
>                                           do_tune_cpucache
>                                             allocate new array_cache for
>                                             cpus 0,1,2; NULL for the rest
> mutex_lock(&cpu_bitmask_lock);
> cpu_online_map = 00001111
> mutex_unlock(&cpu_bitmask_lock);
>                                           on_each_cpu: swap the new
>                                           array_cache with the old
>                                           ^^^^
>                                           CPU 3 gets assigned a NULL
>                                           array_cache
The problem is obvious: we have some data (the array caches) and we have a
data structure which is used to look up that data (cpu_online_map). But
we're releasing the lock while these two things are in an inconsistent
state.
So you could have fixed this by taking cache_chain_mutex in CPU_UP_PREPARE
and releasing it in CPU_ONLINE and CPU_UP_CANCELED.
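Roughly this, i.e. leave the mutex held across the whole transition
(untested sketch based on the existing cpuup_callback cases):

	static int __cpuinit cpuup_callback(struct notifier_block *nfb,
					    unsigned long action, void *hcpu)
	{
		switch (action) {
		case CPU_UP_PREPARE:
			mutex_lock(&cache_chain_mutex);
			/* allocate array caches for the incoming cpu,
			 * then return with the mutex deliberately held */
			break;
		case CPU_ONLINE:
		case CPU_UP_CANCELED:
			/* cpu_online_map and the array caches are
			 * consistent again - safe to drop the lock */
			mutex_unlock(&cache_chain_mutex);
			break;
		}
		return NOTIFY_OK;
	}

Then do_tune_cpucache() keeps taking cache_chain_mutex as it does now,
and can never observe the map and the caches half-updated.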
> Hence, when do_ccupdate_local is run on CPU 3, CPU 3 ends up with a NULL
> array_cache, and badness follows.
>
> So don't allow cpus to come and go while in do_tune_cpucache.
> The other path into do_tune_cpucache, via kmem_cache_create, is
> already protected by lock_cpu_hotplug.
>
> Signed-off-by: Alok N Kataria <[email protected]>
> Signed-off-by: Ravikiran Thirumalai <[email protected]>
> Signed-off-by: Shai Fultheim <[email protected]>
>
> Index: linux-2.6.19-rc1slab/mm/slab.c
> ===================================================================
> --- linux-2.6.19-rc1slab.orig/mm/slab.c 2006-10-13 12:35:02.578841000 -0700
> +++ linux-2.6.19-rc1slab/mm/slab.c 2006-10-13 12:35:46.848841000 -0700
> @@ -4072,6 +4072,7 @@ ssize_t slabinfo_write(struct file *file
> return -EINVAL;
>
> /* Find the cache in the chain of caches. */
> + lock_cpu_hotplug();
> mutex_lock(&cache_chain_mutex);
> res = -EINVAL;
> list_for_each_entry(cachep, &cache_chain, next) {
> @@ -4087,6 +4088,7 @@ ssize_t slabinfo_write(struct file *file
> }
> }
> mutex_unlock(&cache_chain_mutex);
> + unlock_cpu_hotplug();
> if (res >= 0)
> res = count;
> return res;
Given that this lock_cpu_hotplug() happens at a high level I guess it'll
avoid the usual lock_cpu_hotplug() horrors and we can live with it. I
assume lockdep was enabled when you were testing this?
On Mon, Oct 16, 2006 at 11:15:11AM -0700, Andrew Morton wrote:
> On Mon, 16 Oct 2006 01:54:39 -0700
> Ravikiran G Thirumalai <[email protected]> wrote:
>
> The problem is obvious: we have some data (the array caches) and we have a
> data structure which is used to look up that data (cpu_online_map). But
> we're releasing the lock while these two things are in an inconsistent
> state.
>
> So you could have fixed this by taking cache_chain_mutex in CPU_UP_PREPARE
> and releasing it in CPU_ONLINE and CPU_UP_CANCELED.
Hmm, yes. I suppose so. Maybe we can do away with other uses of
lock_cpu_hotplug() in slab.c as well then! Will give it a shot. Slab
locking might end up looking even uglier than it already is, though, no?
>
> > list_for_each_entry(cachep, &cache_chain, next) {
> > @@ -4087,6 +4088,7 @@ ssize_t slabinfo_write(struct file *file
> > }
> > }
> > mutex_unlock(&cache_chain_mutex);
> > + unlock_cpu_hotplug();
> > if (res >= 0)
> > res = count;
> > return res;
>
> Given that this lock_cpu_hotplug() happens at a high level I guess it'll
> avoid the usual lock_cpu_hotplug() horrors and we can live with it. I
> assume lockdep was enabled when you were testing this?
Not when I tested it. I just retested with lockdep on and things seemed
fine on an SMP box.
Thanks,
Kiran
On Mon, 16 Oct 2006 12:26:15 -0700
Ravikiran G Thirumalai <[email protected]> wrote:
> On Mon, Oct 16, 2006 at 11:15:11AM -0700, Andrew Morton wrote:
> > On Mon, 16 Oct 2006 01:54:39 -0700
> > Ravikiran G Thirumalai <[email protected]> wrote:
> >
> > The problem is obvious: we have some data (the array caches) and we have a
> > data structure which is used to look up that data (cpu_online_map). But
> > we're releasing the lock while these two things are in an inconsistent
> > state.
> >
> > So you could have fixed this by taking cache_chain_mutex in CPU_UP_PREPARE
> > and releasing it in CPU_ONLINE and CPU_UP_CANCELED.
>
> Hmm, yes. I suppose so. Maybe we can do away with other uses of
> lock_cpu_hotplug() in slab.c as well then!
Yes, we can.
> Will give it a shot. Slab locking might end up looking even uglier
> than it already is, though, no?
I suspect it will improve - all the calls to lock_cpu_hotplug() go away.
But it gets conceptually more complex: we're now holding cache_chain_mutex
across some or all of the _other_ callbacks which are registered on the cpu
notifier list. And they could be anything at all, and in any order.
But as long as things are well-designed and layered and interactions are
minimised (all things we like to achieve) then we're OK. But it's a worry.
Fortunately lockdep helps a lot here.
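To make the worry concrete (hypothetical nesting; which order you get
depends entirely on notifier registration order):

	CPU_UP_PREPARE notifier chain:
	  cpuup_callback:         mutex_lock(&cache_chain_mutex)  [kept held]
	  workqueue_cpu_callback: mutex_lock(&workqueue_mutex)
	                          -> workqueue_mutex now nests inside
	                             cache_chain_mutex

Any other path which takes those two locks in the opposite order
deadlocks, and nothing in the notifier API makes such orderings
visible at the design level.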
> >
> > > list_for_each_entry(cachep, &cache_chain, next) {
> > > @@ -4087,6 +4088,7 @@ ssize_t slabinfo_write(struct file *file
> > > }
> > > }
> > > mutex_unlock(&cache_chain_mutex);
> > > + unlock_cpu_hotplug();
> > > if (res >= 0)
> > > res = count;
> > > return res;
> >
> > Given that this lock_cpu_hotplug() happens at a high level I guess it'll
> > avoid the usual lock_cpu_hotplug() horrors and we can live with it. I
> > assume lockdep was enabled when you were testing this?
>
> Not when I tested it. I just retested with lockdep on and things seemed
> fine on an SMP box.
Great, thanks. Please ensure that lockdep is used when testing the kernel.
Also preempt, DEBUG_SLAB, DEBUG_SPINLOCK_SLEEP and various other things.
I guess the list in Documentation/SubmitChecklist is the definitive one.
(Was: [patch] slab: Fix a cpu hotplug race condition while tuning slab cpu
caches)
On Mon, Oct 16, 2006 at 12:48:08PM -0700, Andrew Morton wrote:
> >
> > Not when I tested it. I just retested with lockdep on and things seemed
> > fine on an SMP box.
>
> Great, thanks. Please ensure that lockdep is used when testing the kernel.
> Also preempt, DEBUG_SLAB, DEBUG_SPINLOCK_SLEEP and various other things.
> I guess the list in Documentation/SubmitChecklist is the definitive one.
Seems like cpu offline spits out lockdep warnings when actually offlining a
CPU with CPU_HOTPLUG + CONFIG_PREEMPT + CONFIG_SLAB_DEBUG etc. Here is the
trace I get. Note that this is plain 2.6.19-rc2 (_without_ the slab cpu
hotplug fix).
This is a dual-CPU, 4-thread Xeon SMP box with 8G of main memory. I am also
attaching the .config I used, and the full dmesg.
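If I read the two dependency chains below correctly, the problem is
that workqueue_mutex is held across notifier invocations:

	#1: blocking_notifier_call_chain(CPU_DOWN_PREPARE)
	      holds (cpu_chain).rwsem, then workqueue_cpu_callback()
	      takes workqueue_mutex           [rwsem -> workqueue_mutex]
	#0: workqueue_mutex is still held when the later
	      blocking_notifier_call_chain(CPU_DEAD) takes the rwsem
	                                      [workqueue_mutex -> rwsem]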
Thanks,
Kiran
[ 235.667515] CPU 3 is now offline
[ 235.670860]
[ 235.670861] =======================================================
[ 235.678694] [ INFO: possible circular locking dependency detected ]
[ 235.684990] 2.6.19-rc2 #1
[ 235.687649] -------------------------------------------------------
[ 235.693945] bash/3223 is trying to acquire lock:
[ 235.698596] ((cpu_chain).rwsem){----}, at: [<ffffffff8023ea62>] blocking_notifier_call_chain+0x13/0x36
[ 235.708226]
[ 235.708226] but task is already holding lock:
[ 235.714149] (workqueue_mutex){--..}, at: [<ffffffff802422a1>] workqueue_cpu_callback+0x14e/0x2a4
[ 235.723260]
[ 235.723261] which lock already depends on the new lock.
[ 235.723262]
[ 235.731575]
[ 235.731575] the existing dependency chain (in reverse order) is:
[ 235.739144]
[ 235.739144] -> #1 (workqueue_mutex){--..}:
[ 235.744954] [<ffffffff80249cc9>] add_lock_to_list+0x78/0x9d
[ 235.751588] [<ffffffff8024bcd8>] __lock_acquire+0xaca/0xc37
[ 235.758222] [<ffffffff8024c0ff>] lock_acquire+0x5c/0x77
[ 235.764501] [<ffffffff802422a1>] workqueue_cpu_callback+0x14e/0x2a4
[ 235.771836] [<ffffffff8048d2fa>] __mutex_lock_slowpath+0xea/0x272
[ 235.778990] [<ffffffff802422a1>] workqueue_cpu_callback+0x14e/0x2a4
[ 235.786317] [<ffffffff8023e932>] notifier_call_chain+0x23/0x32
[ 235.793211] [<ffffffff8023ea71>] blocking_notifier_call_chain+0x22/0x36
[ 235.800901] [<ffffffff8024ff00>] _cpu_down+0x53/0x218
[ 235.807014] [<ffffffff802500f0>] cpu_down+0x2b/0x42
[ 235.812955] [<ffffffff803b09c8>] store_online+0x27/0x71
[ 235.819234] [<ffffffff802b67b6>] sysfs_write_file+0xcc/0xfb
[ 235.825878] [<ffffffff8027bfb5>] vfs_write+0xb2/0x155
[ 235.831983] [<ffffffff8027c10d>] sys_write+0x45/0x70
[ 235.838002] [<ffffffff802099be>] system_call+0x7e/0x83
[ 235.844194] [<ffffffffffffffff>] 0xffffffffffffffff
[ 235.850148]
[ 235.850148] -> #0 ((cpu_chain).rwsem){----}:
[ 235.856145] [<ffffffff8024977b>] save_trace+0x48/0xdf
[ 235.862250] [<ffffffff80249e67>] print_circular_bug_tail+0x34/0x70
[ 235.869491] [<ffffffff8024bbc5>] __lock_acquire+0x9b7/0xc37
[ 235.876125] [<ffffffff8048ef0e>] _spin_unlock_irqrestore+0x49/0x52
[ 235.883374] [<ffffffff8024c0ff>] lock_acquire+0x5c/0x77
[ 235.889653] [<ffffffff8023ea62>] blocking_notifier_call_chain+0x13/0x36
[ 235.897334] [<ffffffff8048c89f>] cond_resched+0x2f/0x3a
[ 235.903613] [<ffffffff80247fbf>] down_read+0x37/0x40
[ 235.909642] [<ffffffff8023ea62>] blocking_notifier_call_chain+0x13/0x36
[ 235.917322] [<ffffffff80250004>] _cpu_down+0x157/0x218
[ 235.923532] [<ffffffff802500f0>] cpu_down+0x2b/0x42
[ 235.929464] [<ffffffff803b09c8>] store_online+0x27/0x71
[ 235.935752] [<ffffffff802b67b6>] sysfs_write_file+0xcc/0xfb
[ 235.942386] [<ffffffff8027bfb5>] vfs_write+0xb2/0x155
[ 235.948491] [<ffffffff8027c10d>] sys_write+0x45/0x70
[ 235.954520] [<ffffffff802099be>] system_call+0x7e/0x83
[ 235.960712] [<ffffffffffffffff>] 0xffffffffffffffff
[ 235.966645]
[ 235.966645] other info that might help us debug this:
[ 235.966646]
[ 235.974812] 2 locks held by bash/3223:
[ 235.978604] #0: (cpu_add_remove_lock){--..}, at: [<ffffffff802500de>] cpu_down+0x19/0x42
[ 235.987160] #1: (workqueue_mutex){--..}, at: [<ffffffff802422a1>] workqueue_cpu_callback+0x14e/0x2a4
[ 235.996765]
[ 235.996766] stack backtrace:
[ 236.001217]
[ 236.001218] Call Trace:
[ 236.005238] [<ffffffff80249e9a>] print_circular_bug_tail+0x67/0x70
[ 236.011543] [<ffffffff8024bbc5>] __lock_acquire+0x9b7/0xc37
[ 236.017241] [<ffffffff8048ef0e>] _spin_unlock_irqrestore+0x49/0x52
[ 236.023546] [<ffffffff8024c0ff>] lock_acquire+0x5c/0x77
[ 236.028898] [<ffffffff8023ea62>] blocking_notifier_call_chain+0x13/0x36
[ 236.035634] [<ffffffff8048c89f>] cond_resched+0x2f/0x3a
[ 236.040987] [<ffffffff80247fbf>] down_read+0x37/0x40
[ 236.046080] [<ffffffff8023ea62>] blocking_notifier_call_chain+0x13/0x36
[ 236.052818] [<ffffffff80250004>] _cpu_down+0x157/0x218
[ 236.058084] [<ffffffff802500f0>] cpu_down+0x2b/0x42
[ 236.063089] [<ffffffff803b09c8>] store_online+0x27/0x71
[ 236.068441] [<ffffffff802b67b6>] sysfs_write_file+0xcc/0xfb
[ 236.074140] [<ffffffff8027bfb5>] vfs_write+0xb2/0x155
[ 236.079319] [<ffffffff8027c10d>] sys_write+0x45/0x70
[ 236.084411] [<ffffffff802099be>] system_call+0x7e/0x83
[ 236.089677]
On Monday October 16, [email protected] wrote:
> (Was: [patch] slab: Fix a cpu hotplug race condition while tuning slab cpu
> caches)
>
> On Mon, Oct 16, 2006 at 12:48:08PM -0700, Andrew Morton wrote:
> > >
> > > Not when I tested it. I just retested with lockdep on and things seemed
> > > fine on a SMP.
> >
> > Great, thanks. Please ensure that lockdep is used when testing the kernel.
> > Also preempt, DEBUG_SLAB, DEBUG_SPINLOCK_SLEEP and various other things.
> > I guess the list in Documentation/SubmitChecklist is the definitive one.
>
> Seems like cpu offline spits out lockdep warnings when actually offlining a
> CPU with CPU_HOTPLUG + CONFIG_PREEMPT + CONFIG_SLAB_DEBUG etc. Here is the
> trace I get. Note that this is plain 2.6.19-rc2 (_without_ the slab cpu
> hotplug fix).
That one is fixed by the following patch which will be in the next
-mm.
Thanks,
NeilBrown
Convert cpu hotplug notifiers to use raw_notifier instead of blocking_notifier
The use of blocking notifier by _cpu_up and _cpu_down in cpu.c
has two problems.
1/ An interaction with the workqueue notifier causes lockdep to
spit a warning.
2/ A notifier could conceivably be added or removed while _cpu_up or
   _cpu_down are in progress. As each notifier is called twice
   (prepare, then commit/abort) this could be unhealthy.
To fix this, we simply take cpu_add_remove_lock while adding
or removing notifiers to/from the list.
This makes the 'blocking' usage unnecessary as all accesses to
cpu_chain are now protected by cpu_add_remove_lock. So
change "blocking" to "raw" in all relevant places.
This fixes problem 1.
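Sketch of why the conversion is what fixes 1:
blocking_notifier_call_chain() wraps the callbacks in the chain's
rwsem, so any mutex a callback takes (workqueue_mutex in the report)
nests inside it; raw_notifier_call_chain() takes no internal lock at
all, so that rwsem simply disappears from the dependency graph:

	/* before */
	blocking_notifier_call_chain(&cpu_chain, val, v);
		down_read(&cpu_chain.rwsem);
		notifier_call_chain(...);  /* callbacks take their own locks */
		up_read(&cpu_chain.rwsem);

	/* after - serialised by the caller's cpu_add_remove_lock instead */
	raw_notifier_call_chain(&cpu_chain, val, v);
		notifier_call_chain(...);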
Credit: Andrew Morton
Cc: [email protected]
Signed-off-by: Neil Brown <[email protected]>
### Diffstat output
./kernel/cpu.c | 24 +++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)
diff .prev/kernel/cpu.c ./kernel/cpu.c
--- .prev/kernel/cpu.c 2006-10-16 15:41:11.000000000 +1000
+++ ./kernel/cpu.c 2006-10-13 14:33:49.000000000 +1000
@@ -19,7 +19,7 @@
static DEFINE_MUTEX(cpu_add_remove_lock);
static DEFINE_MUTEX(cpu_bitmask_lock);
-static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
+static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
@@ -68,7 +68,11 @@ EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
/* Need to know about CPUs going up/down? */
int __cpuinit register_cpu_notifier(struct notifier_block *nb)
{
- return blocking_notifier_chain_register(&cpu_chain, nb);
+ int ret;
+ mutex_lock(&cpu_add_remove_lock);
+ ret = raw_notifier_chain_register(&cpu_chain, nb);
+ mutex_unlock(&cpu_add_remove_lock);
+ return ret;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -77,7 +81,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
void unregister_cpu_notifier(struct notifier_block *nb)
{
- blocking_notifier_chain_unregister(&cpu_chain, nb);
+ mutex_lock(&cpu_add_remove_lock);
+ raw_notifier_chain_unregister(&cpu_chain, nb);
+ mutex_unlock(&cpu_add_remove_lock);
}
EXPORT_SYMBOL(unregister_cpu_notifier);
@@ -126,7 +132,7 @@ static int _cpu_down(unsigned int cpu)
if (!cpu_online(cpu))
return -EINVAL;
- err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+ err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
(void *)(long)cpu);
if (err == NOTIFY_BAD) {
printk("%s: attempt to take down CPU %u failed\n",
@@ -146,7 +152,7 @@ static int _cpu_down(unsigned int cpu)
if (IS_ERR(p)) {
/* CPU didn't die: tell everyone. Can't complain. */
- if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
(void *)(long)cpu) == NOTIFY_BAD)
BUG();
@@ -169,7 +175,7 @@ static int _cpu_down(unsigned int cpu)
put_cpu();
/* CPU is completely dead: tell everyone. Too late to complain. */
- if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
+ if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD,
(void *)(long)cpu) == NOTIFY_BAD)
BUG();
@@ -206,7 +212,7 @@ static int __devinit _cpu_up(unsigned in
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
- ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+ ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
if (ret == NOTIFY_BAD) {
printk("%s: attempt to bring up CPU %u failed\n",
__FUNCTION__, cpu);
@@ -223,11 +229,11 @@ static int __devinit _cpu_up(unsigned in
BUG_ON(!cpu_online(cpu));
/* Now call notifier in preparation. */
- blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
out_notify:
if (ret != 0)
- blocking_notifier_call_chain(&cpu_chain,
+ raw_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED, hcpu);
return ret;