Hi,
Commit 2f7021a8 ("cpufreq: protect 'policy->cpus' from offlining during
__gov_queue_work()") causes the following deadlock for me when using kernel
v3.10 on ARM EXYNOS4412:
[ 960.380000] INFO: task kworker/0:1:34 blocked for more than 120 seconds.
[ 960.385000] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 960.390000] kworker/0:1 D c0613d88 0 34 2 0x00000000
[ 960.395000] Workqueue: events od_dbs_timer
[ 960.400000] [<c0613d88>] (__schedule+0x39c/0x7ec) from [<c06145e8>] (schedule_preempt_disabled+0x24/0x34)
[ 960.410000] [<c06145e8>] (schedule_preempt_disabled+0x24/0x34) from [<c0612fe4>] (__mutex_lock_slowpath+0x15c/0x21c)
[ 960.420000] [<c0612fe4>] (__mutex_lock_slowpath+0x15c/0x21c) from [<c06130ec>] (mutex_lock+0x48/0x4c)
[ 960.430000] [<c06130ec>] (mutex_lock+0x48/0x4c) from [<c0027cdc>] (get_online_cpus+0x2c/0x48)
[ 960.440000] [<c0027cdc>] (get_online_cpus+0x2c/0x48) from [<c03b059c>] (gov_queue_work+0x1c/0xb8)
[ 960.450000] [<c03b059c>] (gov_queue_work+0x1c/0xb8) from [<c03afabc>] (od_dbs_timer+0xa8/0x12c)
[ 960.455000] [<c03afabc>] (od_dbs_timer+0xa8/0x12c) from [<c003ecdc>] (process_one_work+0x138/0x43c)
[ 960.465000] [<c003ecdc>] (process_one_work+0x138/0x43c) from [<c003f3d8>] (worker_thread+0x134/0x3b8)
[ 960.475000] [<c003f3d8>] (worker_thread+0x134/0x3b8) from [<c0044df8>] (kthread+0xa4/0xb0)
[ 960.485000] [<c0044df8>] (kthread+0xa4/0xb0) from [<c000ead8>] (ret_from_fork+0x14/0x3c)
[ 960.490000] INFO: task bash:2497 blocked for more than 120 seconds.
[ 960.495000] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 960.505000] bash D c0613d88 0 2497 2496 0x00000001
[ 960.510000] [<c0613d88>] (__schedule+0x39c/0x7ec) from [<c061244c>] (schedule_timeout+0x14c/0x20c)
[ 960.520000] [<c061244c>] (schedule_timeout+0x14c/0x20c) from [<c0613774>] (wait_for_common+0xac/0x150)
[ 960.530000] [<c0613774>] (wait_for_common+0xac/0x150) from [<c003deb8>] (flush_work+0xa4/0x130)
[ 960.540000] [<c003deb8>] (flush_work+0xa4/0x130) from [<c003f9d4>] (__cancel_work_timer+0x74/0x100)
[ 960.545000] [<c003f9d4>] (__cancel_work_timer+0x74/0x100) from [<c03b0dc8>] (cpufreq_governor_dbs+0x4dc/0x630)
[ 960.555000] [<c03b0dc8>] (cpufreq_governor_dbs+0x4dc/0x630) from [<c03ac70c>] (__cpufreq_governor+0x60/0x120)
[ 960.565000] [<c03ac70c>] (__cpufreq_governor+0x60/0x120) from [<c03ad1cc>] (cpufreq_add_dev+0x2f4/0x4d8)
[ 960.575000] [<c03ad1cc>] (cpufreq_add_dev+0x2f4/0x4d8) from [<c060dcd8>] (cpufreq_cpu_callback+0x54/0x5c)
[ 960.585000] [<c060dcd8>] (cpufreq_cpu_callback+0x54/0x5c) from [<c004a280>] (notifier_call_chain+0x44/0x84)
[ 960.595000] [<c004a280>] (notifier_call_chain+0x44/0x84) from [<c0027c94>] (__cpu_notify+0x2c/0x48)
[ 960.605000] [<c0027c94>] (__cpu_notify+0x2c/0x48) from [<c060b914>] (_cpu_up+0x10c/0x15c)
[ 960.615000] [<c060b914>] (_cpu_up+0x10c/0x15c) from [<c060b9c8>] (cpu_up+0x64/0x80)
[ 960.620000] [<c060b9c8>] (cpu_up+0x64/0x80) from [<c060a704>] (store_online+0x4c/0x74)
[ 960.630000] [<c060a704>] (store_online+0x4c/0x74) from [<c02b88b4>] (dev_attr_store+0x18/0x24)
[ 960.635000] [<c02b88b4>] (dev_attr_store+0x18/0x24) from [<c014f43c>] (sysfs_write_file+0xfc/0x164)
[ 960.645000] [<c014f43c>] (sysfs_write_file+0xfc/0x164) from [<c00f6e70>] (vfs_write+0xbc/0x184)
[ 960.655000] [<c00f6e70>] (vfs_write+0xbc/0x184) from [<c00f7208>] (SyS_write+0x40/0x68)
[ 960.665000] [<c00f7208>] (SyS_write+0x40/0x68) from [<c000ea40>] (ret_fast_syscall+0x0/0x30)
Tomek has also managed to get lockdep info for the problem:
[ 36.570161]
[ 36.570233] ======================================================
[ 36.576376] [ INFO: possible circular locking dependency detected ]
[ 36.582637] 3.10.0-rc5-00336-g6905065-dirty #1069 Not tainted
[ 36.588355] -------------------------------------------------------
[ 36.594608] bash/2783 is trying to acquire lock:
[ 36.599205] ((&(&j_cdbs->work)->work)){+.+.+.}, at: [<c0042138>] flush_work+0x0/0x27c
[ 36.607104]
[ 36.607104] but task is already holding lock:
[ 36.612917] (cpu_hotplug.lock){+.+.+.}, at: [<c0028cb0>] cpu_hotplug_begin+0x2c/0x58
[ 36.620731]
[ 36.620731] which lock already depends on the new lock.
[ 36.620731]
[ 36.628890]
[ 36.628890] the existing dependency chain (in reverse order) is:
[ 36.636355]
-> #2 (cpu_hotplug.lock){+.+.+.}:
[ 36.640867] [<c007c828>] lock_acquire+0x9c/0x130
[ 36.646075] [<c0643dc4>] mutex_lock_nested+0x68/0x3ec
[ 36.651716] [<c0028db0>] get_online_cpus+0x40/0x60
[ 36.657097] [<c03c2810>] gov_queue_work+0x1c/0xac
[ 36.662392] [<c03c18c4>] od_dbs_timer+0xb8/0x16c
[ 36.667600] [<c00415f8>] process_one_work+0x198/0x4d8
[ 36.673242] [<c0042998>] worker_thread+0x134/0x3e8
[ 36.678624] [<c0048ad8>] kthread+0xa8/0xb4
[ 36.683311] [<c000ee88>] ret_from_fork+0x14/0x2c
[ 36.688522]
-> #1 (&j_cdbs->timer_mutex){+.+.+.}:
[ 36.693380] [<c007c828>] lock_acquire+0x9c/0x130
[ 36.698588] [<c0643dc4>] mutex_lock_nested+0x68/0x3ec
[ 36.704229] [<c03c184c>] od_dbs_timer+0x40/0x16c
[ 36.709438] [<c00415f8>] process_one_work+0x198/0x4d8
[ 36.715080] [<c0042998>] worker_thread+0x134/0x3e8
[ 36.720461] [<c0048ad8>] kthread+0xa8/0xb4
[ 36.725148] [<c000ee88>] ret_from_fork+0x14/0x2c
[ 36.730360]
-> #0 ((&(&j_cdbs->work)->work)){+.+.+.}:
[ 36.735565] [<c007bb70>] __lock_acquire+0x17c0/0x1e64
[ 36.741206] [<c007c828>] lock_acquire+0x9c/0x130
[ 36.746414] [<c0042170>] flush_work+0x38/0x27c
[ 36.751449] [<c00424e8>] __cancel_work_timer+0x84/0x120
[ 36.757265] [<c03c3040>] cpufreq_governor_dbs+0x4f4/0x674
[ 36.763254] [<c03bdeac>] __cpufreq_governor+0x60/0x120
[ 36.768983] [<c03bfaa0>] __cpufreq_remove_dev.clone.4+0x7c/0x480
[ 36.775579] [<c06421d0>] cpufreq_cpu_callback+0x48/0x5c
[ 36.781396] [<c004e688>] notifier_call_chain+0x44/0x84
[ 36.787124] [<c0028c30>] __cpu_notify+0x2c/0x48
[ 36.792245] [<c063de9c>] _cpu_down+0xb4/0x238
[ 36.797193] [<c063e048>] cpu_down+0x28/0x3c
[ 36.801966] [<c063e9c0>] store_online+0x30/0x74
[ 36.807088] [<c02c9bbc>] dev_attr_store+0x18/0x24
[ 36.812382] [<c0166e08>] sysfs_write_file+0x104/0x18c
[ 36.818025] [<c010b704>] vfs_write+0xc8/0x194
[ 36.822973] [<c010bac0>] SyS_write+0x44/0x70
[ 36.827833] [<c000edc0>] ret_fast_syscall+0x0/0x48
[ 36.833218]
[ 36.833218] other info that might help us debug this:
[ 36.833218]
[ 36.841203] Chain exists of:
(&(&j_cdbs->work)->work) --> &j_cdbs->timer_mutex --> cpu_hotplug.lock
[ 36.850663] Possible unsafe locking scenario:
[ 36.850663]
[ 36.856565]        CPU0                    CPU1
[ 36.861079]        ----                    ----
[ 36.865591]   lock(cpu_hotplug.lock);
[ 36.869237]                                lock(&j_cdbs->timer_mutex);
[ 36.875746]                                lock(cpu_hotplug.lock);
[ 36.881909]   lock((&(&j_cdbs->work)->work));
[ 36.886250]
[ 36.886250] *** DEADLOCK ***
[ 36.886250]
[ 36.892163] 5 locks held by bash/2783:
[ 36.895886] #0: (sb_writers#7){.+.+.+}, at: [<c010b7cc>] vfs_write+0x190/0x194
[ 36.903264] #1: (&buffer->mutex){+.+.+.}, at: [<c0166d2c>] sysfs_write_file+0x28/0x18c
[ 36.911336] #2: (s_active#50){.+.+..}, at: [<c0166de4>] sysfs_write_file+0xe0/0x18c
[ 36.919148] #3: (cpu_add_remove_lock){+.+.+.}, at: [<c063e02c>] cpu_down+0xc/0x3c
[ 36.926787] #4: (cpu_hotplug.lock){+.+.+.}, at: [<c0028cb0>] cpu_hotplug_begin+0x2c/0x58
[ 36.935032]
[ 36.935032] stack backtrace:
[ 36.939394] CPU: 1 PID: 2783 Comm: bash Not tainted 3.10.0-rc5-00336-g6905065-dirty #1069
[ 36.947591] [<c0015ed0>] (unwind_backtrace+0x0/0x13c) from [<c0012e58>] (show_stack+0x10/0x14)
[ 36.956164] [<c0012e58>] (show_stack+0x10/0x14) from [<c0078a9c>] (print_circular_bug+0x1c8/0x304)
[ 36.965105] [<c0078a9c>] (print_circular_bug+0x1c8/0x304) from [<c007bb70>] (__lock_acquire+0x17c0/0x1e64)
[ 36.974735] [<c007bb70>] (__lock_acquire+0x17c0/0x1e64) from [<c007c828>] (lock_acquire+0x9c/0x130)
[ 36.983767] [<c007c828>] (lock_acquire+0x9c/0x130) from [<c0042170>] (flush_work+0x38/0x27c)
[ 36.992187] [<c0042170>] (flush_work+0x38/0x27c) from [<c00424e8>] (__cancel_work_timer+0x84/0x120)
[ 37.001225] [<c00424e8>] (__cancel_work_timer+0x84/0x120) from [<c03c3040>] (cpufreq_governor_dbs+0x4f4/0x674)
[ 37.011196] [<c03c3040>] (cpufreq_governor_dbs+0x4f4/0x674) from [<c03bdeac>] (__cpufreq_governor+0x60/0x120)
[ 37.021087] [<c03bdeac>] (__cpufreq_governor+0x60/0x120) from [<c03bfaa0>] (__cpufreq_remove_dev.clone.4+0x7c/0x480)
[ 37.031607] [<c03bfaa0>] (__cpufreq_remove_dev.clone.4+0x7c/0x480) from [<c06421d0>] (cpufreq_cpu_callback+0x48/0x5c)
[ 37.042207] [<c06421d0>] (cpufreq_cpu_callback+0x48/0x5c) from [<c004e688>] (notifier_call_chain+0x44/0x84)
[ 37.051910] [<c004e688>] (notifier_call_chain+0x44/0x84) from [<c0028c30>] (__cpu_notify+0x2c/0x48)
[ 37.060931] [<c0028c30>] (__cpu_notify+0x2c/0x48) from [<c063de9c>] (_cpu_down+0xb4/0x238)
[ 37.069177] [<c063de9c>] (_cpu_down+0xb4/0x238) from [<c063e048>] (cpu_down+0x28/0x3c)
[ 37.077074] [<c063e048>] (cpu_down+0x28/0x3c) from [<c063e9c0>] (store_online+0x30/0x74)
[ 37.085147] [<c063e9c0>] (store_online+0x30/0x74) from [<c02c9bbc>] (dev_attr_store+0x18/0x24)
[ 37.093740] [<c02c9bbc>] (dev_attr_store+0x18/0x24) from [<c0166e08>] (sysfs_write_file+0x104/0x18c)
[ 37.102852] [<c0166e08>] (sysfs_write_file+0x104/0x18c) from [<c010b704>] (vfs_write+0xc8/0x194)
[ 37.111616] [<c010b704>] (vfs_write+0xc8/0x194) from [<c010bac0>] (SyS_write+0x44/0x70)
[ 37.119613] [<c010bac0>] (SyS_write+0x44/0x70) from [<c000edc0>] (ret_fast_syscall+0x0/0x48)
Reproducing the issue is very easy, e.g.:
# echo 0 > /sys/devices/system/cpu/cpu3/online
# echo 0 > /sys/devices/system/cpu/cpu2/online
# echo 0 > /sys/devices/system/cpu/cpu1/online
# while true;do echo 1 > /sys/devices/system/cpu/cpu1/online;echo 0 > /sys/devices/system/cpu/cpu1/online;done
The commit in question (2f7021a8) was merged in v3.10-rc5 as a fix for
commit 031299b ("cpufreq: governors: Avoid unnecessary per cpu timer
interrupts") which was causing a kernel warning to show up.
Michael/Viresh: do you have some idea how to fix the issue?
Best regards,
--
Bartlomiej Zolnierkiewicz
Samsung R&D Institute Poland
Samsung Electronics
Hi, Bartlomiej
On 07/08/2013 11:26 PM, Bartlomiej Zolnierkiewicz wrote:
[snip]
>
> # echo 0 > /sys/devices/system/cpu/cpu3/online
> # echo 0 > /sys/devices/system/cpu/cpu2/online
> # echo 0 > /sys/devices/system/cpu/cpu1/online
> # while true;do echo 1 > /sys/devices/system/cpu/cpu1/online;echo 0 > /sys/devices/system/cpu/cpu1/online;done
>
> The commit in question (2f7021a8) was merged in v3.10-rc5 as a fix for
> commit 031299b ("cpufreq: governors: Avoid unnecessary per cpu timer
> interrupts") which was causing a kernel warning to show up.
>
> Michael/Viresh: do you have some idea how to fix the issue?
Thanks for the report :) Would you like to try the patch below
and see whether it solves the issue?
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 5af40ad..aa05eaa 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -229,6 +229,8 @@ static void set_sampling_rate(struct dbs_data *dbs_data,
}
}
+static struct lock_class_key j_cdbs_key;
+
int cpufreq_governor_dbs(struct cpufreq_policy *policy,
struct common_dbs_data *cdata, unsigned int event)
{
@@ -366,6 +368,8 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
kcpustat_cpu(j).cpustat[CPUTIME_NICE];
mutex_init(&j_cdbs->timer_mutex);
+ lockdep_set_class(&j_cdbs->timer_mutex, &j_cdbs_key);
+
INIT_DEFERRABLE_WORK(&j_cdbs->work,
dbs_data->cdata->gov_dbs_timer);
}
Regards,
Michael Wang
>
> Best regards,
> --
> Bartlomiej Zolnierkiewicz
> Samsung R&D Institute Poland
> Samsung Electronics
Hi,
On Tuesday, July 09, 2013 10:15:43 AM Michael Wang wrote:
> Hi, Bartlomiej
>
> On 07/08/2013 11:26 PM, Bartlomiej Zolnierkiewicz wrote:
> [snip]
> >
> > # echo 0 > /sys/devices/system/cpu/cpu3/online
> > # echo 0 > /sys/devices/system/cpu/cpu2/online
> > # echo 0 > /sys/devices/system/cpu/cpu1/online
> > # while true;do echo 1 > /sys/devices/system/cpu/cpu1/online;echo 0 > /sys/devices/system/cpu/cpu1/online;done
> >
> > The commit in question (2f7021a8) was merged in v3.10-rc5 as a fix for
> > commit 031299b ("cpufreq: governors: Avoid unnecessary per cpu timer
> > interrupts") which was causing a kernel warning to show up.
> >
> > Michael/Viresh: do you have some idea how to fix the issue?
>
> Thanks for the report :) Would you like to try the patch below
> and see whether it solves the issue?
It doesn't help and unfortunately it just can't help, as it only
addresses lockdep functionality while the issue is not a lockdep
problem but a genuine locking problem. CPU hot-unplug invokes
_cpu_down(), which calls cpu_hotplug_begin(), which in turn takes
&cpu_hotplug.lock. The lock is then held during the __cpu_notify()
call. The notifier chain goes up to cpufreq_governor_dbs(), which
for the CPUFREQ_GOV_STOP event does gov_cancel_work(). This
function flushes the pending work and waits for it to finish. All
of the above happens in one kernel thread. At the same time,
another kernel thread is doing the work we are waiting for, and it
also happens to do gov_queue_work(), which calls get_online_cpus().
The code then tries to take &cpu_hotplug.lock, which is already
held by the first thread, and deadlocks.
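To illustrate, here is a minimal userspace analogue of the deadlock
(just a sketch using pthreads, not the actual kernel code; the
comments map each step to its kernel counterpart):

/* build with: gcc -pthread deadlock.c */
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

/* plays the governor work item (od_dbs_timer -> gov_queue_work) */
static void *work_fn(void *arg)
{
	(void)arg;
	sleep(1);			     /* let main() take the lock first */
	pthread_mutex_lock(&hotplug_lock);   /* get_online_cpus(): blocks forever */
	pthread_mutex_unlock(&hotplug_lock); /* put_online_cpus(): never reached */
	return NULL;
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, work_fn, NULL);

	pthread_mutex_lock(&hotplug_lock);   /* cpu_hotplug_begin() */
	pthread_join(worker, NULL);	     /* gov_cancel_work() -> flush_work(): deadlocks */
	pthread_mutex_unlock(&hotplug_lock);
	return 0;
}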
Best regards,
--
Bartlomiej Zolnierkiewicz
Samsung R&D Institute Poland
Samsung Electronics
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 5af40ad..aa05eaa 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -229,6 +229,8 @@ static void set_sampling_rate(struct dbs_data *dbs_data,
> }
> }
>
> +static struct lock_class_key j_cdbs_key;
> +
> int cpufreq_governor_dbs(struct cpufreq_policy *policy,
> struct common_dbs_data *cdata, unsigned int event)
> {
> @@ -366,6 +368,8 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
> kcpustat_cpu(j).cpustat[CPUTIME_NICE];
>
> mutex_init(&j_cdbs->timer_mutex);
> + lockdep_set_class(&j_cdbs->timer_mutex, &j_cdbs_key);
> +
> INIT_DEFERRABLE_WORK(&j_cdbs->work,
> dbs_data->cdata->gov_dbs_timer);
> }
>
> Regards,
> Michael Wang
On 07/09/2013 05:21 PM, Bartlomiej Zolnierkiewicz wrote:
>
> Hi,
>
> On Tuesday, July 09, 2013 10:15:43 AM Michael Wang wrote:
>> Hi, Bartlomiej
>>
>> On 07/08/2013 11:26 PM, Bartlomiej Zolnierkiewicz wrote:
>> [snip]
>>>
>>> # echo 0 > /sys/devices/system/cpu/cpu3/online
>>> # echo 0 > /sys/devices/system/cpu/cpu2/online
>>> # echo 0 > /sys/devices/system/cpu/cpu1/online
>>> # while true;do echo 1 > /sys/devices/system/cpu/cpu1/online;echo 0 > /sys/devices/system/cpu/cpu1/online;done
>>>
>>> The commit in question (2f7021a8) was merged in v3.10-rc5 as a fix for
>>> commit 031299b ("cpufreq: governors: Avoid unnecessary per cpu timer
>>> interrupts") which was causing a kernel warning to show up.
>>>
>>> Michael/Viresh: do you have some idea how to fix the issue?
>>
>> Thanks for the report :) Would you like to try the patch below
>> and see whether it solves the issue?
>
> It doesn't help and unfortunately it just can't help, as it only
> addresses lockdep functionality while the issue is not a lockdep
> problem but a genuine locking problem. CPU hot-unplug invokes
> _cpu_down(), which calls cpu_hotplug_begin(), which in turn takes
> &cpu_hotplug.lock. The lock is then held during the __cpu_notify()
> call. The notifier chain goes up to cpufreq_governor_dbs(), which
> for the CPUFREQ_GOV_STOP event does gov_cancel_work(). This
> function flushes the pending work and waits for it to finish. All
> of the above happens in one kernel thread. At the same time,
> another kernel thread is doing the work we are waiting for, and it
> also happens to do gov_queue_work(), which calls get_online_cpus().
> The code then tries to take &cpu_hotplug.lock, which is already
> held by the first thread, and deadlocks.
>
Yeah, exactly!
So I had proposed doing an asynchronous cancel-work or doing the
synchronous cancel-work in the CPU_POST_DEAD phase, where the
cpu_hotplug.lock is not held. See this thread:
http://marc.info/?l=linux-kernel&m=137241212616799&w=2
http://marc.info/?l=linux-pm&m=137242906622537&w=2
But now that I look at commit 2f7021a8 again, I still think we should
revert it and fix the _actual_ root-cause of the bug.
The cpufreq subsystem has enough synchronization to ensure that
policy->cpus always contains online CPUs. And it also has the concept
of cancelling queued work items *before* that CPU is taken offline.
So, where is the chance that we try to queue work items on offline CPUs?
To answer that question, I was looking at the cpufreq code yesterday
and found something very interesting: the gov_cancel_work() that is
invoked before a CPU goes offline can actually end up effectively
*NOT* cancelling the queued work item!
The reason is that the per-cpu work items are not just self-queueing
(if that were the case, gov_cancel_work() would have been successful
without any doubt); instead, they can also queue work items on *other*
CPUs!
Example from ondemand governor's per-cpu work item:
static void od_dbs_timer(struct work_struct *work)
{
...
bool modify_all = true;
...
gov_queue_work(dbs_data, dbs_info->cdbs.cur_policy, delay, modify_all);
}
So, every per-cpu work item can re-queue the work item on *many other*
CPUs, and not just itself!
So that leads to a race which makes gov_cancel_work() ineffective.
The call to cancel_delayed_work_sync() will cancel all pending work items
on, say, CPU3 (which is going down), but immediately after that, say,
CPU4's work item fires and queues the work item on CPU4 as well as CPU3.
Thus, gov_cancel_work() _effectively_ didn't do anything useful.
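In other words, a possible interleaving looks like this (the CPU
numbers are just for illustration):

  task taking CPU3 offline                 CPU4's workqueue worker
  ------------------------                 -----------------------
  gov_cancel_work():
    cancel_delayed_work_sync(CPU3's work)
                                           od_dbs_timer() fires:
                                             gov_queue_work(..., all_cpus == true)
                                               __gov_queue_work(CPU3)  <-- re-queued!
                                               __gov_queue_work(CPU4)
    cancel_delayed_work_sync(CPU4's work)
      (cancels CPU4's copy, but the work
       re-queued on CPU3 survives)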
But this still doesn't immediately explain how we can end up trying to
queue work items on offline CPUs (since policy->cpus is supposed to always
contain online cpus only, and this does look correct in the code as well,
at a first glance). But I just wanted to share this finding, in case it
helps us find out the real root-cause.
Also, you might perhaps want to try the (untested) patch shown below, and
see if it resolves your problem. It basically makes work-items requeue
themselves on only their respective CPUs and not others, so that
gov_cancel_work succeeds in its mission. However, I guess the patch is
wrong from a cpufreq perspective, in case cpufreq really depends on the
"requeue-work-on-everybody" model.
Regards,
Srivatsa S. Bhat
------------------------------------------------------------------------
drivers/cpufreq/cpufreq_conservative.c | 2 +-
drivers/cpufreq/cpufreq_governor.c | 2 --
drivers/cpufreq/cpufreq_ondemand.c | 2 +-
3 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 0ceb2ef..bbfc1dd 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -120,7 +120,7 @@ static void cs_dbs_timer(struct work_struct *work)
struct dbs_data *dbs_data = dbs_info->cdbs.cur_policy->governor_data;
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
int delay = delay_for_sampling_rate(cs_tuners->sampling_rate);
- bool modify_all = true;
+ bool modify_all = false;
mutex_lock(&core_dbs_info->cdbs.timer_mutex);
if (!need_load_eval(&core_dbs_info->cdbs, cs_tuners->sampling_rate))
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 4645876..ec4baeb 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -137,10 +137,8 @@ void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
if (!all_cpus) {
__gov_queue_work(smp_processor_id(), dbs_data, delay);
} else {
- get_online_cpus();
for_each_cpu(i, policy->cpus)
__gov_queue_work(i, dbs_data, delay);
- put_online_cpus();
}
}
EXPORT_SYMBOL_GPL(gov_queue_work);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 93eb5cb..241ebc0 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -230,7 +230,7 @@ static void od_dbs_timer(struct work_struct *work)
struct dbs_data *dbs_data = dbs_info->cdbs.cur_policy->governor_data;
struct od_dbs_tuners *od_tuners = dbs_data->tuners;
int delay = 0, sample_type = core_dbs_info->sample_type;
- bool modify_all = true;
+ bool modify_all = false;
mutex_lock(&core_dbs_info->cdbs.timer_mutex);
if (!need_load_eval(&core_dbs_info->cdbs, od_tuners->sampling_rate)) {
On 07/09/2013 07:51 PM, Bartlomiej Zolnierkiewicz wrote:
[snip]
>
> It doesn't help and unfortunately it just can't help, as it only
> addresses lockdep functionality while the issue is not a lockdep
> problem but a genuine locking problem. CPU hot-unplug invokes
> _cpu_down(), which calls cpu_hotplug_begin(), which in turn takes
> &cpu_hotplug.lock. The lock is then held during the __cpu_notify()
> call. The notifier chain goes up to cpufreq_governor_dbs(), which
> for the CPUFREQ_GOV_STOP event does gov_cancel_work(). This
> function flushes the pending work and waits for it to finish. All
> of the above happens in one kernel thread. At the same time,
> another kernel thread is doing the work we are waiting for, and it
> also happens to do gov_queue_work(), which calls get_online_cpus().
> The code then tries to take &cpu_hotplug.lock, which is already
> held by the first thread, and deadlocks.
Hmm... I think I get your point: some thread holds the lock and
flushes some work which also tries to hold the same lock, correct?
Ok, that's a problem, let's figure out a good way to solve it :)
Regards,
Michael Wang
>
> Best regards,
> --
> Bartlomiej Zolnierkiewicz
> Samsung R&D Institute Poland
> Samsung Electronics
>
>> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
>> index 5af40ad..aa05eaa 100644
>> --- a/drivers/cpufreq/cpufreq_governor.c
>> +++ b/drivers/cpufreq/cpufreq_governor.c
>> @@ -229,6 +229,8 @@ static void set_sampling_rate(struct dbs_data *dbs_data,
>> }
>> }
>>
>> +static struct lock_class_key j_cdbs_key;
>> +
>> int cpufreq_governor_dbs(struct cpufreq_policy *policy,
>> struct common_dbs_data *cdata, unsigned int event)
>> {
>> @@ -366,6 +368,8 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
>> kcpustat_cpu(j).cpustat[CPUTIME_NICE];
>>
>> mutex_init(&j_cdbs->timer_mutex);
>> + lockdep_set_class(&j_cdbs->timer_mutex, &j_cdbs_key);
>> +
>> INIT_DEFERRABLE_WORK(&j_cdbs->work,
>> dbs_data->cdata->gov_dbs_timer);
>> }
>>
>> Regards,
>> Michael Wang
>
On 07/09/2013 09:07 PM, Srivatsa S. Bhat wrote:
[snip]
>>
>
> Yeah, exactly!
>
> So I had proposed doing an asynchronous cancel-work or doing the
> synchronous cancel-work in the CPU_POST_DEAD phase, where the
> cpu_hotplug.lock is not held. See this thread:
>
> http://marc.info/?l=linux-kernel&m=137241212616799&w=2
> http://marc.info/?l=linux-pm&m=137242906622537&w=2
>
> But now that I look at commit 2f7021a8 again, I still think we should
> revert it and fix the _actual_ root-cause of the bug.
Agreed, or we could revert it along with a better fix; otherwise the
previous bug report will come back again...
>
> The cpufreq subsystem has enough synchronization to ensure that
> policy->cpus always contains online CPUs. And it also has the concept
> of cancelling queued work items *before* that CPU is taken offline.
> So, where is the chance that we try to queue work items on offline CPUs?
>
> To answer that question, I was looking at the cpufreq code yesterday
> and found something very interesting: the gov_cancel_work() that is
> invoked before a CPU goes offline can actually end up effectively
> *NOT* cancelling the queued work item!
>
> The reason is that the per-cpu work items are not just self-queueing
> (if that were the case, gov_cancel_work() would have been successful
> without any doubt); instead, they can also queue work items on *other*
> CPUs!
>
> Example from ondemand governor's per-cpu work item:
>
> static void od_dbs_timer(struct work_struct *work)
> {
> ...
> bool modify_all = true;
> ...
> gov_queue_work(dbs_data, dbs_info->cdbs.cur_policy, delay, modify_all);
> }
>
> So, every per-cpu work item can re-queue the work item on *many other*
> CPUs, and not just itself!
>
> So that leads to a race which makes gov_cancel_work() ineffective.
> The call to cancel_delayed_work_sync() will cancel all pending work items
> on, say, CPU3 (which is going down), but immediately after that, say,
> CPU4's work item fires and queues the work item on CPU4 as well as CPU3.
> Thus, gov_cancel_work() _effectively_ didn't do anything useful.
That's interesting; seems like we're a little closer to the root cause:
the timer is supposed to stop but fails to... I need to do some
investigation here...
Regards,
Michael Wang
>
> But this still doesn't immediately explain how we can end up trying to
> queue work items on offline CPUs (since policy->cpus is supposed to always
> contain online cpus only, and this does look correct in the code as well,
> at a first glance). But I just wanted to share this finding, in case it
> helps us find out the real root-cause.
>
> Also, you might perhaps want to try the (untested) patch shown below, and
> see if it resolves your problem. It basically makes work-items requeue
> themselves on only their respective CPUs and not others, so that
> gov_cancel_work succeeds in its mission. However, I guess the patch is
> wrong from a cpufreq perspective, in case cpufreq really depends on the
> "requeue-work-on-everybody" model.
>
> Regards,
> Srivatsa S. Bhat
>
> ------------------------------------------------------------------------
>
> drivers/cpufreq/cpufreq_conservative.c | 2 +-
> drivers/cpufreq/cpufreq_governor.c | 2 --
> drivers/cpufreq/cpufreq_ondemand.c | 2 +-
> 3 files changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
> index 0ceb2ef..bbfc1dd 100644
> --- a/drivers/cpufreq/cpufreq_conservative.c
> +++ b/drivers/cpufreq/cpufreq_conservative.c
> @@ -120,7 +120,7 @@ static void cs_dbs_timer(struct work_struct *work)
> struct dbs_data *dbs_data = dbs_info->cdbs.cur_policy->governor_data;
> struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
> int delay = delay_for_sampling_rate(cs_tuners->sampling_rate);
> - bool modify_all = true;
> + bool modify_all = false;
>
> mutex_lock(&core_dbs_info->cdbs.timer_mutex);
> if (!need_load_eval(&core_dbs_info->cdbs, cs_tuners->sampling_rate))
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 4645876..ec4baeb 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -137,10 +137,8 @@ void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
> if (!all_cpus) {
> __gov_queue_work(smp_processor_id(), dbs_data, delay);
> } else {
> - get_online_cpus();
> for_each_cpu(i, policy->cpus)
> __gov_queue_work(i, dbs_data, delay);
> - put_online_cpus();
> }
> }
> EXPORT_SYMBOL_GPL(gov_queue_work);
> diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
> index 93eb5cb..241ebc0 100644
> --- a/drivers/cpufreq/cpufreq_ondemand.c
> +++ b/drivers/cpufreq/cpufreq_ondemand.c
> @@ -230,7 +230,7 @@ static void od_dbs_timer(struct work_struct *work)
> struct dbs_data *dbs_data = dbs_info->cdbs.cur_policy->governor_data;
> struct od_dbs_tuners *od_tuners = dbs_data->tuners;
> int delay = 0, sample_type = core_dbs_info->sample_type;
> - bool modify_all = true;
> + bool modify_all = false;
>
> mutex_lock(&core_dbs_info->cdbs.timer_mutex);
> if (!need_load_eval(&core_dbs_info->cdbs, od_tuners->sampling_rate)) {
>
>
>
>
On 07/09/2013 09:07 PM, Srivatsa S. Bhat wrote:
[snip]
>
> But this still doesn't immediately explain how we can end up trying to
> queue work items on offline CPUs (since policy->cpus is supposed to always
> contain online cpus only, and this does look correct in the code as well,
> at a first glance). But I just wanted to share this finding, in case it
> helps us find out the real root-cause.
The previous info shows that policy->cpus won't contain offline CPUs,
but after you get one CPU id from it, that CPU can go offline at any
time.
I'm not sure what is supposed to happen after the CPUFREQ_GOV_STOP
event is notified. If it is meant to stop the queued work and prevent
further work from being queued again, then it failed to do so, and we
need some method to stop work from being re-queued once
CPUFREQ_GOV_STOP has been notified, like some flag in the policy which
is checked before re-queueing work from within the work handler.
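Something like the below, as a rough sketch (the 'queue_stop' flag is
a made-up name for illustration):

	/* in gov_queue_work(), before queueing anything: */
	if (dbs_data->queue_stop)	/* set when CPUFREQ_GOV_STOP arrives */
		return;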
But if the event is just meant to sync the queued work, not to prevent
further work from happening, then things will become tough... we need
to confirm.
What's your opinion?
Regards,
Michael Wang
>
> Also, you might perhaps want to try the (untested) patch shown below, and
> see if it resolves your problem. It basically makes work-items requeue
> themselves on only their respective CPUs and not others, so that
> gov_cancel_work succeeds in its mission. However, I guess the patch is
> wrong from a cpufreq perspective, in case cpufreq really depends on the
> "requeue-work-on-everybody" model.
>
> Regards,
> Srivatsa S. Bhat
>
> ------------------------------------------------------------------------
>
> drivers/cpufreq/cpufreq_conservative.c | 2 +-
> drivers/cpufreq/cpufreq_governor.c | 2 --
> drivers/cpufreq/cpufreq_ondemand.c | 2 +-
> 3 files changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
> index 0ceb2ef..bbfc1dd 100644
> --- a/drivers/cpufreq/cpufreq_conservative.c
> +++ b/drivers/cpufreq/cpufreq_conservative.c
> @@ -120,7 +120,7 @@ static void cs_dbs_timer(struct work_struct *work)
> struct dbs_data *dbs_data = dbs_info->cdbs.cur_policy->governor_data;
> struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
> int delay = delay_for_sampling_rate(cs_tuners->sampling_rate);
> - bool modify_all = true;
> + bool modify_all = false;
>
> mutex_lock(&core_dbs_info->cdbs.timer_mutex);
> if (!need_load_eval(&core_dbs_info->cdbs, cs_tuners->sampling_rate))
> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
> index 4645876..ec4baeb 100644
> --- a/drivers/cpufreq/cpufreq_governor.c
> +++ b/drivers/cpufreq/cpufreq_governor.c
> @@ -137,10 +137,8 @@ void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
> if (!all_cpus) {
> __gov_queue_work(smp_processor_id(), dbs_data, delay);
> } else {
> - get_online_cpus();
> for_each_cpu(i, policy->cpus)
> __gov_queue_work(i, dbs_data, delay);
> - put_online_cpus();
> }
> }
> EXPORT_SYMBOL_GPL(gov_queue_work);
> diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
> index 93eb5cb..241ebc0 100644
> --- a/drivers/cpufreq/cpufreq_ondemand.c
> +++ b/drivers/cpufreq/cpufreq_ondemand.c
> @@ -230,7 +230,7 @@ static void od_dbs_timer(struct work_struct *work)
> struct dbs_data *dbs_data = dbs_info->cdbs.cur_policy->governor_data;
> struct od_dbs_tuners *od_tuners = dbs_data->tuners;
> int delay = 0, sample_type = core_dbs_info->sample_type;
> - bool modify_all = true;
> + bool modify_all = false;
>
> mutex_lock(&core_dbs_info->cdbs.timer_mutex);
> if (!need_load_eval(&core_dbs_info->cdbs, od_tuners->sampling_rate)) {
>
>
>
>
On 10 July 2013 09:42, Michael Wang <[email protected]> wrote:
> I'm not sure what is supposed to happen after the CPUFREQ_GOV_STOP
> event is notified. If it is meant to stop the queued work and prevent
> further work from being queued again, then it failed to do so, and we
> need some method to stop work from being re-queued once
> CPUFREQ_GOV_STOP has been notified, like some flag in the policy
> which is checked before re-queueing work from within the work handler.
>
> But if the event is just meant to sync the queued work, not to prevent
> further work from happening, then things will become tough... we need
> to confirm.
After GOV_STOP, until the time GOV_START is called, we shouldn't be
queuing any work.
On 07/10/2013 01:39 PM, Viresh Kumar wrote:
> On 10 July 2013 09:42, Michael Wang <[email protected]> wrote:
>> I'm not sure what is supposed to happen after the CPUFREQ_GOV_STOP
>> event is notified. If it is meant to stop the queued work and prevent
>> further work from being queued again, then it failed to do so, and we
>> need some method to stop work from being re-queued once
>> CPUFREQ_GOV_STOP has been notified, like some flag in the policy
>> which is checked before re-queueing work from within the work handler.
>>
>> But if the event is just meant to sync the queued work, not to prevent
>> further work from happening, then things will become tough... we need
>> to confirm.
>
> After GOV_STOP, until the time GOV_START is called, we shouldn't be
> queuing any work.
Thanks for the confirmation :) it seems the root cause is very likely
related to the problem Srivatsa discovered.
I think the fix in his mail is worth a try, but I need more
investigation to confirm that's the right way...
Regards,
Michael Wang
On 10 July 2013 11:34, Michael Wang <[email protected]> wrote:
> Thanks for the confirmation :) it seems the root cause is very likely
> related to the problem Srivatsa discovered.
>
> I think the fix in his mail is worth a try, but I need more
> investigation to confirm that's the right way...
It's not really a fix, but with it we can at least confirm that we are
on the right path and that the problem is exactly what he described.
On 07/10/2013 10:40 AM, Michael Wang wrote:
> On 07/09/2013 07:51 PM, Bartlomiej Zolnierkiewicz wrote:
> [snip]
>>
>> It doesn't help and unfortunately it just can't help, as it only
>> addresses lockdep functionality while the issue is not a lockdep
>> problem but a genuine locking problem. CPU hot-unplug invokes
>> _cpu_down(), which calls cpu_hotplug_begin(), which in turn takes
>> &cpu_hotplug.lock. The lock is then held during the __cpu_notify()
>> call. The notifier chain goes up to cpufreq_governor_dbs(), which
>> for the CPUFREQ_GOV_STOP event does gov_cancel_work(). This
>> function flushes the pending work and waits for it to finish. All
>> of the above happens in one kernel thread. At the same time,
>> another kernel thread is doing the work we are waiting for, and it
>> also happens to do gov_queue_work(), which calls get_online_cpus().
>> The code then tries to take &cpu_hotplug.lock, which is already
>> held by the first thread, and deadlocks.
>
> Hmm... I think I get your point: some thread holds the lock and
> flushes some work which also tries to hold the same lock, correct?
>
> Ok, that's a problem, let's figure out a good way to solve it :)
And besides the patch from Srivatsa, I also suggest the fix below. It
tries to really stop all the work during the down notification; I'd
like to know what you think about it ;-)
Regards,
Michael Wang
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index dc9b72e..a64b544 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -178,13 +178,14 @@ void gov_queue_work(struct dbs_data *dbs_data, struct cpufreq_policy *policy,
{
int i;
+ if (dbs_data->queue_stop)
+ return;
+
if (!all_cpus) {
__gov_queue_work(smp_processor_id(), dbs_data, delay);
} else {
- get_online_cpus();
for_each_cpu(i, policy->cpus)
__gov_queue_work(i, dbs_data, delay);
- put_online_cpus();
}
}
EXPORT_SYMBOL_GPL(gov_queue_work);
@@ -193,12 +194,27 @@ static inline void gov_cancel_work(struct dbs_data *dbs_data,
struct cpufreq_policy *policy)
{
struct cpu_dbs_common_info *cdbs;
- int i;
+ int i, round = 2;
+ dbs_data->queue_stop = 1;
+redo:
+ round--;
for_each_cpu(i, policy->cpus) {
cdbs = dbs_data->cdata->get_cpu_cdbs(i);
cancel_delayed_work_sync(&cdbs->work);
}
+
+ /*
+ * Since there is no lock to prevent re-queueing the
+ * cancelled work, some early-cancelled work might
+ * have been queued again by later-cancelled work.
+ *
+ * Flush the work again with dbs_data->queue_stop
+ * enabled, this time there will be no survivors.
+ */
+ if (round)
+ goto redo;
+ dbs_data->queue_stop = 0;
}
/* Will return if we need to evaluate cpu load again or not */
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index e16a961..9116135 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -213,6 +213,7 @@ struct dbs_data {
unsigned int min_sampling_rate;
int usage_count;
void *tuners;
+ int queue_stop;
/* dbs_mutex protects dbs_enable in governor start/stop */
struct mutex mutex;
>
> Regards,
> Michael Wang
>
>
>
>
>>
>> Best regards,
>> --
>> Bartlomiej Zolnierkiewicz
>> Samsung R&D Institute Poland
>> Samsung Electronics
>>
>>> diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
>>> index 5af40ad..aa05eaa 100644
>>> --- a/drivers/cpufreq/cpufreq_governor.c
>>> +++ b/drivers/cpufreq/cpufreq_governor.c
>>> @@ -229,6 +229,8 @@ static void set_sampling_rate(struct dbs_data *dbs_data,
>>> }
>>> }
>>>
>>> +static struct lock_class_key j_cdbs_key;
>>> +
>>> int cpufreq_governor_dbs(struct cpufreq_policy *policy,
>>> struct common_dbs_data *cdata, unsigned int event)
>>> {
>>> @@ -366,6 +368,8 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
>>> kcpustat_cpu(j).cpustat[CPUTIME_NICE];
>>>
>>> mutex_init(&j_cdbs->timer_mutex);
>>> + lockdep_set_class(&j_cdbs->timer_mutex, &j_cdbs_key);
>>> +
>>> INIT_DEFERRABLE_WORK(&j_cdbs->work,
>>> dbs_data->cdata->gov_dbs_timer);
>>> }
>>>
>>> Regards,
>>> Michael Wang
>>
>