On 11/28/2017 05:05 PM, Prateek Sood wrote:
> CPU1
> cpus_read_lock+0x3e/0x80
> static_key_slow_inc+0xe/0xa0
> cpuset_css_online+0x62/0x330
> online_css+0x26/0x80
> cgroup_apply_control_enable+0x266/0x3d0
> cgroup_mkdir+0x37d/0x4f0
> kernfs_iop_mkdir+0x53/0x80
> vfs_mkdir+0x10e/0x1a0
> SyS_mkdirat+0xb3/0xe0
> entry_SYSCALL_64_fastpath+0x23/0x9a
>
> CPU0
> lock_acquire+0xec/0x1e0
> __mutex_lock+0x89/0x920
> cpuset_write_resmask+0x61/0x1100
> cgroup_file_write+0x7b/0x200
> kernfs_fop_write+0x112/0x1a0
> __vfs_write+0x23/0x150
> vfs_write+0xc8/0x1c0
> SyS_write+0x45/0xa0
> entry_SYSCALL_64_fastpath+0x23/0x9a
>
> CPU0                         CPU1
> ----                         ----
> lock(cpu_hotplug_lock.rw_sem);
>                              lock(cpuset_mutex);
>                              lock(cpu_hotplug_lock.rw_sem);
> lock(cpuset_mutex);
>
> Change locking order of cpu_hotplug_lock.rw_sem and
> cpuset_mutex in cpuset_css_online(). Use _cpuslocked()
> version for static_branch_inc/static_branch_dec in
> cpuset_inc()/cpuset_dec().
>
> Signed-off-by: Prateek Sood <[email protected]>
> ---
> include/linux/cpuset.h | 8 ++++----
> include/linux/jump_label.h | 10 ++++++++--
> kernel/cgroup/cpuset.c | 4 ++--
> kernel/jump_label.c | 13 +++++++++++++
> 4 files changed, 27 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index 2ab910f..5aadc25 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)
>
> static inline void cpuset_inc(void)
> {
> - static_branch_inc(&cpusets_pre_enable_key);
> - static_branch_inc(&cpusets_enabled_key);
> + static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
> + static_branch_inc_cpuslocked(&cpusets_enabled_key);
> }
>
> static inline void cpuset_dec(void)
> {
> - static_branch_dec(&cpusets_enabled_key);
> - static_branch_dec(&cpusets_pre_enable_key);
> + static_branch_dec_cpuslocked(&cpusets_enabled_key);
> + static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
> }
>
> extern int cpuset_init(void);
> diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
> index c7b368c..890e21c 100644
> --- a/include/linux/jump_label.h
> +++ b/include/linux/jump_label.h
> @@ -160,6 +160,8 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry,
> extern int jump_label_text_reserved(void *start, void *end);
> extern void static_key_slow_inc(struct static_key *key);
> extern void static_key_slow_dec(struct static_key *key);
> +extern void static_key_slow_incr_cpuslocked(struct static_key *key);
> +extern void static_key_slow_decr_cpuslocked(struct static_key *key);
> extern void jump_label_apply_nops(struct module *mod);
> extern int static_key_count(struct static_key *key);
> extern void static_key_enable(struct static_key *key);
> @@ -259,6 +261,8 @@ static inline void static_key_disable(struct static_key *key)
>
> #define static_key_enable_cpuslocked(k) static_key_enable((k))
> #define static_key_disable_cpuslocked(k) static_key_disable((k))
> +#define static_key_slow_incr_cpuslocked(k) static_key_slow_inc((k))
> +#define static_key_slow_decr_cpuslocked(k) static_key_slow_dec((k))
>
> #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) }
> #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) }
> @@ -414,8 +418,10 @@ struct static_key_false {
> * Advanced usage; refcount, branch is enabled when: count != 0
> */
>
> -#define static_branch_inc(x) static_key_slow_inc(&(x)->key)
> -#define static_branch_dec(x) static_key_slow_dec(&(x)->key)
> +#define static_branch_inc(x) static_key_slow_inc(&(x)->key)
> +#define static_branch_dec(x) static_key_slow_dec(&(x)->key)
> +#define static_branch_inc_cpuslocked(x) static_key_slow_incr_cpuslocked(&(x)->key)
> +#define static_branch_dec_cpuslocked(x) static_key_slow_decr_cpuslocked(&(x)->key)
>
> /*
> * Normal usage; boolean enable/disable.
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 227bc25..4ad8bae 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -1985,7 +1985,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
> if (!parent)
> return 0;
>
> - mutex_lock(&cpuset_mutex);
> + cpuset_sched_change_begin();
>
> set_bit(CS_ONLINE, &cs->flags);
> if (is_spread_page(parent))
> @@ -2034,7 +2034,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
> cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
> spin_unlock_irq(&callback_lock);
> out_unlock:
> - mutex_unlock(&cpuset_mutex);
> + cpuset_sched_change_end();
> return 0;
> }
>
> diff --git a/kernel/jump_label.c b/kernel/jump_label.c
> index 8594d24..dde0eaa 100644
> --- a/kernel/jump_label.c
> +++ b/kernel/jump_label.c
> @@ -126,6 +126,12 @@ void static_key_slow_inc(struct static_key *key)
> }
> EXPORT_SYMBOL_GPL(static_key_slow_inc);
>
> +void static_key_slow_incr_cpuslocked(struct static_key *key)
> +{
> + static_key_slow_inc_cpuslocked(key);
> +}
> +EXPORT_SYMBOL_GPL(static_key_slow_incr_cpuslocked);
> +
> void static_key_enable_cpuslocked(struct static_key *key)
> {
> STATIC_KEY_CHECK_USE(key);
> @@ -229,6 +235,13 @@ void static_key_slow_dec(struct static_key *key)
> }
> EXPORT_SYMBOL_GPL(static_key_slow_dec);
>
> +void static_key_slow_decr_cpuslocked(struct static_key *key)
> +{
> + STATIC_KEY_CHECK_USE(key);
> + static_key_slow_dec_cpuslocked(key, 0, NULL);
> +}
> +EXPORT_SYMBOL_GPL(static_key_slow_decr_cpuslocked);
> +
> void static_key_slow_dec_deferred(struct static_key_deferred *key)
> {
> STATIC_KEY_CHECK_USE(key);
>
TJ,
Any feedback/suggestion for this patch?
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
Hello,
On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote:
> Any feedback/suggestion for this patch?
Sorry about the delay. I'm a bit worried because it feels like we're
chasing a squirrel. I'll think through the recent changes and this
one and get back to you.
Thanks.
--
tejun
Hello, again.
On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote:
> Hello,
>
> On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote:
> > Any feedback/suggestion for this patch?
>
> Sorry about the delay. I'm a bit worried because it feels like we're
> chasing a squirrel. I'll think through the recent changes and this
> one and get back to you.
Can you please take a look at the following pending commit?
https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
AFAICS, this should remove the circular dependency you originally
reported. I'll revert the two cpuset commits for now.
Thanks.
--
tejun
On Mon, Dec 04, 2017 at 02:58:25PM -0800, Tejun Heo wrote:
> Hello, again.
>
> On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote:
> > Hello,
> >
> > On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote:
> > > Any feedback/suggestion for this patch?
> >
> > Sorry about the delay. I'm a bit worried because it feels like we're
> > chasing a squirrel. I'll think through the recent changes and this
> > one and get back to you.
>
> Can you please take a look at the following pending commit?
>
> https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
>
> AFAICS, this should remove the circular dependency you originally
> reported. I'll revert the two cpuset commits for now.
So I liked his patches in that we would be able to go back to
synchronous sched_domain building.
On 12/05/2017 04:31 AM, Peter Zijlstra wrote:
> On Mon, Dec 04, 2017 at 02:58:25PM -0800, Tejun Heo wrote:
>> Hello, again.
>>
>> On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote:
>>> Hello,
>>>
>>> On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote:
>>>> Any feedback/suggestion for this patch?
>>>
>>> Sorry about the delay. I'm a bit worried because it feels like we're
>>> chasing a squirrel. I'll think through the recent changes and this
>>> one and get back to you.
>>
>> Can you please take a look at the following pending commit?
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
>>
>> AFAICS, this should remove the circular dependency you originally
>> reported. I'll revert the two cpuset commits for now.
>
> So I liked his patches in that we would be able to go back to
> synchronous sched_domain building.
>
https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
This will fix the original circular locking dependency issue.
I will let you both (Peter & TJ) decide which one to
pick.
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
On 12/08/2017 03:10 PM, Prateek Sood wrote:
> On 12/05/2017 04:31 AM, Peter Zijlstra wrote:
>> On Mon, Dec 04, 2017 at 02:58:25PM -0800, Tejun Heo wrote:
>>> Hello, again.
>>>
>>> On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote:
>>>> Hello,
>>>>
>>>> On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote:
>>>>> Any feedback/suggestion for this patch?
>>>>
>>>> Sorry about the delay. I'm a bit worried because it feels like we're
>>>> chasing a squirrel. I'll think through the recent changes and this
>>>> one and get back to you.
>>>
>>> Can you please take a look at the following pending commit?
>>>
>>> https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
>>>
>>> AFAICS, this should remove the circular dependency you originally
>>> reported. I'll revert the two cpuset commits for now.
>>
>> So I liked his patches in that we would be able to go back to
>> synchronous sched_domain building.
>>
> https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
>
> This will fix the original circular locking dependency issue.
> I will let you both (Peter & TJ) to decide on which one to
> pick.
>
>
>
TJ & Peter,
There is one deadlock issue during cgroup migration from cpu
hotplug path when a task T is being moved from source to
destination cgroup.
kworker/0:0
cpuset_hotplug_workfn()
cpuset_hotplug_update_tasks()
hotplug_update_tasks_legacy()
remove_tasks_in_empty_cpuset()
cgroup_transfer_tasks() // stuck in iterator loop
cgroup_migrate()
cgroup_migrate_add_task()
In cgroup_migrate_add_task() it checks the PF_EXITING flag of task T.
Task T will not migrate to the destination cgroup. css_task_iter_start()
will keep pointing to task T in the loop, waiting for task T's cg_list
node to be removed.
Task T
do_exit()
exit_signals() // sets PF_EXITING
exit_task_namespaces()
switch_task_namespaces()
free_nsproxy()
put_mnt_ns()
drop_collected_mounts()
namespace_unlock()
synchronize_rcu()
_synchronize_rcu_expedited()
schedule_work() // on cpu0 low priority worker pool
wait_event() // waiting for work item to execute
Task T inserted a work item in the worklist of the cpu0 low priority
worker pool. It is waiting for the expedited grace period work item
to execute. This work item will only be executed once kworker/0:0
completes execution of cpuset_hotplug_workfn().
kworker/0:0 ==> Task T ==> kworker/0:0
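For clarity, a condensed sketch of the loop in cgroup_transfer_tasks()
that ends up spinning (approximate; error handling and tracing omitted):

	do {
		css_task_iter_start(&from->self, 0, &it);
		task = css_task_iter_next(&it);	/* keeps returning task T */
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
			/* cgroup_migrate_add_task() skips PF_EXITING,
			 * so T never moves and ret stays 0 */
			ret = cgroup_migrate(task, false, &mgctx);
			put_task_struct(task);
		}
	} while (task && !ret);			/* never terminates */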
The following suggested patch might not be able to fix the
above-mentioned case:
https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
A combination of the following patches fixes the above-mentioned scenario
as well:
1) Inverting cpuset_mutex and cpu_hotplug_lock locking sequence
2) Making cpuset hotplug work synchronous
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
Hello, Peter.
On Tue, Dec 05, 2017 at 12:01:17AM +0100, Peter Zijlstra wrote:
> > AFAICS, this should remove the circular dependency you originally
> > reported. I'll revert the two cpuset commits for now.
>
> So I liked his patches in that we would be able to go back to
> synchronous sched_domain building.
Ah, yeah, that's a separate issue but didn't we intentionally make
that asynchronous? IIRC, cpuset migration can take a really long time
when the memory migration is turned on and doing that synchronously
could mess up the system.
Thanks.
--
tejun
Hello, Prateek.
On Fri, Dec 08, 2017 at 05:15:55PM +0530, Prateek Sood wrote:
> There is one deadlock issue during cgroup migration from cpu
> hotplug path when a task T is being moved from source to
> destination cgroup.
>
> kworker/0:0
> cpuset_hotplug_workfn()
> cpuset_hotplug_update_tasks()
> hotplug_update_tasks_legacy()
> remove_tasks_in_empty_cpuset()
> cgroup_transfer_tasks() // stuck in iterator loop
> cgroup_migrate()
> cgroup_migrate_add_task()
>
> In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T.
> Task T will not migrate to destination cgroup. css_task_iter_start()
> will keep pointing to task T in loop waiting for task T cg_list node
> to be removed.
Heh, that's a bug in cgroup_transfer_tasks() which happened because I
forgot to update when we changed how we handle exiting tasks. The
right thing to do here is making cgroup_transfer_tasks() repeat iff
there was a valid migration target which didn't get transferred.
Thanks.
--
tejun
On 12/11/2017 08:50 PM, Tejun Heo wrote:
> Hello, Peter.
>
> On Tue, Dec 05, 2017 at 12:01:17AM +0100, Peter Zijlstra wrote:
>>> AFAICS, this should remove the circular dependency you originally
>>> reported. I'll revert the two cpuset commits for now.
>>
>> So I liked his patches in that we would be able to go back to
>> synchronous sched_domain building.
>
> Ah, yeah, that's a separate issue but didn't we intentionally make
> that asynchronous? IIRC, cpuset migration can take a really long time
> when the memory migration is turned on and doing that synchronously
> could mess up the system.
>
> Thanks.
>
Hi TJ,
This change makes the usage of cpuset_hotplug_workfn() from cpu
hotplug path synchronous. For memory hotplug it still remains
asynchronous.
Memory migration happening from cpuset_hotplug_workfn() is
already asynchronous by queuing cpuset_migrate_mm_workfn() in
cpuset_migrate_mm_wq.
cpuset_hotplug_workfn()
  cpuset_migrate_mm()
    queue_work(cpuset_migrate_mm_wq)
It seems that memory migration latency might not be impacted by
this change.
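For reference, a condensed sketch of what cpuset_migrate_mm() does in
this kernel (approximate; struct and field names from memory, error
handling trimmed):

	static void cpuset_migrate_mm(struct mm_struct *mm,
				      const nodemask_t *from,
				      const nodemask_t *to)
	{
		struct cpuset_migrate_mm_work *mwork;

		mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
		if (mwork) {
			mwork->mm = mm;
			mwork->from = *from;
			mwork->to = *to;
			INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
			/* the actual page migration runs later from the wq */
			queue_work(cpuset_migrate_mm_wq, &mwork->work);
		} else {
			mmput(mm);
		}
	}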
Please let me know if you meant something else by cpuset
migration taking time when memory migration is turned on.
Thanks
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
On 12/11/2017 09:02 PM, Tejun Heo wrote:
> Hello, Prateek.
>
> On Fri, Dec 08, 2017 at 05:15:55PM +0530, Prateek Sood wrote:
>> There is one deadlock issue during cgroup migration from cpu
>> hotplug path when a task T is being moved from source to
>> destination cgroup.
>>
>> kworker/0:0
>> cpuset_hotplug_workfn()
>> cpuset_hotplug_update_tasks()
>> hotplug_update_tasks_legacy()
>> remove_tasks_in_empty_cpuset()
>> cgroup_transfer_tasks() // stuck in iterator loop
>> cgroup_migrate()
>> cgroup_migrate_add_task()
>>
>> In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T.
>> Task T will not migrate to destination cgroup. css_task_iter_start()
>> will keep pointing to task T in loop waiting for task T cg_list node
>> to be removed.
>
> Heh, that's a bug in cgroup_transfer_tasks() which happened because I
> forgot to update when we changed how we handle exiting tasks. The
> right thing to do here is making cgroup_transfer_tasks() repeat iff
> there were a valid migration target which didn't get transferred.
>
> Thanks.
>
Hi TJ,
Did you mean something like below. If not then could you
please share a patch for this problem in
cgroup_transfer_tasks().
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 473e0c0..41de618 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -143,6 +143,8 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
struct css_task_iter *it);
+void css_task_migrate_iter_start(struct cgroup_subsys_state *css,
+ unsigned int flags, struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085d..12279ae 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -122,7 +122,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
* ->can_attach() fails.
*/
do {
- css_task_iter_start(&from->self, 0, &it);
+ css_task_migrate_iter_start(&from->self, 0, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe1..3c1d2d2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4189,6 +4189,42 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
spin_unlock_irq(&css_set_lock);
}
+void css_task_migrate_iter_start(struct cgroup_subsys_state *css,
+ unsigned int flags, struct css_task_iter *it)
+{
+ struct task_struct *task = NULL;
+ /* no one should try to iterate before mounting cgroups */
+ WARN_ON_ONCE(!use_task_css_set_links);
+
+ memset(it, 0, sizeof(*it));
+
+ spin_lock_irq(&css_set_lock);
+
+ it->ss = css->ss;
+ it->flags = flags;
+
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
+
+ it->cset_head = it->cset_pos;
+
+ css_task_iter_advance_css_set(it);
+
+ while (it->task_pos) {
+ task = list_entry(it->task_pos, struct task_struct,
+ cg_list);
+
+ if (likely(!(task->flags & PF_EXITING)))
+ break;
+
+ css_task_iter_advance(it);
+ }
+
+ spin_unlock_irq(&css_set_lock);
+}
+
/**
* css_task_iter_next - return the next task for the iterator
* @it: the task iterator being iterated
Thanks
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
Hello, Prateek.
On Wed, Dec 13, 2017 at 07:58:24PM +0530, Prateek Sood wrote:
> Did you mean something like below. If not then could you
> please share a patch for this problem in
> cgroup_transfer_tasks().
Oh we surely can add a new iterator but we can just count in
cgroup_transfer_tasks() too, right?
Thanks.
--
tejun
Hello, Prateek.
On Wed, Dec 13, 2017 at 01:20:46PM +0530, Prateek Sood wrote:
> This change makes the usage of cpuset_hotplug_workfn() from cpu
> hotplug path synchronous. For memory hotplug it still remains
> asynchronous.
Ah, right.
> Memory migration happening from cpuset_hotplug_workfn() is
> already asynchronous by queuing cpuset_migrate_mm_workfn() in
> cpuset_migrate_mm_wq.
>
> cpuset_hotplug_workfn()
> cpuset_hotplug_workfn(()
> cpuset_migrate_mm()
> queue_work(cpuset_migrate_mm_wq)
>
> It seems that memory migration latency might not have
> impact with this change.
>
> Please let me know if you meant something else by cpuset
> migration taking time when memory migration is turned on.
No, I didn't. I was just confused about which part became
synchronous. So, I don't have anything against making the cpu part
synchronous, but let's not do that as the fix to the deadlocks cuz,
while we can avoid them by changing cpuset, I don't think cpuset is
the root cause for them. If there are benefits to making cpuset cpu
migration synchronous, let's do that for those benefits.
Thanks.
--
tejun
On 12/13/2017 09:10 PM, Tejun Heo wrote:
Hi TJ,
> Hello, Prateek.
>
> On Wed, Dec 13, 2017 at 07:58:24PM +0530, Prateek Sood wrote:
>> Did you mean something like below. If not then could you
>> please share a patch for this problem in
>> cgroup_transfer_tasks().
>
> Oh we surely can add a new iterator but we can just count in
> cgroup_transfer_tasks() too, right?
I did not get what you meant by this. Could you please share a
patch for this?
>
> Thanks.
>
Following are two ways to improve cgroup_transfer_tasks(). In
both cases a task in PF_EXITING state would be left in the source
cgroup. It would be removed in cgroup_exit() on the exit path.
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085d..e2bdcdb 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
*/
do {
css_task_iter_start(&from->self, 0, &it);
- task = css_task_iter_next(&it);
+ do {
+ task = css_task_iter_next(&it);
+ } while (task && (task->flags & PF_EXITING));
+
if (task)
get_task_struct(task);
css_task_iter_end(&it);
----------------------------8<---------------------------------
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085d..843b8bb 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -121,12 +121,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
* Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
+ css_task_iter_start(&from->self, 0, &it);
do {
- css_task_iter_start(&from->self, 0, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
- css_task_iter_end(&it);
if (task) {
ret = cgroup_migrate(task, false, &mgctx);
@@ -135,6 +134,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
put_task_struct(task);
}
} while (task && !ret);
+ css_task_iter_end(&it);
out_err:
cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
Thanks
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
Hello, Prateek.
On Fri, Dec 15, 2017 at 02:24:55PM +0530, Prateek Sood wrote:
> Following are two ways to improve cgroup_transfer_tasks(). In
> both cases task in PF_EXITING state would be left in source
> cgroup. It would be removed from cgroup_exit() in exit path.
>
> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
> index 024085d..e2bdcdb 100644
> --- a/kernel/cgroup/cgroup-v1.c
> +++ b/kernel/cgroup/cgroup-v1.c
> @@ -123,7 +123,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
> */
> do {
> css_task_iter_start(&from->self, 0, &it);
> - task = css_task_iter_next(&it);
> + do {
> + task = css_task_iter_next(&it);
> + } while (task && (task & PF_EXITING))
> +
Yeah, this looks good to me. We can't just make a single pass as in
the other one because we can race against fork. And PF_EXITING being
left behind is what was happening previously too anyway. They can't
be moved.
Thanks.
--
tejun
On 12/13/2017 09:36 PM, Tejun Heo wrote:
> Hello, Prateek.
>
> On Wed, Dec 13, 2017 at 01:20:46PM +0530, Prateek Sood wrote:
>> This change makes the usage of cpuset_hotplug_workfn() from cpu
>> hotplug path synchronous. For memory hotplug it still remains
>> asynchronous.
>
> Ah, right.
>
>> Memory migration happening from cpuset_hotplug_workfn() is
>> already asynchronous by queuing cpuset_migrate_mm_workfn() in
>> cpuset_migrate_mm_wq.
>>
>> cpuset_hotplug_workfn()
>> cpuset_hotplug_workfn(()
>> cpuset_migrate_mm()
>> queue_work(cpuset_migrate_mm_wq)
>>
>> It seems that memory migration latency might not have
>> impact with this change.
>>
>> Please let me know if you meant something else by cpuset
>> migration taking time when memory migration is turned on.
>
> No, I didn't. I was just confused about which part became
> synchronous. So, I don't have anything against making the cpu part
> synchronous, but let's not do that as the fix to the deadlocks cuz,
> while we can avoid them by changing cpuset, I don't think cpuset is
> the root cause for them. If there are benefits to making cpuset cpu
> migration synchronous, let's do that for those benefits.
Making the CPU part synchronous can only be achieved if we retain the
patch inverting the locking order of cpuset_mutex and
cpu_hotplug_lock.
>
> Thanks.
>
Peter,
Do you suggest taking both the patches:
1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock.
2) Making cpuset hotplug work synchronous.
or
https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
I would leave it for you and TJ to decide on this.
Thanks
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
On 12/15/2017 06:52 PM, Tejun Heo wrote:
> Hello, Prateek.
>
> On Fri, Dec 15, 2017 at 02:24:55PM +0530, Prateek Sood wrote:
>> Following are two ways to improve cgroup_transfer_tasks(). In
>> both cases task in PF_EXITING state would be left in source
>> cgroup. It would be removed from cgroup_exit() in exit path.
>>
>> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
>> index 024085d..e2bdcdb 100644
>> --- a/kernel/cgroup/cgroup-v1.c
>> +++ b/kernel/cgroup/cgroup-v1.c
>> @@ -123,7 +123,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
>> */
>> do {
>> css_task_iter_start(&from->self, 0, &it);
>> - task = css_task_iter_next(&it);
>> + do {
>> + task = css_task_iter_next(&it);
>> + } while (task && (task & PF_EXITING))
>> +
>
> Yeah, this looks good to me. We can't just make a single pass as in
> the other one because we can race aginst fork. And PF_EXITING being
> left behind is what was happening previously too anyway. They can't
> be moved.
>
> Thanks.
>
Thanks TJ for reviewing.
I will send a formal patch with the above approved approach.
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
Deadlock during cgroup migration from cpu hotplug path when a task T is
being moved from source to destination cgroup.
kworker/0:0
cpuset_hotplug_workfn()
cpuset_hotplug_update_tasks()
hotplug_update_tasks_legacy()
remove_tasks_in_empty_cpuset()
cgroup_transfer_tasks() // stuck in iterator loop
cgroup_migrate()
cgroup_migrate_add_task()
In cgroup_migrate_add_task() it checks the PF_EXITING flag of task T.
Task T will not migrate to the destination cgroup. css_task_iter_start()
will keep pointing to task T in the loop, waiting for task T's cg_list
node to be removed.
Task T
do_exit()
exit_signals() // sets PF_EXITING
exit_task_namespaces()
switch_task_namespaces()
free_nsproxy()
put_mnt_ns()
drop_collected_mounts()
namespace_unlock()
synchronize_rcu()
_synchronize_rcu_expedited()
schedule_work() // on cpu0 low priority worker pool
wait_event() // waiting for work item to execute
Task T inserted a work item in the worklist of the cpu0 low priority
worker pool. It is waiting for the expedited grace period work item
to execute. This work item will only be executed once kworker/0:0
completes execution of cpuset_hotplug_workfn().
kworker/0:0 ==> Task T ==> kworker/0:0
In case the task picked for migration from the source to the destination
cgroup is in PF_EXITING state, migrate the next available task in the
source cgroup instead.
Change-Id: I8874fb04479c136cae4dabd5c168c7749df66664
Signed-off-by: Prateek Sood <[email protected]>
---
kernel/cgroup/cgroup-v1.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085d..a2c05d2 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
*/
do {
css_task_iter_start(&from->self, 0, &it);
- task = css_task_iter_next(&it);
+
+ do {
+ task = css_task_iter_next(&it);
+ } while (task && (task->flags & PF_EXITING));
+
if (task)
get_task_struct(task);
css_task_iter_end(&it);
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc.,
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.
On Tue, Dec 19, 2017 at 12:56:57PM +0530, Prateek Sood wrote:
> Deadlock during cgroup migration from cpu hotplug path when a task T is
> being moved from source to destination cgroup.
...
> Task T inserted a work item in the worklist of cpu0 low priority
> worker pool. It is waiting for expedited grace period work item
> to execute. This work item will only be executed once kworker/0:0
> complete execution of cpuset_hotplug_workfn().
>
> kworker/0:0 ==> Task T ==>kworker/0:0
>
> In case of PF_EXITING task being migrated from source to destination
> cgroup, migrate next available task in source cgroup.
>
> Change-Id: I8874fb04479c136cae4dabd5c168c7749df66664
> Signed-off-by: Prateek Sood <[email protected]>
Applied to cgroup/for-4.15-fixes.
Thanks a lot, Prateek.
--
tejun
On 12/13/2017 09:36 PM, Tejun Heo wrote:
> Hello, Prateek.
>
> On Wed, Dec 13, 2017 at 01:20:46PM +0530, Prateek Sood wrote:
>> This change makes the usage of cpuset_hotplug_workfn() from cpu
>> hotplug path synchronous. For memory hotplug it still remains
>> asynchronous.
>
> Ah, right.
>
>> Memory migration happening from cpuset_hotplug_workfn() is
>> already asynchronous by queuing cpuset_migrate_mm_workfn() in
>> cpuset_migrate_mm_wq.
>>
>> cpuset_hotplug_workfn()
>> cpuset_hotplug_workfn(()
>> cpuset_migrate_mm()
>> queue_work(cpuset_migrate_mm_wq)
>>
>> It seems that memory migration latency might not have
>> impact with this change.
>>
>> Please let me know if you meant something else by cpuset
>> migration taking time when memory migration is turned on.
>
> No, I didn't. I was just confused about which part became
> synchronous. So, I don't have anything against making the cpu part
> synchronous, but let's not do that as the fix to the deadlocks cuz,
> while we can avoid them by changing cpuset, I don't think cpuset is
> the root cause for them. If there are benefits to making cpuset cpu
> migration synchronous, let's do that for those benefits.
>
> Thanks.
>
TJ,
One more deadlock scenario
Task: sh
[<ffffff874f917290>] wait_for_completion+0x14
[<ffffff874e8a82e0>] cpuhp_kick_ap_work+0x80 //waiting for cpuhp/2
[<ffffff874f913780>] _cpu_down+0xe0
[<ffffff874e8a9934>] cpu_down+0x38
[<ffffff874ef6e4ec>] cpu_subsys_offline+0x10
Task: cpuhp/2
[<ffffff874f91645c>] schedule+0x38
[<ffffff874e92c76c>] _synchronize_rcu_expedited+0x2ec
[<ffffff874e92c874>] synchronize_sched_expedited+0x60
[<ffffff874e92c9f8>] synchronize_sched+0xb0
[<ffffff874e9104e4>] sugov_stop+0x58
[<ffffff874f36967c>] cpufreq_stop_governor+0x48
[<ffffff874f36a89c>] cpufreq_offline+0x84
[<ffffff874f36aa30>] cpuhp_cpufreq_offline+0xc
[<ffffff874e8a797c>] cpuhp_invoke_callback+0xac
[<ffffff874e8a89b4>] cpuhp_down_callbacks+0x58
[<ffffff874e8a95e8>] cpuhp_thread_fun+0xa8
_synchronize_rcu_expedited() is waiting for execution of the RCU
expedited grace period work item wait_rcu_exp_gp().
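Condensed shape of _synchronize_rcu_expedited() (kernel/rcu/tree_exp.h,
approximate):

	INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
	schedule_work(&rew.rew_work);	/* queued on system_wq, i.e. a
					   normal per-cpu worker pool */

	/* ...then blocks until the queued work item has driven the
	 * expedited grace period to completion: */
	wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
		   sync_exp_work_done(rsp, s));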
Task: kworker/2:1
[<ffffff874f91645c>] schedule+0x38
[<ffffff874f916870>] schedule_preempt_disabled+0x20
[<ffffff874f918df8>] __mutex_lock_slowpath+0x158
[<ffffff874f919004>] mutex_lock+0x14
[<ffffff874e8a77b0>] get_online_cpus+0x34 //waiting for cpu_hotplug_lock
[<ffffff874e96452c>] rebuild_sched_domains+0x30
[<ffffff874e964648>] cpuset_hotplug_workfn+0xb8
[<ffffff874e8c27b8>] process_one_work+0x168
[<ffffff874e8c2ff4>] worker_thread+0x140
[<ffffff874e8c95b8>] kthread+0xe0
cpu_hotplug_lock is acquired by task: sh
Task: kworker/2:3
[<ffffff874f91645c>] schedule+0x38
[<ffffff874f91a384>] schedule_timeout+0x1d8
[<ffffff874f9171d4>] wait_for_common+0xb4
[<ffffff874f917304>] wait_for_completion_killable+0x14 //waiting for kthreadd
[<ffffff874e8c931c>] __kthread_create_on_node+0xec
[<ffffff874e8c9448>] kthread_create_on_node+0x64
[<ffffff874e8c2d88>] create_worker+0xb4
[<ffffff874e8c3194>] worker_thread+0x2e0
[<ffffff874e8c95b8>] kthread+0xe0
Task: kthreadd
[<ffffff874e8858ec>] __switch_to+0x94
[<ffffff874f915ed8>] __schedule+0x2a8
[<ffffff874f91645c>] schedule+0x38
[<ffffff874f91a0ec>] rwsem_down_read_failed+0xe8
[<ffffff874e913dc4>] __percpu_down_read+0xfc
[<ffffff874e8a4ab0>] copy_process.isra.72.part.73+0xf60
[<ffffff874e8a53b8>] _do_fork+0xc4
[<ffffff874e8a5720>] kernel_thread+0x34
[<ffffff874e8ca83c>] kthreadd+0x144
kthreadd is waiting for cgroup_threadgroup_rwsem acquired
by task T
Task: T
[<ffffff874f91645c>] schedule+0x38
[<ffffff874f916870>] schedule_preempt_disabled+0x20
[<ffffff874f918df8>] __mutex_lock_slowpath+0x158
[<ffffff874f919004>] mutex_lock+0x14
[<ffffff874e962ff4>] cpuset_can_attach+0x58
[<ffffff874e95d640>] cgroup_taskset_migrate+0x8c
[<ffffff874e95d9b4>] cgroup_migrate+0xa4
[<ffffff874e95daf0>] cgroup_attach_task+0x100
[<ffffff874e95df28>] __cgroup_procs_write.isra.35+0x228
[<ffffff874e95e00c>] cgroup_tasks_write+0x10
[<ffffff874e958294>] cgroup_file_write+0x44
[<ffffff874eaa4384>] kernfs_fop_write+0xc0
task T is waiting for cpuset_mutex acquired
by kworker/2:1
sh ==> cpuhp/2 ==> kworker/2:1 ==> sh
kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1
It seems that my earlier patch set should fix this scenario:
1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock.
2) Making cpuset hotplug work synchronous.
Could you please share your feedback.
Thanks
--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project
Hello,
On Fri, Dec 29, 2017 at 02:07:16AM +0530, Prateek Sood wrote:
> task T is waiting for cpuset_mutex acquired
> by kworker/2:1
>
> sh ==> cpuhp/2 ==> kworker/2:1 ==> sh
>
> kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1
>
> It seems that my earlier patch set should fix this scenario:
> 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock.
> 2) Make cpuset hotplug work synchronous.
>
> Could you please share your feedback.
Hmm... this can also be resolved by adding WQ_MEM_RECLAIM to the
synchronize rcu workqueue, right? Given the wide-spread usages of
synchronize_rcu and friends, maybe that's the right solution, or at
least something we also need to do, for this particular deadlock?
Again, I don't have anything against making the domain rebuilding part
of cpuset operations synchronous and these tricky deadlock scenarios
do indicate that doing so would probably be beneficial. That said,
tho, these scenarios seem more of manifestations of other problems
exposed through kthreadd dependency than anything else.
Thanks.
--
tejun
On Tue, Jan 02, 2018 at 08:16:56AM -0800, Tejun Heo wrote:
> Hello,
>
> On Fri, Dec 29, 2017 at 02:07:16AM +0530, Prateek Sood wrote:
> > task T is waiting for cpuset_mutex acquired
> > by kworker/2:1
> >
> > sh ==> cpuhp/2 ==> kworker/2:1 ==> sh
> >
> > kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1
> >
> > It seems that my earlier patch set should fix this scenario:
> > 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock.
> > 2) Make cpuset hotplug work synchronous.
> >
> > Could you please share your feedback.
>
> Hmm... this can also be resolved by adding WQ_MEM_RECLAIM to the
> synchronize rcu workqueue, right? Given the wide-spread usages of
> synchronize_rcu and friends, maybe that's the right solution, or at
> least something we also need to do, for this particular deadlock?
To make WQ_MEM_RECLAIM work, I need to dynamically allocate RCU's
workqueues, correct? Or is there some way to mark a statically
allocated workqueue as WQ_MEM_RECLAIM after the fact?
I can dynamically allocate them, but I need to carefully investigate
boot-time use. So if it is possible to be lazy, I do want to take
the easy way out. ;-)
Thanx, Paul
> Again, I don't have anything against making the domain rebuliding part
> of cpuset operations synchronous and these tricky deadlock scenarios
> do indicate that doing so would probably be beneficial. That said,
> tho, these scenarios seem more of manifestations of other problems
> exposed through kthreadd dependency than anything else.
>
> Thanks.
>
> --
> tejun
>
On Tue, Jan 02, 2018 at 09:44:08AM -0800, Paul E. McKenney wrote:
> On Tue, Jan 02, 2018 at 08:16:56AM -0800, Tejun Heo wrote:
> > Hello,
> >
> > On Fri, Dec 29, 2017 at 02:07:16AM +0530, Prateek Sood wrote:
> > > task T is waiting for cpuset_mutex acquired
> > > by kworker/2:1
> > >
> > > sh ==> cpuhp/2 ==> kworker/2:1 ==> sh
> > >
> > > kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1
> > >
> > > It seems that my earlier patch set should fix this scenario:
> > > 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock.
> > > 2) Make cpuset hotplug work synchronous.
> > >
> > > Could you please share your feedback.
> >
> > Hmm... this can also be resolved by adding WQ_MEM_RECLAIM to the
> > synchronize rcu workqueue, right? Given the wide-spread usages of
> > synchronize_rcu and friends, maybe that's the right solution, or at
> > least something we also need to do, for this particular deadlock?
>
> To make WQ_MEM_RECLAIM work, I need to dynamically allocate RCU's
> workqueues, correct? Or is there some way to mark a statically
> allocated workqueue as WQ_MEM_RECLAIM after the fact?
>
> I can dynamically allocate them, but I need to carefully investigate
> boot-time use. So if it is possible to be lazy, I do want to take
> the easy way out. ;-)
Actually, after taking a quick look, could you please supply me with
a way of marking a statically allocated workqueue as WQ_MEM_RECLAIM after
the fact? Otherwise, I end up having to check for the workqueue having
been allocated pretty much each time I use it, which is going to be an
open invitation for bugs. Plus it looks like there are ways that RCU's
workqueue wakeups can be executed during very early boot, which can be
handled, but again in a rather messy fashion.
In contrast, given a way of marking a statically allocated workqueue
as WQ_MEM_RECLAIM after the fact, I simply continue initializing the
workqueue at early boot, and then add the WQ_MEM_RECLAIM marking some
arbitrarily chosen time after the scheduler has been initialized.
The required change to workqueues looks easy, just move the body of
the "if (flags & WQ_MEM_RECLAIM) {" statement in __alloc_workqueue_key()
to a separate function, right?
Thanx, Paul
Hello, Paul.
Sorry about the delay. Travel followed by cold. :(
On Tue, Jan 02, 2018 at 10:01:19AM -0800, Paul E. McKenney wrote:
> Actually, after taking a quick look, could you please supply me with
> a way of mark a statically allocated workqueue as WQ_MEM_RECLAIM after
> the fact? Otherwise, I end up having to check for the workqueue having
Hmmm... there is no statically allocated workqueue tho. If you're
referring to the system-wide workqueues (system*_wq), they're just
created dynamically early during boot.
> been allocated pretty much each time I use it, which is going to be an
> open invitation for bugs. Plus it looks like there are ways that RCU's
> workqueue wakeups can be executed during very early boot, which can be
> handled, but again in a rather messy fashion.
>
> In contrast, given a way of mark a statically allocated workqueue
> as WQ_MEM_RECLAIM after the fact, I simply continue initializing the
> workqueue at early boot, and then add the WQ_MEM_RECLAIM marking some
> arbitrarily chosen time after the scheduler has been initialized.
>
> The required change to workqueues looks easy, just move the body of
> the "if (flags & WQ_MEM_RECLAIM) {" statement in __alloc_workqueue_key()
> to a separate function, right?
Ah, okay, yes, currently, workqueue init is kinda silly in that while
it allows init of non-mem-reclaiming workqueues way before workqueue
is actually online, it doesn't allow the same for mem-reclaiming ones.
As you pointed out, it's just an oversight on my part as the init path
split was done initially to accommodate early init of system
workqueues.
I'll update the code so that rescuers can be added later too; however,
please note that while the work items may be queued, they won't be
executed until workqueue_init() is run (the same as now) as there
can't be worker threads anyway before that point.
Thanks.
--
tejun
Hello, Paul.
Applied the following two patches to allow early init of
WQ_MEM_RECLAIM workqueues to the following git branch.
git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-4.16
The branch won't be rebased and currently only contains these two
patches. Please feel free to pull and put further RCU changes on top.
Thanks.
------ 8< ------
From 983c751532738b83c4c5b51192ebc6fbfe9330f7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <[email protected]>
Date: Mon, 8 Jan 2018 05:38:32 -0800
Separate out init_rescuer() from __alloc_workqueue_key() to prepare
for early init support for WQ_MEM_RECLAIM. This patch doesn't
introduce any functional changes.
Signed-off-by: Tejun Heo <[email protected]>
Cc: Paul McKenney <[email protected]>
---
kernel/workqueue.c | 56 ++++++++++++++++++++++++++++++++----------------------
1 file changed, 33 insertions(+), 23 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 43d18cb..5baac7c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3939,6 +3939,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
return clamp_val(max_active, 1, lim);
}
+/*
+ * Workqueues which may be used during memory reclaim should have a rescuer
+ * to guarantee forward progress.
+ */
+static int init_rescuer(struct workqueue_struct *wq)
+{
+ struct worker *rescuer;
+ int ret;
+
+ if (!(wq->flags & WQ_MEM_RECLAIM))
+ return 0;
+
+ rescuer = alloc_worker(NUMA_NO_NODE);
+ if (!rescuer)
+ return -ENOMEM;
+
+ rescuer->rescue_wq = wq;
+ rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
+ ret = PTR_ERR_OR_ZERO(rescuer->task);
+ if (ret) {
+ kfree(rescuer);
+ return ret;
+ }
+
+ wq->rescuer = rescuer;
+ kthread_bind_mask(rescuer->task, cpu_possible_mask);
+ wake_up_process(rescuer->task);
+
+ return 0;
+}
+
struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
unsigned int flags,
int max_active,
@@ -4001,29 +4032,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (alloc_and_link_pwqs(wq) < 0)
goto err_free_wq;
- /*
- * Workqueues which may be used during memory reclaim should
- * have a rescuer to guarantee forward progress.
- */
- if (flags & WQ_MEM_RECLAIM) {
- struct worker *rescuer;
-
- rescuer = alloc_worker(NUMA_NO_NODE);
- if (!rescuer)
- goto err_destroy;
-
- rescuer->rescue_wq = wq;
- rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
- wq->name);
- if (IS_ERR(rescuer->task)) {
- kfree(rescuer);
- goto err_destroy;
- }
-
- wq->rescuer = rescuer;
- kthread_bind_mask(rescuer->task, cpu_possible_mask);
- wake_up_process(rescuer->task);
- }
+ if (init_rescuer(wq) < 0)
+ goto err_destroy;
if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
goto err_destroy;
--
2.9.5
From 40c17f75dfa9ec163478efd3f7443fd6af9992c9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <[email protected]>
Date: Mon, 8 Jan 2018 05:38:37 -0800
Workqueues can be created early during boot before the workqueue subsystem
is fully online - work items are queued waiting for later full
initialization. However, early init wasn't supported for
WQ_MEM_RECLAIM workqueues, causing unnecessary annoyances for a subset
of users. Expand early init support to include WQ_MEM_RECLAIM
workqueues.
Signed-off-by: Tejun Heo <[email protected]>
Cc: Paul McKenney <[email protected]>
---
kernel/workqueue.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5baac7c..c86cc1e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4032,7 +4032,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (alloc_and_link_pwqs(wq) < 0)
goto err_free_wq;
- if (init_rescuer(wq) < 0)
+ if (wq_online && init_rescuer(wq) < 0)
goto err_destroy;
if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
@@ -5639,6 +5639,8 @@ int __init workqueue_init(void)
* archs such as power and arm64. As per-cpu pools created
* previously could be missing node hint and unbound pools NUMA
* affinity, fix them up.
+ *
+ * Also, while iterating workqueues, create rescuers if requested.
*/
wq_numa_init();
@@ -5650,8 +5652,12 @@ int __init workqueue_init(void)
}
}
- list_for_each_entry(wq, &workqueues, list)
+ list_for_each_entry(wq, &workqueues, list) {
wq_update_unbound_numa(wq, smp_processor_id(), true);
+ WARN(init_rescuer(wq),
+ "workqueue: failed to create early rescuer for %s",
+ wq->name);
+ }
mutex_unlock(&wq_pool_mutex);
--
2.9.5
On Mon, Jan 08, 2018 at 04:28:23AM -0800, Tejun Heo wrote:
> Hello, Paul.
>
> Sorry about the delay. Travel followed by cold. :(
>
> On Tue, Jan 02, 2018 at 10:01:19AM -0800, Paul E. McKenney wrote:
> > Actually, after taking a quick look, could you please supply me with
> > a way of mark a statically allocated workqueue as WQ_MEM_RECLAIM after
> > the fact? Otherwise, I end up having to check for the workqueue having
>
> Hmmm... there is no statically allocated workqueue tho. If you're
> referring to the system-wide workqueues (system*_wq), they're just
> created dynamically early during boot.
Good point, I was confused. But yes, they are conveniently allocated
just before the call to rcu_init(), which does work out well. ;-)
> > been allocated pretty much each time I use it, which is going to be an
> > open invitation for bugs. Plus it looks like there are ways that RCU's
> > workqueue wakeups can be executed during very early boot, which can be
> > handled, but again in a rather messy fashion.
> >
> > In contrast, given a way of mark a statically allocated workqueue
> > as WQ_MEM_RECLAIM after the fact, I simply continue initializing the
> > workqueue at early boot, and then add the WQ_MEM_RECLAIM marking some
> > arbitrarily chosen time after the scheduler has been initialized.
> >
> > The required change to workqueues looks easy, just move the body of
> > the "if (flags & WQ_MEM_RECLAIM) {" statement in __alloc_workqueue_key()
> > to a separate function, right?
>
> Ah, okay, yes, currently, workqueue init is kinda silly in that while
> it allows init of non-mem-reclaiming workqueues way before workqueue
> is actually online, it doesn't allow the same for mem-reclaiming ones.
> As you pointed out, it's just an oversight on my part as the init path
> split was done initially to accomodate early init of system
> workqueues.
>
> I'll update the code so that rescuers can be added later too; however,
> please note that while the work items may be queued, they won't be
> executed until workqueue_init() is run (the same as now) as there
> can't be worker threads anyway before that point.
Thank you! I added the following patch to allow RCU access to the
init_rescuer() function. Does that work for you, or did you have some
other arrangement in mind?
Thanx, Paul
------------------------------------------------------------------------
commit 66683a07503d71e5d5cceac72caf772e6e59c787
Author: Paul E. McKenney <[email protected]>
Date: Mon Jan 8 14:27:46 2018 -0800
workqueue: Allow init_rescuer() to be invoked from other files
This commit exports init_rescuer() so that RCU can invoke it.
Signed-off-by: Paul E. McKenney <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Lai Jiangshan <[email protected]>
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 4a54ef96aff5..31ce9343b4a9 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -433,6 +433,8 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
#define create_singlethread_workqueue(name) \
alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
+int init_rescuer(struct workqueue_struct *wq);
+
extern void destroy_workqueue(struct workqueue_struct *wq);
struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c86cc1ed678b..7440c61c6213 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3943,7 +3943,7 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
* Workqueues which may be used during memory reclaim should have a rescuer
* to guarantee forward progress.
*/
-static int init_rescuer(struct workqueue_struct *wq)
+int init_rescuer(struct workqueue_struct *wq)
{
struct worker *rescuer;
int ret;
On Mon, Jan 08, 2018 at 02:52:38PM -0800, Paul E. McKenney wrote:
> On Mon, Jan 08, 2018 at 04:28:23AM -0800, Tejun Heo wrote:
> > Hello, Paul.
> >
> > Sorry about the delay. Travel followed by cold. :(
> >
> > On Tue, Jan 02, 2018 at 10:01:19AM -0800, Paul E. McKenney wrote:
> > > Actually, after taking a quick look, could you please supply me with
> > > a way of mark a statically allocated workqueue as WQ_MEM_RECLAIM after
> > > the fact? Otherwise, I end up having to check for the workqueue having
> >
> > Hmmm... there is no statically allocated workqueue tho. If you're
> > referring to the system-wide workqueues (system*_wq), they're just
> > created dynamically early during boot.
>
> Good point, I was confused. But yes, they are conveniently allocated
> just before the call to rcu_init(), which does work out well. ;-)
>
> > > been allocated pretty much each time I use it, which is going to be an
> > > open invitation for bugs. Plus it looks like there are ways that RCU's
> > > workqueue wakeups can be executed during very early boot, which can be
> > > handled, but again in a rather messy fashion.
> > >
> > > In contrast, given a way of mark a statically allocated workqueue
> > > as WQ_MEM_RECLAIM after the fact, I simply continue initializing the
> > > workqueue at early boot, and then add the WQ_MEM_RECLAIM marking some
> > > arbitrarily chosen time after the scheduler has been initialized.
> > >
> > > The required change to workqueues looks easy, just move the body of
> > > the "if (flags & WQ_MEM_RECLAIM) {" statement in __alloc_workqueue_key()
> > > to a separate function, right?
> >
> > Ah, okay, yes, currently, workqueue init is kinda silly in that while
> > it allows init of non-mem-reclaiming workqueues way before workqueue
> > is actually online, it doesn't allow the same for mem-reclaiming ones.
> > As you pointed out, it's just an oversight on my part as the init path
> > split was done initially to accomodate early init of system
> > workqueues.
> >
> > I'll update the code so that rescuers can be added later too; however,
> > please note that while the work items may be queued, they won't be
> > executed until workqueue_init() is run (the same as now) as there
> > can't be worker threads anyway before that point.
>
> Thank you! I added the following patch to allow RCU access to the
> init_rescuer() function. Does that work for you, or did you have some
> other arrangement in mind?
And here are the corresponding changes to RCU, which pass light rcutorture
testing.
Thanx, Paul
------------------------------------------------------------------------
commit d0d6626927faf3421df6a1db875ad7099f7d49cd
Author: Paul E. McKenney <[email protected]>
Date: Mon Jan 8 14:35:52 2018 -0800
rcu: Create RCU-specific workqueues with rescuers
RCU's expedited grace periods can participate in out-of-memory deadlocks
due to all available system_wq kthreads being blocked and there not being
memory available to create more. This commit prevents such deadlocks
by allocating an RCU-specific workqueue_struct at early boot time, and
providing it with a rescuer to ensure forward progress. This uses the
shiny new init_rescuer() function provided by Tejun.
This commit also causes SRCU to use this new RCU-specific
workqueue_struct. Note that SRCU's use of workqueues never blocks them
waiting for readers, so this should be safe from a forward-progress
viewpoint.
Reported-by: Prateek Sood <[email protected]>
Reported-by: Tejun Heo <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 59c471de342a..acabc4781b08 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -493,6 +493,7 @@ void show_rcu_gp_kthreads(void);
void rcu_force_quiescent_state(void);
void rcu_bh_force_quiescent_state(void);
void rcu_sched_force_quiescent_state(void);
+extern struct workqueue_struct *rcu_gp_workqueue;
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6d5880089ff6..89f0f6b3ce9a 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -465,7 +465,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
*/
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
- srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
+ srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_workqueue,
&sdp->work, delay);
}
@@ -664,7 +664,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
srcu_gp_start(sp);
- queue_delayed_work(system_power_efficient_wq, &sp->work,
+ queue_delayed_work(rcu_gp_workqueue, &sp->work,
srcu_get_delay(sp));
}
raw_spin_unlock_irqrestore_rcu_node(sp, flags);
@@ -1198,7 +1198,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
raw_spin_unlock_irq_rcu_node(sp);
if (pushgp)
- queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
+ queue_delayed_work(rcu_gp_workqueue, &sp->work, delay);
}
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f9c0ca2ccf0c..99c12650b9db 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4272,6 +4272,15 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
pr_cont("\n");
}
+struct workqueue_struct *rcu_gp_workqueue;
+
+static int __init rcu_init_wq_rescuer(void)
+{
+ WARN_ON(init_rescuer(rcu_gp_workqueue));
+ return 0;
+}
+core_initcall(rcu_init_wq_rescuer);
+
void __init rcu_init(void)
{
int cpu;
@@ -4298,6 +4307,10 @@ void __init rcu_init(void)
rcu_cpu_starting(cpu);
rcutree_online_cpu(cpu);
}
+
+ /* Create workqueue for expedited GPs and for Tree SRCU. */
+ rcu_gp_workqueue = alloc_workqueue("rcu_gp", 0, 0);
+ WARN_ON(!rcu_gp_workqueue);
}
#include "tree_exp.h"
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 46d61b597731..3ba3ef4d4796 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -606,7 +606,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
rew.rew_rsp = rsp;
rew.rew_s = s;
INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- schedule_work(&rew.rew_work);
+ queue_work(rcu_gp_workqueue, &rew.rew_work);
}
/* Wait for expedited grace period to complete. */
Hello, Paul.
On Mon, Jan 08, 2018 at 04:31:27PM -0800, Paul E. McKenney wrote:
> +static int __init rcu_init_wq_rescuer(void)
> +{
> + WARN_ON(init_rescuer(rcu_gp_workqueue));
> + return 0;
> +}
> +core_initcall(rcu_init_wq_rescuer);
So, what I don't get is why RCU needs to call this explicitly.
core_initcall() is after workqueue_init() anyway. What am I missing?
Thanks.
--
tejun
On Mon, Jan 08, 2018 at 07:42:11PM -0800, Tejun Heo wrote:
> Hello, Paul.
>
> On Mon, Jan 08, 2018 at 04:31:27PM -0800, Paul E. McKenney wrote:
> > +static int __init rcu_init_wq_rescuer(void)
> > +{
> > + WARN_ON(init_rescuer(rcu_gp_workqueue));
> > + return 0;
> > +}
> > +core_initcall(rcu_init_wq_rescuer);
>
> So, what I don't get is why RCU needs to call this explicitly.
> core_initcall() is after workqueue_init() anyway. Why am I missing?
Me being stupid, I guess.
OK, so I can put WQ_MEM_RECLAIM on the early boot creation of RCU's
workqueue_struct as shown below, right?
Thanx, Paul
------------------------------------------------------------------------
commit 9884a945a65837cda6de2ff621d47c59a6ca3e28
Author: Paul E. McKenney <[email protected]>
Date: Mon Jan 8 14:35:52 2018 -0800
rcu: Create RCU-specific workqueues with rescuers
RCU's expedited grace periods can participate in out-of-memory deadlocks
due to all available system_wq kthreads being blocked and there not being
memory available to create more. This commit prevents such deadlocks
by allocating an RCU-specific workqueue_struct at early boot time, and
providing it with a rescuer to ensure forward progress. This uses the
shiny new init_rescuer() function provided by Tejun (but indirectly).
This commit also causes SRCU to use this new RCU-specific
workqueue_struct. Note that SRCU's use of workqueues never blocks them
waiting for readers, so this should be safe from a forward-progress
viewpoint.
Reported-by: Prateek Sood <[email protected]>
Reported-by: Tejun Heo <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 59c471de342a..acabc4781b08 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -493,6 +493,7 @@ void show_rcu_gp_kthreads(void);
void rcu_force_quiescent_state(void);
void rcu_bh_force_quiescent_state(void);
void rcu_sched_force_quiescent_state(void);
+extern struct workqueue_struct *rcu_gp_workqueue;
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6d5880089ff6..89f0f6b3ce9a 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -465,7 +465,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
*/
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
- srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
+ srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_workqueue,
&sdp->work, delay);
}
@@ -664,7 +664,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
srcu_gp_start(sp);
- queue_delayed_work(system_power_efficient_wq, &sp->work,
+ queue_delayed_work(rcu_gp_workqueue, &sp->work,
srcu_get_delay(sp));
}
raw_spin_unlock_irqrestore_rcu_node(sp, flags);
@@ -1198,7 +1198,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
raw_spin_unlock_irq_rcu_node(sp);
if (pushgp)
- queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
+ queue_delayed_work(rcu_gp_workqueue, &sp->work, delay);
}
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f9c0ca2ccf0c..d658538e6f7d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4272,6 +4272,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
pr_cont("\n");
}
+struct workqueue_struct *rcu_gp_workqueue;
+
void __init rcu_init(void)
{
int cpu;
@@ -4298,6 +4300,10 @@ void __init rcu_init(void)
rcu_cpu_starting(cpu);
rcutree_online_cpu(cpu);
}
+
+ /* Create workqueue for expedited GPs and for Tree SRCU. */
+ rcu_gp_workqueue = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_gp_workqueue);
}
#include "tree_exp.h"
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 46d61b597731..3ba3ef4d4796 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -606,7 +606,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
rew.rew_rsp = rsp;
rew.rew_s = s;
INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- schedule_work(&rew.rew_work);
+ queue_work(rcu_gp_workqueue, &rew.rew_work);
}
/* Wait for expedited grace period to complete. */
Hello, Paul.
On Mon, Jan 08, 2018 at 08:20:16PM -0800, Paul E. McKenney wrote:
> OK, so I can put WQ_MEM_RECLAIM on the early boot creation of RCU's
> workqueue_struct as shown below, right?
Yes, this looks good to me. Just one question.
> +struct workqueue_struct *rcu_gp_workqueue;
> +
> void __init rcu_init(void)
> {
> int cpu;
> @@ -4298,6 +4300,10 @@ void __init rcu_init(void)
> rcu_cpu_starting(cpu);
> rcutree_online_cpu(cpu);
> }
> +
> + /* Create workqueue for expedited GPs and for Tree SRCU. */
> + rcu_gp_workqueue = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
> + WARN_ON(!rcu_gp_workqueue);
The code was previously using both system_power_efficient_wq and
system_wq (for the expedited path). I guess the options were
either using two workqueues or dropping POWER_EFFICIENT. I have no
idea how big an impact this will make or whether it'd even be
noticeable but maybe it'd be worthwhile to mention that in the
description?
Thanks.
--
tejun
On Tue, Jan 09, 2018 at 05:44:48AM -0800, Tejun Heo wrote:
> Hello, Paul.
>
> On Mon, Jan 08, 2018 at 08:20:16PM -0800, Paul E. McKenney wrote:
> > OK, so I can put WQ_MEM_RECLAIM on the early boot creation of RCU's
> > workqueue_struct as shown below, right?
>
> Yes, this looks good to me. Just one question.
>
> > +struct workqueue_struct *rcu_gp_workqueue;
> > +
> > void __init rcu_init(void)
> > {
> > int cpu;
> > @@ -4298,6 +4300,10 @@ void __init rcu_init(void)
> > rcu_cpu_starting(cpu);
> > rcutree_online_cpu(cpu);
> > }
> > +
> > + /* Create workqueue for expedited GPs and for Tree SRCU. */
> > + rcu_gp_workqueue = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
> > + WARN_ON(!rcu_gp_workqueue);
>
> The code was previously using both system_power_efficient_wq and
> system_wq (for the expedited path). I guess the options were
> either using two workqueues or dropping POWER_EFFICIENT. I have no
> idea how big an impact this will make or whether it'd even be
> noticeable but maybe it'd be worthwhile to mention that in the
> description?
Good point! How about if I change the last paragraph of the commit
log to read as follows?
Thanx, Paul
------------------------------------------------------------------------
This commit also causes SRCU to use this new RCU-specific
workqueue_struct. Note that SRCU's use of workqueues never blocks them
waiting for readers, so this should be safe from a forward-progress
viewpoint. Note that this moves SRCU from system_power_efficient_wq
to a normal workqueue. In the unlikely event that this results in
measurable degradation, a separate power-efficient workqueue will be
created for SRCU.
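Should that fallback ever be needed, one plausible shape for it, purely as an illustration (the name srcu_gp_workqueue and its creation alongside rcu_gp_workqueue are assumptions, not part of the patch above), would be a second reclaim-capable workqueue carrying WQ_POWER_EFFICIENT and used only by SRCU's delayed work:

	/* Hypothetical: a separate power-efficient workqueue for SRCU only. */
	struct workqueue_struct *srcu_gp_workqueue;

	/* In rcu_init(), next to the rcu_gp workqueue creation: */
	srcu_gp_workqueue = alloc_workqueue("srcu_gp",
					    WQ_MEM_RECLAIM | WQ_POWER_EFFICIENT, 0);
	WARN_ON(!srcu_gp_workqueue);

The SRCU call sites in srcutree.c would then queue onto srcu_gp_workqueue, while the expedited grace-period path kept using rcu_gp_workqueue.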
Hello, Paul.
On Tue, Jan 09, 2018 at 07:21:12AM -0800, Paul E. McKenney wrote:
> > The code was previously using both system_power_efficient_wq and
> > system_wq (for the expedited path). I guess the options were
> > either using two workqueues or dropping POWER_EFFICIENT. I have no
> > idea how big an impact this will make or whether it'd even be
> > noticeable but maybe it'd be worthwhile to mention that in the
> > description?
>
> Good point! How about if I change the last paragraph of the commit
> log to read as follows?
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> This commit also causes SRCU to use this new RCU-specific
> workqueue_struct. Note that SRCU's use of workqueues never blocks them
> waiting for readers, so this should be safe from a forward-progress
> viewpoint. Note that this moves SRCU from system_power_efficient_wq
> to a normal workqueue. In the unlikely event that this results in
> measurable degradation, a separate power-efficient workqueue will be
> created for SRCU.
Sounds good. Please feel free to add
Acked-by: Tejun Heo <[email protected]>
Thanks.
--
tejun
On Tue, Jan 09, 2018 at 07:37:52AM -0800, Tejun Heo wrote:
> Hello, Paul.
>
> On Tue, Jan 09, 2018 at 07:21:12AM -0800, Paul E. McKenney wrote:
> > > The code was previously using both system_power_efficient_wq and
> > > system_wq (for the expedited path). I guess the options were
> > > either using two workqueues or dropping POWER_EFFICIENT. I have no
> > > idea how big an impact this will make or whether it'd even be
> > > noticeable but maybe it'd be worthwhile to mention that in the
> > > description?
> >
> > Good point! How about if I change the last paragraph of the commit
> > log to read as follows?
> >
> > Thanx, Paul
> >
> > ------------------------------------------------------------------------
> >
> > This commit also causes SRCU to use this new RCU-specific
> > workqueue_struct. Note that SRCU's use of workqueues never blocks them
> > waiting for readers, so this should be safe from a forward-progress
> > viewpoint. Note that this moves SRCU from system_power_efficient_wq
> > to a normal workqueue. In the unlikely event that this results in
> > measurable degradation, a separate power-efficient workqueue will be
> > created for SRCU.
>
> Sounds good. Please feel free to add
>
> Acked-by: Tejun Heo <[email protected]>
Done, thank you!
Thanx, Paul
On Tue, Jan 09, 2018 at 08:00:22AM -0800, Paul E. McKenney wrote:
> On Tue, Jan 09, 2018 at 07:37:52AM -0800, Tejun Heo wrote:
> > Hello, Paul.
> >
> > On Tue, Jan 09, 2018 at 07:21:12AM -0800, Paul E. McKenney wrote:
> > > > The code was previously using both system_power_efficient_wq and
> > > > system_wq (for the expedited path). I guess the options were
> > > > either using two workqueues or dropping POWER_EFFICIENT. I have no
> > > > idea how big an impact this will make or whether it'd even be
> > > > noticeable but maybe it'd be worthwhile to mention that in the
> > > > description?
> > >
> > > Good point! How about if I change the last paragraph of the commit
> > > log to read as follows?
> > >
> > > Thanx, Paul
> > >
> > > ------------------------------------------------------------------------
> > >
> > > This commit also causes SRCU to use this new RCU-specific
> > > workqueue_struct. Note that SRCU's use of workqueues never blocks them
> > > waiting for readers, so this should be safe from a forward-progress
> > > viewpoint. Note that this moves SRCU from system_power_efficient_wq
> > > to a normal workqueue. In the unlikely event that this results in
> > > measurable degradation, a separate power-efficient workqueue will be
> > > created for SRCU.
> >
> > Sounds good. Please feel free to add
> >
> > Acked-by: Tejun Heo <[email protected]>
>
> Done, thank you!
And one additional question... How are we pushing this upstream? By
default, I would push things starting this late into the merge window
following the next one (v4.17), but would be more than willing to make
an exception given that this fixes a valid real-world complaint.
For that matter, if you would rather push my commit along with your pair
of commits, that works for me! Either way, please just let me know.
Thanx, Paul
Hello, Paul.
On Wed, Jan 10, 2018 at 12:08:21PM -0800, Paul E. McKenney wrote:
> And one additional question... How are we pushing this upstream? By
> default, I would push things starting this late into the merge window
> following the next one (v4.17), but would be more than willing to make
> an exception given that this fixes a valid real-world complaint.
I put the workqueue part in the fixes branch. If there's gonna be
another rc, I'm gonna push it to Linus. If not, I'll push when the
next window opens. I think either way should be fine.
> For that matter, if you would rather push my commit along with your pair
> of commits, that works for me! Either way, please just let me know.
Oh, let's push these through their respective trees. Please take the
RCU one through the usual RCU tree.
Thank you very much.
--
tejun
On Wed, Jan 10, 2018 at 01:41:01PM -0800, Tejun Heo wrote:
> Hello, Paul.
>
> On Wed, Jan 10, 2018 at 12:08:21PM -0800, Paul E. McKenney wrote:
> > And one additional question... How are we pushing this upstream? By
> > default, I would push things starting this late into the merge window
> > following the next one (v4.17), but would be more than willing to make
> > an exception given that this fixes a valid real-world complaint.
>
> I put the workqueue part in the fixes branch. If there's gonna be
> another rc, I'm gonna push it to Linus. If not, I'll push when the
> next window opens. I think either way should be fine.
>
> > For that matter, if you would rather push my commit along with your pair
> > of commits, that works for me! Either way, please just let me know.
>
> Oh, let's push these through their respective trees. Please take the
> RCU one through the usual RCU tree.
Works for me!
> Thank you very much.
And to you!
Thanx, Paul
On 01/02/2018 09:46 PM, Tejun Heo wrote:
> Hello,
>
> On Fri, Dec 29, 2017 at 02:07:16AM +0530, Prateek Sood wrote:
>> task T is waiting for cpuset_mutex acquired
>> by kworker/2:1
>>
>> sh ==> cpuhp/2 ==> kworker/2:1 ==> sh
>>
>> kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1
>>
>> It seems that my earlier patch set should fix this scenario:
>> 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock.
>> 2) Make cpuset hotplug work synchronous.
>>
>> Could you please share your feedback?
>
> Hmm... this can also be resolved by adding WQ_MEM_RECLAIM to the
> synchronize rcu workqueue, right? Given the wide-spread usages of
> synchronize_rcu and friends, maybe that's the right solution, or at
> least something we also need to do, for this particular deadlock?
>
> Again, I don't have anything against making the domain rebuilding part
> of cpuset operations synchronous and these tricky deadlock scenarios
> do indicate that doing so would probably be beneficial. That said,
> tho, these scenarios seem more of manifestations of other problems
> exposed through kthreadd dependency than anything else.
>
> Thanks.
>
Hi TJ,
Thanks for suggesting the WQ_MEM_RECLAIM solution.
My understanding of WQ_MEM_RECLAIM was that it needs to be used for
cases where memory pressure could cause deadlocks.
In this case it does not seem to be a memory pressure issue.
Is overloading WQ_MEM_RECLAIM as a solution to another problem the
correct approach?
This scenario can be resolved by using WQ_MEM_RECLAIM and a separate
workqueue for rcu. But there seems to be a possibility of the same
problem recurring in the future if any CPU hotplug callbacks use other
predefined workqueues that do not have the WQ_MEM_RECLAIM option.
Please let me know your feedback on this.
Thanks
Hello, Prateek.
On Mon, Jan 15, 2018 at 05:32:18PM +0530, Prateek Sood wrote:
> My understanding of WQ_MEM_RECLAIM was that it needs to be used for
> cases where memory pressure could cause deadlocks.
Yes, that is the primary role; however, there are a couple places
where we need it to isolate a low level subsystem's forward progress
from the dependencies of the dynamic worker pool management.
> In this case it does not seem to be a memory pressure issue.
> Is overloading WQ_MEM_RECLAIM as a solution to another problem the
> correct approach?
In general, we don't face this issue but for things like RCU, I do
believe this is the right approach. The name of the flag is a bit of a
misnomer for those cases, as it only describes one of the dependencies
of worker pool management. I don't think it's an actual problem given
that the vast majority of the usages are protecting against memory
dependencies.
> This scenario can be resolved by using WQ_MEM_RECLAIM and a separate
> workqueue for rcu. But there seems to be a possibility of the same
> problem recurring in the future if any CPU hotplug callbacks use other
> predefined workqueues that do not have the WQ_MEM_RECLAIM option.
We want to isolate from this sort of dependency anyway, especially
given how widely and deeply we depend on RCU these days. Maybe we will
develop a justified case in the future, but at least up until now every
case around this issue seems to be something to be fixed on its own.
Thanks.
--
tejun