I am very sorry. My gcc version is 7.5 and it does not report error.
We have a deadlock problem which can be solved by commit 4f7e7236435ca
("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock").
However, it makes lock order of cpus_read_lock and cpuset_mutex
wrong in v4.19. The call sequence is as follows:
cgroup_procs_write()
cgroup_procs_write_start()
get_online_cpus(); // cpus_read_lock()
percpu_down_write(&cgroup_threadgroup_rwsem)
cgroup_attach_task
cgroup_migrate
cgroup_migrate_execute
ss->attach (cpust_attach)
mutex_lock(&cpuset_mutex)
it seems hard to make cpus_read_lock is locked before
cgroup_threadgroup_rwsem and cpuset_mutex is locked before
cpus_read_lock unless backport the commit d74b27d63a8beb
("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order")
Changes in v2:
* Add #include <linux/cpu.h> in kernel/cgroup/cgroup.c to
avoid some compile error.
* Exchange get_online_cpus() location in cpuset_attach to
keep cpu_hotplug_lock->cpuset_mutex order, although it will
be remove by ("cgroup: Fix threadgroup_rwsem <->
cpus_read_lock() deadlock")
Juri Lelli (1):
cgroup/cpuset: Change cpuset_rwsem and hotplug lock order
Tejun Heo (1):
cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock
Tetsuo Handa (1):
cgroup: Add missing cpus_read_lock() to cgroup_attach_task_all()
include/linux/cpuset.h | 8 +++----
kernel/cgroup/cgroup-v1.c | 3 +++
kernel/cgroup/cgroup.c | 50 +++++++++++++++++++++++++++++++++++----
kernel/cgroup/cpuset.c | 25 ++++++++++++--------
4 files changed, 67 insertions(+), 19 deletions(-)
--
2.17.1
From: Tejun Heo <[email protected]>
commit 4f7e7236435ca0abe005c674ebd6892c6e83aeb3 upstream.
Add #include <linux/cpu.h> to avoid compile error on some architectures.
commit 9a3284fad42f6 ("cgroup: Optimize single thread migration") and
commit 671c11f0619e5 ("cgroup: Elide write-locking threadgroup_rwsem
when updating csses on an empty subtree") are not backport. So ignore the
input parameter of cgroup_attach_lock/cgroup_attach_unlock.
original commit message:
Bringing up a CPU may involve creating and destroying tasks which requires
read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
cpus_read_lock(). However, cpuset's ->attach(), which may be called with
thredagroup_rwsem write-locked, also wants to disable CPU hotplug and
acquires cpus_read_lock(), leading to a deadlock.
Fix it by guaranteeing that ->attach() is always called with CPU hotplug
disabled and removing cpus_read_lock() call from cpuset_attach().
Signed-off-by: Tejun Heo <[email protected]>
Reviewed-and-tested-by: Imran Khan <[email protected]>
Reported-and-tested-by: Xuewen Yan <[email protected]>
Fixes: 05c7b7a92cc8 ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug")
Cc: [email protected] # v5.17+
Signed-off-by: Cai Xinchen <[email protected]>
---
v2:
- Add #include <linux/cpu.h> in kernel/cgroup/cgroup.c to avoid compile
error on some architectures
---
kernel/cgroup/cgroup.c | 50 +++++++++++++++++++++++++++++++++++++-----
kernel/cgroup/cpuset.c | 7 +-----
2 files changed, 46 insertions(+), 11 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a892a99eb4bf..a8185cdb8587 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,7 @@
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/sched/cputime.h>
+#include <linux/cpu.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -2209,6 +2210,45 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
+/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+static void cgroup_attach_lock(void)
+{
+ get_online_cpus();
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+static void cgroup_attach_unlock(void)
+{
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ put_online_cpus();
+}
+
/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
@@ -2694,7 +2734,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return ERR_PTR(-EINVAL);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock();
rcu_read_lock();
if (pid) {
@@ -2725,7 +2765,7 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
goto out_unlock_rcu;
out_unlock_threadgroup:
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock();
out_unlock_rcu:
rcu_read_unlock();
return tsk;
@@ -2740,7 +2780,7 @@ void cgroup_procs_write_finish(struct task_struct *task)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock();
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2799,7 +2839,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock();
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
@@ -2830,7 +2870,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock();
return ret;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 7169e47fb48b..c6d412cebc43 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1528,11 +1528,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- /*
- * It should hold cpus lock because a cpu offline event can
- * cause set_cpus_allowed_ptr() failed.
- */
- get_online_cpus();
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
mutex_lock(&cpuset_mutex);
/* prepare for attach */
@@ -1588,7 +1584,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
mutex_unlock(&cpuset_mutex);
- put_online_cpus();
}
/* The various types of files and directories in a cpuset file system */
--
2.17.1
From: Juri Lelli <[email protected]>
commit d74b27d63a8bebe2fe634944e4ebdc7b10db7a39 upstream.
commit 1243dc518c9da ("cgroup/cpuset: Convert cpuset_mutex to
percpu_rwsem") is performance patch which is not backport. So
convert percpu_rwsem to cpuset_mutex.
commit aa44002e7db25 ("cpuset: Fix unsafe lock order between
cpuset lock and cpuslock") makes lock order keep cpuset_mutex
->cpu_hotplug_lock. We should change lock order in cpuset_attach.
original commit message:
cpuset_rwsem is going to be acquired from sched_setscheduler() with a
following patch. There are however paths (e.g., spawn_ksoftirqd) in
which sched_scheduler() is eventually called while holding hotplug lock;
this creates a dependecy between hotplug lock (to be always acquired
first) and cpuset_rwsem (to be always acquired after hotplug lock).
Fix paths which currently take the two locks in the wrong order (after
a following patch is applied).
Tested-by: Dietmar Eggemann <[email protected]>
Signed-off-by: Juri Lelli <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Cai Xinchen <[email protected]>
---
v2:
* Change get_online_cpus/put_online_cpus lock order in cpuset_attach
to keep cpuset_mutex and hotplug lock order
---
include/linux/cpuset.h | 8 ++++----
kernel/cgroup/cpuset.c | 24 +++++++++++++++++-------
2 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 934633a05d20..7f1478c26a33 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)
static inline void cpuset_inc(void)
{
- static_branch_inc(&cpusets_pre_enable_key);
- static_branch_inc(&cpusets_enabled_key);
+ static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
+ static_branch_inc_cpuslocked(&cpusets_enabled_key);
}
static inline void cpuset_dec(void)
{
- static_branch_dec(&cpusets_enabled_key);
- static_branch_dec(&cpusets_pre_enable_key);
+ static_branch_dec_cpuslocked(&cpusets_enabled_key);
+ static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
}
extern int cpuset_init(void);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index dcd5755b1fe2..7169e47fb48b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -830,8 +830,8 @@ static void rebuild_sched_domains_locked(void)
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_cpus_held();
lockdep_assert_held(&cpuset_mutex);
- get_online_cpus();
/*
* We have raced with CPU hotplug. Don't do anything to avoid
@@ -839,15 +839,13 @@ static void rebuild_sched_domains_locked(void)
* Anyways, hotplug work item will rebuild sched domains.
*/
if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-out:
- put_online_cpus();
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
@@ -857,9 +855,11 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}
/**
@@ -1528,13 +1528,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
-
/*
* It should hold cpus lock because a cpu offline event can
* cause set_cpus_allowed_ptr() failed.
*/
get_online_cpus();
+ mutex_lock(&cpuset_mutex);
+
/* prepare for attach */
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1553,7 +1553,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, task);
}
- put_online_cpus();
/*
* Change mm for all threadgroup leaders. This is expensive and may
@@ -1589,6 +1588,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}
/* The various types of files and directories in a cpuset file system */
@@ -1617,6 +1617,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1654,6 +1655,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}
@@ -1664,6 +1666,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1678,6 +1681,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}
@@ -1716,6 +1720,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1741,6 +1746,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_trial_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -1985,6 +1991,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
set_bit(CS_ONLINE, &cs->flags);
@@ -2035,6 +2042,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return 0;
}
@@ -2048,6 +2056,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs))
@@ -2057,6 +2066,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
clear_bit(CS_ONLINE, &cs->flags);
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
--
2.17.1
On Mon, Mar 20, 2023 at 01:15:04AM +0000, Cai Xinchen wrote:
> I am very sorry. My gcc version is 7.5 and it does not report error.
>
> We have a deadlock problem which can be solved by commit 4f7e7236435ca
> ("cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock").
> However, it makes lock order of cpus_read_lock and cpuset_mutex
> wrong in v4.19. The call sequence is as follows:
> cgroup_procs_write()
> cgroup_procs_write_start()
> get_online_cpus(); // cpus_read_lock()
> percpu_down_write(&cgroup_threadgroup_rwsem)
> cgroup_attach_task
> cgroup_migrate
> cgroup_migrate_execute
> ss->attach (cpust_attach)
> mutex_lock(&cpuset_mutex)
> it seems hard to make cpus_read_lock is locked before
> cgroup_threadgroup_rwsem and cpuset_mutex is locked before
> cpus_read_lock unless backport the commit d74b27d63a8beb
> ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order")
>
> Changes in v2:
> * Add #include <linux/cpu.h> in kernel/cgroup/cgroup.c to
> avoid some compile error.
> * Exchange get_online_cpus() location in cpuset_attach to
> keep cpu_hotplug_lock->cpuset_mutex order, although it will
> be remove by ("cgroup: Fix threadgroup_rwsem <->
> cpus_read_lock() deadlock")
>
> Juri Lelli (1):
> cgroup/cpuset: Change cpuset_rwsem and hotplug lock order
>
> Tejun Heo (1):
> cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock
>
> Tetsuo Handa (1):
> cgroup: Add missing cpus_read_lock() to cgroup_attach_task_all()
>
> include/linux/cpuset.h | 8 +++----
> kernel/cgroup/cgroup-v1.c | 3 +++
> kernel/cgroup/cgroup.c | 50 +++++++++++++++++++++++++++++++++++----
> kernel/cgroup/cpuset.c | 25 ++++++++++++--------
> 4 files changed, 67 insertions(+), 19 deletions(-)
Now queued up, thanks.
greg k-h