As a rwlock for tasklist_lock, there are multiple scenarios to acquire
read lock which write lock needed to be waiting for.
In freeze_process/thaw_processes it can take about 200+ms for holding read
lock of tasklist_lock by walking and freezing/thawing tasks in commercial
devices. And write_lock_irq will have preempt disabled and local irq
disabled to spin until the tasklist_lock can be acquired. This leading to
a bad responsive performance of current system.
Take an example:
1. cpu0 is holding read lock of tasklist_lock to thaw_processes.
2. cpu1 is waiting write lock of tasklist_lock to exec a new thread with
preempt_disabled and local irq disabled.
3. cpu2 is waiting write lock of tasklist_lock to do_exit with
preempt_disabled and local irq disabled.
4. cpu3 is waiting write lock of tasklist_lock to do_exit with
preempt_disabled and local irq disabled.
So introduce a write lock/unlock wrapper for tasklist_lock specificly.
The current taskslist_lock writers all have write_lock_irq to hold
tasklist_lock, and write_unlock_irq to release tasklist_lock, that means
the writers are not suitable or workable to wait on tasklist_lock in irq
disabled scenarios. So the write lock/unlock wrapper here only follow the
current design of directly use local_irq_disable and local_irq_enable,
and not take already irq disabled writer callers into account.
Use write_trylock in the loop and enabled irq for cpu to repsond if lock
cannot be taken.
Signed-off-by: Maria Yu <[email protected]>
---
fs/exec.c | 10 +++++-----
include/linux/sched/task.h | 29 +++++++++++++++++++++++++++++
kernel/exit.c | 16 ++++++++--------
kernel/fork.c | 6 +++---
kernel/ptrace.c | 12 ++++++------
kernel/sys.c | 8 ++++----
security/keys/keyctl.c | 4 ++--
7 files changed, 57 insertions(+), 28 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 4aa19b24f281..030eef6852eb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1086,7 +1086,7 @@ static int de_thread(struct task_struct *tsk)
for (;;) {
cgroup_threadgroup_change_begin(tsk);
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
/*
* Do this under tasklist_lock to ensure that
* exit_notify() can't miss ->group_exec_task
@@ -1095,7 +1095,7 @@ static int de_thread(struct task_struct *tsk)
if (likely(leader->exit_state))
break;
__set_current_state(TASK_KILLABLE);
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
cgroup_threadgroup_change_end(tsk);
schedule();
if (__fatal_signal_pending(tsk))
@@ -1150,7 +1150,7 @@ static int de_thread(struct task_struct *tsk)
*/
if (unlikely(leader->ptrace))
__wake_up_parent(leader, leader->parent);
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
cgroup_threadgroup_change_end(tsk);
release_task(leader);
@@ -1198,13 +1198,13 @@ static int unshare_sighand(struct task_struct *me)
refcount_set(&newsighand->count, 1);
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
spin_lock(&oldsighand->siglock);
memcpy(newsighand->action, oldsighand->action,
sizeof(newsighand->action));
rcu_assign_pointer(me->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
__cleanup_sighand(oldsighand);
}
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a23af225c898..6f69d9a3c868 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -50,6 +50,35 @@ struct kernel_clone_args {
* a separate lock).
*/
extern rwlock_t tasklist_lock;
+
+/*
+ * Tasklist_lock is a special lock, it takes a good amount of time of
+ * taskslist_lock readers to finish, and the pure write_irq_lock api
+ * will do local_irq_disable at the very first, and put the current cpu
+ * waiting for the lock while is non-responsive for interrupts.
+ *
+ * The current taskslist_lock writers all have write_lock_irq to hold
+ * tasklist_lock, and write_unlock_irq to release tasklist_lock, that
+ * means the writers are not suitable or workable to wait on
+ * tasklist_lock in irq disabled scenarios. So the write lock/unlock
+ * wrapper here only follow the current design of directly use
+ * local_irq_disable and local_irq_enable.
+ */
+static inline void write_lock_tasklist_lock(void)
+{
+ while (1) {
+ local_irq_disable();
+ if (write_trylock(&tasklist_lock))
+ break;
+ local_irq_enable();
+ cpu_relax();
+ }
+}
+static inline void write_unlock_tasklist_lock(void)
+{
+ write_unlock_irq(&tasklist_lock);
+}
+
extern spinlock_t mmlist_lock;
extern union thread_union init_thread_union;
diff --git a/kernel/exit.c b/kernel/exit.c
index ee9f43bed49a..18b00f477079 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -251,7 +251,7 @@ void release_task(struct task_struct *p)
cgroup_release(p);
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
ptrace_release_task(p);
thread_pid = get_pid(p->thread_pid);
__exit_signal(p);
@@ -275,7 +275,7 @@ void release_task(struct task_struct *p)
leader->exit_state = EXIT_DEAD;
}
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
seccomp_filter_release(p);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
@@ -598,7 +598,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
return reaper;
}
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
@@ -606,7 +606,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
}
zap_pid_ns_processes(pid_ns);
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
return father;
}
@@ -730,7 +730,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
struct task_struct *p, *n;
LIST_HEAD(dead);
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
forget_original_parent(tsk, &dead);
if (group_dead)
@@ -758,7 +758,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exec_task);
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
@@ -1172,7 +1172,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
wo->wo_stat = status;
if (state == EXIT_TRACE) {
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
@@ -1181,7 +1181,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (do_notify_parent(p, p->exit_signal))
state = EXIT_DEAD;
p->exit_state = state;
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
}
if (state == EXIT_DEAD)
release_task(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 10917c3e1f03..06c4b4ab9102 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2623,7 +2623,7 @@ __latent_entropy struct task_struct *copy_process(
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
@@ -2714,7 +2714,7 @@ __latent_entropy struct task_struct *copy_process(
hlist_del_init(&delayed.node);
spin_unlock(¤t->sighand->siglock);
syscall_tracepoint_update(p);
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
if (pidfile)
fd_install(pidfd, pidfile);
@@ -2735,7 +2735,7 @@ __latent_entropy struct task_struct *copy_process(
bad_fork_cancel_cgroup:
sched_core_free(p);
spin_unlock(¤t->sighand->siglock);
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d8b5e13a2229..a8d7e2d06f3e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -435,7 +435,7 @@ static int ptrace_attach(struct task_struct *task, long request,
if (retval)
goto unlock_creds;
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
retval = -EPERM;
if (unlikely(task->exit_state))
goto unlock_tasklist;
@@ -479,7 +479,7 @@ static int ptrace_attach(struct task_struct *task, long request,
retval = 0;
unlock_tasklist:
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
@@ -508,7 +508,7 @@ static int ptrace_traceme(void)
{
int ret = -EPERM;
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
/* Are we already being traced? */
if (!current->ptrace) {
ret = security_ptrace_traceme(current->parent);
@@ -522,7 +522,7 @@ static int ptrace_traceme(void)
ptrace_link(current, current->real_parent);
}
}
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
return ret;
}
@@ -588,7 +588,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
/* Architecture-specific hardware disable .. */
ptrace_disable(child);
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
/*
* We rely on ptrace_freeze_traced(). It can't be killed and
* untraced by another thread, it can't be a zombie.
@@ -600,7 +600,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
*/
child->exit_code = data;
__ptrace_detach(current, child);
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
proc_ptrace_connector(child, PTRACE_DETACH);
diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d..0b1647d3ed32 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1088,7 +1088,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
/* From this point forward we keep holding onto the tasklist lock
* so that our parent does not change from under us. -DaveM
*/
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
err = -ESRCH;
p = find_task_by_vpid(pid);
@@ -1136,7 +1136,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
err = 0;
out:
/* All paths lead to here, thus we are safe. -DaveM */
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
rcu_read_unlock();
return err;
}
@@ -1229,7 +1229,7 @@ int ksys_setsid(void)
pid_t session = pid_vnr(sid);
int err = -EPERM;
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
/* Fail if I am already a session leader */
if (group_leader->signal->leader)
goto out;
@@ -1247,7 +1247,7 @@ int ksys_setsid(void)
err = session;
out:
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
if (err > 0) {
proc_sid_connector(group_leader);
sched_autogroup_create_attach(group_leader);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 19be69fa4d05..dd8aed20486a 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1652,7 +1652,7 @@ long keyctl_session_to_parent(void)
me = current;
rcu_read_lock();
- write_lock_irq(&tasklist_lock);
+ write_lock_tasklist_lock();
ret = -EPERM;
oldwork = NULL;
@@ -1702,7 +1702,7 @@ long keyctl_session_to_parent(void)
if (!ret)
newwork = NULL;
unlock:
- write_unlock_irq(&tasklist_lock);
+ write_unlock_tasklist_lock();
rcu_read_unlock();
if (oldwork)
put_cred(container_of(oldwork, struct cred, rcu));
base-commit: 88035e5694a86a7167d490bb95e9df97a9bb162b
--
2.17.1
On 12/25/2023 4:19 PM, Maria Yu wrote:
> As a rwlock for tasklist_lock, there are multiple scenarios to acquire
> read lock which write lock needed to be waiting for.
> In freeze_process/thaw_processes it can take about 200+ms for holding read
> lock of tasklist_lock by walking and freezing/thawing tasks in commercial
> devices. And write_lock_irq will have preempt disabled and local irq
> disabled to spin until the tasklist_lock can be acquired. This leading to
> a bad responsive performance of current system.
> Take an example:
> 1. cpu0 is holding read lock of tasklist_lock to thaw_processes.
> 2. cpu1 is waiting write lock of tasklist_lock to exec a new thread with
> preempt_disabled and local irq disabled.
> 3. cpu2 is waiting write lock of tasklist_lock to do_exit with
> preempt_disabled and local irq disabled.
> 4. cpu3 is waiting write lock of tasklist_lock to do_exit with
> preempt_disabled and local irq disabled.
> So introduce a write lock/unlock wrapper for tasklist_lock specificly.
> The current taskslist_lock writers all have write_lock_irq to hold
> tasklist_lock, and write_unlock_irq to release tasklist_lock, that means
> the writers are not suitable or workable to wait on tasklist_lock in irq
> disabled scenarios. So the write lock/unlock wrapper here only follow the
> current design of directly use local_irq_disable and local_irq_enable,
> and not take already irq disabled writer callers into account.
> Use write_trylock in the loop and enabled irq for cpu to repsond if lock
> cannot be taken.
Pls ignore this patch.
Change is not ready for review.
Re-send by mistake.
>
> Signed-off-by: Maria Yu <[email protected]>
> ---
> fs/exec.c | 10 +++++-----
> include/linux/sched/task.h | 29 +++++++++++++++++++++++++++++
> kernel/exit.c | 16 ++++++++--------
> kernel/fork.c | 6 +++---
> kernel/ptrace.c | 12 ++++++------
> kernel/sys.c | 8 ++++----
> security/keys/keyctl.c | 4 ++--
> 7 files changed, 57 insertions(+), 28 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 4aa19b24f281..030eef6852eb 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1086,7 +1086,7 @@ static int de_thread(struct task_struct *tsk)
>
> for (;;) {
> cgroup_threadgroup_change_begin(tsk);
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> /*
> * Do this under tasklist_lock to ensure that
> * exit_notify() can't miss ->group_exec_task
> @@ -1095,7 +1095,7 @@ static int de_thread(struct task_struct *tsk)
> if (likely(leader->exit_state))
> break;
> __set_current_state(TASK_KILLABLE);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> cgroup_threadgroup_change_end(tsk);
> schedule();
> if (__fatal_signal_pending(tsk))
> @@ -1150,7 +1150,7 @@ static int de_thread(struct task_struct *tsk)
> */
> if (unlikely(leader->ptrace))
> __wake_up_parent(leader, leader->parent);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> cgroup_threadgroup_change_end(tsk);
>
> release_task(leader);
> @@ -1198,13 +1198,13 @@ static int unshare_sighand(struct task_struct *me)
>
> refcount_set(&newsighand->count, 1);
>
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> spin_lock(&oldsighand->siglock);
> memcpy(newsighand->action, oldsighand->action,
> sizeof(newsighand->action));
> rcu_assign_pointer(me->sighand, newsighand);
> spin_unlock(&oldsighand->siglock);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
>
> __cleanup_sighand(oldsighand);
> }
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index a23af225c898..6f69d9a3c868 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -50,6 +50,35 @@ struct kernel_clone_args {
> * a separate lock).
> */
> extern rwlock_t tasklist_lock;
> +
> +/*
> + * Tasklist_lock is a special lock, it takes a good amount of time of
> + * taskslist_lock readers to finish, and the pure write_irq_lock api
> + * will do local_irq_disable at the very first, and put the current cpu
> + * waiting for the lock while is non-responsive for interrupts.
> + *
> + * The current taskslist_lock writers all have write_lock_irq to hold
> + * tasklist_lock, and write_unlock_irq to release tasklist_lock, that
> + * means the writers are not suitable or workable to wait on
> + * tasklist_lock in irq disabled scenarios. So the write lock/unlock
> + * wrapper here only follow the current design of directly use
> + * local_irq_disable and local_irq_enable.
> + */
> +static inline void write_lock_tasklist_lock(void)
> +{
> + while (1) {
> + local_irq_disable();
> + if (write_trylock(&tasklist_lock))
> + break;
> + local_irq_enable();
> + cpu_relax();
> + }
> +}
> +static inline void write_unlock_tasklist_lock(void)
> +{
> + write_unlock_irq(&tasklist_lock);
> +}
> +
> extern spinlock_t mmlist_lock;
>
> extern union thread_union init_thread_union;
> diff --git a/kernel/exit.c b/kernel/exit.c
> index ee9f43bed49a..18b00f477079 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -251,7 +251,7 @@ void release_task(struct task_struct *p)
>
> cgroup_release(p);
>
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> ptrace_release_task(p);
> thread_pid = get_pid(p->thread_pid);
> __exit_signal(p);
> @@ -275,7 +275,7 @@ void release_task(struct task_struct *p)
> leader->exit_state = EXIT_DEAD;
> }
>
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> seccomp_filter_release(p);
> proc_flush_pid(thread_pid);
> put_pid(thread_pid);
> @@ -598,7 +598,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
> return reaper;
> }
>
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
>
> list_for_each_entry_safe(p, n, dead, ptrace_entry) {
> list_del_init(&p->ptrace_entry);
> @@ -606,7 +606,7 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
> }
>
> zap_pid_ns_processes(pid_ns);
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
>
> return father;
> }
> @@ -730,7 +730,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
> struct task_struct *p, *n;
> LIST_HEAD(dead);
>
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> forget_original_parent(tsk, &dead);
>
> if (group_dead)
> @@ -758,7 +758,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
> /* mt-exec, de_thread() is waiting for group leader */
> if (unlikely(tsk->signal->notify_count < 0))
> wake_up_process(tsk->signal->group_exec_task);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
>
> list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
> list_del_init(&p->ptrace_entry);
> @@ -1172,7 +1172,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
> wo->wo_stat = status;
>
> if (state == EXIT_TRACE) {
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> /* We dropped tasklist, ptracer could die and untrace */
> ptrace_unlink(p);
>
> @@ -1181,7 +1181,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
> if (do_notify_parent(p, p->exit_signal))
> state = EXIT_DEAD;
> p->exit_state = state;
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> }
> if (state == EXIT_DEAD)
> release_task(p);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 10917c3e1f03..06c4b4ab9102 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -2623,7 +2623,7 @@ __latent_entropy struct task_struct *copy_process(
> * Make it visible to the rest of the system, but dont wake it up yet.
> * Need tasklist lock for parent etc handling!
> */
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
>
> /* CLONE_PARENT re-uses the old parent */
> if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
> @@ -2714,7 +2714,7 @@ __latent_entropy struct task_struct *copy_process(
> hlist_del_init(&delayed.node);
> spin_unlock(¤t->sighand->siglock);
> syscall_tracepoint_update(p);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
>
> if (pidfile)
> fd_install(pidfd, pidfile);
> @@ -2735,7 +2735,7 @@ __latent_entropy struct task_struct *copy_process(
> bad_fork_cancel_cgroup:
> sched_core_free(p);
> spin_unlock(¤t->sighand->siglock);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> cgroup_cancel_fork(p, args);
> bad_fork_put_pidfd:
> if (clone_flags & CLONE_PIDFD) {
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index d8b5e13a2229..a8d7e2d06f3e 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -435,7 +435,7 @@ static int ptrace_attach(struct task_struct *task, long request,
> if (retval)
> goto unlock_creds;
>
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> retval = -EPERM;
> if (unlikely(task->exit_state))
> goto unlock_tasklist;
> @@ -479,7 +479,7 @@ static int ptrace_attach(struct task_struct *task, long request,
>
> retval = 0;
> unlock_tasklist:
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> unlock_creds:
> mutex_unlock(&task->signal->cred_guard_mutex);
> out:
> @@ -508,7 +508,7 @@ static int ptrace_traceme(void)
> {
> int ret = -EPERM;
>
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> /* Are we already being traced? */
> if (!current->ptrace) {
> ret = security_ptrace_traceme(current->parent);
> @@ -522,7 +522,7 @@ static int ptrace_traceme(void)
> ptrace_link(current, current->real_parent);
> }
> }
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
>
> return ret;
> }
> @@ -588,7 +588,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
> /* Architecture-specific hardware disable .. */
> ptrace_disable(child);
>
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> /*
> * We rely on ptrace_freeze_traced(). It can't be killed and
> * untraced by another thread, it can't be a zombie.
> @@ -600,7 +600,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
> */
> child->exit_code = data;
> __ptrace_detach(current, child);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
>
> proc_ptrace_connector(child, PTRACE_DETACH);
>
> diff --git a/kernel/sys.c b/kernel/sys.c
> index e219fcfa112d..0b1647d3ed32 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1088,7 +1088,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
> /* From this point forward we keep holding onto the tasklist lock
> * so that our parent does not change from under us. -DaveM
> */
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
>
> err = -ESRCH;
> p = find_task_by_vpid(pid);
> @@ -1136,7 +1136,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
> err = 0;
> out:
> /* All paths lead to here, thus we are safe. -DaveM */
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> rcu_read_unlock();
> return err;
> }
> @@ -1229,7 +1229,7 @@ int ksys_setsid(void)
> pid_t session = pid_vnr(sid);
> int err = -EPERM;
>
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> /* Fail if I am already a session leader */
> if (group_leader->signal->leader)
> goto out;
> @@ -1247,7 +1247,7 @@ int ksys_setsid(void)
>
> err = session;
> out:
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> if (err > 0) {
> proc_sid_connector(group_leader);
> sched_autogroup_create_attach(group_leader);
> diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
> index 19be69fa4d05..dd8aed20486a 100644
> --- a/security/keys/keyctl.c
> +++ b/security/keys/keyctl.c
> @@ -1652,7 +1652,7 @@ long keyctl_session_to_parent(void)
>
> me = current;
> rcu_read_lock();
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
>
> ret = -EPERM;
> oldwork = NULL;
> @@ -1702,7 +1702,7 @@ long keyctl_session_to_parent(void)
> if (!ret)
> newwork = NULL;
> unlock:
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> rcu_read_unlock();
> if (oldwork)
> put_cred(container_of(oldwork, struct cred, rcu));
>
> base-commit: 88035e5694a86a7167d490bb95e9df97a9bb162b
--
Thx and BRs,
Aiqun(Maria) Yu
On Mon Dec 25, 2023 at 10:19 AM EET, Maria Yu wrote:
> As a rwlock for tasklist_lock, there are multiple scenarios to acquire
> read lock which write lock needed to be waiting for.
> In freeze_process/thaw_processes it can take about 200+ms for holding read
> lock of tasklist_lock by walking and freezing/thawing tasks in commercial
> devices. And write_lock_irq will have preempt disabled and local irq
> disabled to spin until the tasklist_lock can be acquired. This leading to
> a bad responsive performance of current system.
> Take an example:
> 1. cpu0 is holding read lock of tasklist_lock to thaw_processes.
> 2. cpu1 is waiting write lock of tasklist_lock to exec a new thread with
> preempt_disabled and local irq disabled.
> 3. cpu2 is waiting write lock of tasklist_lock to do_exit with
> preempt_disabled and local irq disabled.
> 4. cpu3 is waiting write lock of tasklist_lock to do_exit with
> preempt_disabled and local irq disabled.
> So introduce a write lock/unlock wrapper for tasklist_lock specificly.
> The current taskslist_lock writers all have write_lock_irq to hold
> tasklist_lock, and write_unlock_irq to release tasklist_lock, that means
> the writers are not suitable or workable to wait on tasklist_lock in irq
> disabled scenarios. So the write lock/unlock wrapper here only follow the
> current design of directly use local_irq_disable and local_irq_enable,
> and not take already irq disabled writer callers into account.
> Use write_trylock in the loop and enabled irq for cpu to repsond if lock
> cannot be taken.
>
> Signed-off-by: Maria Yu <[email protected]>
> ---
> fs/exec.c | 10 +++++-----
> include/linux/sched/task.h | 29 +++++++++++++++++++++++++++++
> kernel/exit.c | 16 ++++++++--------
> kernel/fork.c | 6 +++---
> kernel/ptrace.c | 12 ++++++------
> kernel/sys.c | 8 ++++----
> security/keys/keyctl.c | 4 ++--
> 7 files changed, 57 insertions(+), 28 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 4aa19b24f281..030eef6852eb 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1086,7 +1086,7 @@ static int de_thread(struct task_struct *tsk)
>
> for (;;) {
> cgroup_threadgroup_change_begin(tsk);
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> /*
> * Do this under tasklist_lock to ensure that
> * exit_notify() can't miss ->group_exec_task
> @@ -1095,7 +1095,7 @@ static int de_thread(struct task_struct *tsk)
> if (likely(leader->exit_state))
> break;
> __set_current_state(TASK_KILLABLE);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> cgroup_threadgroup_change_end(tsk);
> schedule();
> if (__fatal_signal_pending(tsk))
> @@ -1150,7 +1150,7 @@ static int de_thread(struct task_struct *tsk)
> */
> if (unlikely(leader->ptrace))
> __wake_up_parent(leader, leader->parent);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
> cgroup_threadgroup_change_end(tsk);
>
> release_task(leader);
> @@ -1198,13 +1198,13 @@ static int unshare_sighand(struct task_struct *me)
>
> refcount_set(&newsighand->count, 1);
>
> - write_lock_irq(&tasklist_lock);
> + write_lock_tasklist_lock();
> spin_lock(&oldsighand->siglock);
> memcpy(newsighand->action, oldsighand->action,
> sizeof(newsighand->action));
> rcu_assign_pointer(me->sighand, newsighand);
> spin_unlock(&oldsighand->siglock);
> - write_unlock_irq(&tasklist_lock);
> + write_unlock_tasklist_lock();
>
> __cleanup_sighand(oldsighand);
> }
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index a23af225c898..6f69d9a3c868 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -50,6 +50,35 @@ struct kernel_clone_args {
> * a separate lock).
> */
> extern rwlock_t tasklist_lock;
> +
> +/*
> + * Tasklist_lock is a special lock, it takes a good amount of time of
> + * taskslist_lock readers to finish, and the pure write_irq_lock api
> + * will do local_irq_disable at the very first, and put the current cpu
> + * waiting for the lock while is non-responsive for interrupts.
> + *
> + * The current taskslist_lock writers all have write_lock_irq to hold
> + * tasklist_lock, and write_unlock_irq to release tasklist_lock, that
> + * means the writers are not suitable or workable to wait on
> + * tasklist_lock in irq disabled scenarios. So the write lock/unlock
> + * wrapper here only follow the current design of directly use
> + * local_irq_disable and local_irq_enable.
> + */
> +static inline void write_lock_tasklist_lock(void)
> +{
> + while (1) {
> + local_irq_disable();
> + if (write_trylock(&tasklist_lock))
> + break;
> + local_irq_enable();
> + cpu_relax();
> + }
Maybe:
local_irq_disable();
while (!write_trylock(&tasklist_lock)) {
local_irq_enable();
cpu_relax();
local_irq_disable();
}
BR, Jarkko