2024-01-23 15:37:09

by Oleg Nesterov

Subject: [PATCH 0/3] fs/proc: do_task_stat: use sig->stats_lock

do_task_stat() has the same problem that getrusage() had before
getrusage-use-sig-stats_lock-rather-than-lock_task_sighand.patch.

3/3 depends on the patch above.

Oleg.

fs/proc/array.c | 66 ++++++++++++++++++++++++++++++++-------------------------
kernel/exit.c | 10 +++------
2 files changed, 40 insertions(+), 36 deletions(-)



2024-01-23 15:42:56

by Oleg Nesterov

Subject: [PATCH 1/3] fs/proc: do_task_stat: move thread_group_cputime_adjusted() outside of lock_task_sighand()

thread_group_cputime() does its own locking, so we can safely shift
thread_group_cputime_adjusted(), which does another for_each_thread
loop, outside of the ->siglock-protected section.

Not only does this remove for_each_thread() from the critical section
with irqs disabled, it also removes another case where stats_lock is
taken with siglock held. We want to remove this dependency, so that we
can then change the users of stats_lock to not disable irqs.
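
For illustration, a simplified sketch (not the actual kernel code) of
the nesting this change removes. thread_group_cputime() reads under
sig->stats_lock internally, so calling the _adjusted() variant inside
lock_task_sighand() nested stats_lock in the irq-disabled ->siglock
section:

	/* before: stats_lock taken under the irq-disabled siglock */
	if (lock_task_sighand(task, &flags)) {
		...
		thread_group_cputime_adjusted(task, &utime, &stime);
		/* -> thread_group_cputime() -> sig->stats_lock */
		...
		unlock_task_sighand(task, &flags);
	}

	/* after: the caller holds no locks around the call */
	if (lock_task_sighand(task, &flags)) {
		...
		unlock_task_sighand(task, &flags);
	}
	if (whole)
		thread_group_cputime_adjusted(task, &utime, &stime);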

Signed-off-by: Oleg Nesterov <[email protected]>
---
fs/proc/array.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ff08a8957552..45ba91863808 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,

sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = utime = stime = 0;
+ cutime = cstime = 0;
cgtime = gtime = 0;

if (lock_task_sighand(task, &flags)) {
@@ -546,7 +546,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,

min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
- thread_group_cputime_adjusted(task, &utime, &stime);
gtime += sig->gtime;

if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
@@ -562,10 +561,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,

if (permitted && (!whole || num_threads < 2))
wchan = !task_is_running(task);
- if (!whole) {
+
+ if (whole) {
+ thread_group_cputime_adjusted(task, &utime, &stime);
+ } else {
+ task_cputime_adjusted(task, &utime, &stime);
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_cputime_adjusted(task, &utime, &stime);
gtime = task_gtime(task);
}

--
2.25.1.362.g51ebf55


2024-01-23 15:43:10

by Oleg Nesterov

Subject: [PATCH 2/3] fs/proc: do_task_stat: use sig->stats_lock to gather the threads/children stats

lock_task_sighand() can trigger a hard lockup. If NR_CPUS threads call
do_task_stat() at the same time and the process has NR_THREADS threads,
each caller can spin with irqs disabled for O(NR_CPUS * NR_THREADS)
time.

Change do_task_stat() to use sig->stats_lock to gather the statistics
outside of the ->siglock-protected section; in the likely case this
code will run lockless.
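
For reference, a condensed sketch of the lockless-first read pattern
used in the hunk below; read_seqbegin_or_lock_irqsave(),
need_seqretry() and done_seqretry_irqrestore() are the existing
seqlock_t helpers:

	unsigned int seq = 1;
	unsigned long flags;

	do {
		/* seq becomes even (2) on the 1st, lockless pass, odd after */
		seq++;
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);

		/* snapshot sig->cutime, sig->cstime, etc. into locals */

	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);

With an even seq the helper only samples the sequence counter, so the
reads run without taking the lock; if a writer raced with us,
need_seqretry() returns true and forces one more pass with an odd seq,
which takes the lock and disables irqs for that pass only.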

Signed-off-by: Oleg Nesterov <[email protected]>
---
fs/proc/array.c | 58 +++++++++++++++++++++++++++----------------------
1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 45ba91863808..34a47fb0c57f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
int permitted;
struct mm_struct *mm;
unsigned long long start_time;
- unsigned long cmin_flt = 0, cmaj_flt = 0;
- unsigned long min_flt = 0, maj_flt = 0;
- u64 cutime, cstime, utime, stime;
- u64 cgtime, gtime;
+ unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
+ u64 cutime, cstime, cgtime, utime, stime, gtime;
unsigned long rsslim = 0;
unsigned long flags;
int exit_code = task->exit_code;
+ struct signal_struct *sig = task->signal;
+ unsigned int seq = 1;

state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,

sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = 0;
- cgtime = gtime = 0;

if (lock_task_sighand(task, &flags)) {
- struct signal_struct *sig = task->signal;
-
if (sig->tty) {
struct pid *pgrp = tty_get_pgrp(sig->tty);
tty_pgrp = pid_nr_ns(pgrp, ns);
@@ -527,27 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
num_threads = get_nr_threads(task);
collect_sigign_sigcatch(task, &sigign, &sigcatch);

- cmin_flt = sig->cmin_flt;
- cmaj_flt = sig->cmaj_flt;
- cutime = sig->cutime;
- cstime = sig->cstime;
- cgtime = sig->cgtime;
rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);

- /* add up live thread stats at the group level */
if (whole) {
- struct task_struct *t;
-
- __for_each_thread(sig, t) {
- min_flt += t->min_flt;
- maj_flt += t->maj_flt;
- gtime += task_gtime(t);
- }
-
- min_flt += sig->min_flt;
- maj_flt += sig->maj_flt;
- gtime += sig->gtime;
-
if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
exit_code = sig->group_exit_code;
}
@@ -562,6 +540,34 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (permitted && (!whole || num_threads < 2))
wchan = !task_is_running(task);

+ do {
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+
+ cmin_flt = sig->cmin_flt;
+ cmaj_flt = sig->cmaj_flt;
+ cutime = sig->cutime;
+ cstime = sig->cstime;
+ cgtime = sig->cgtime;
+
+ if (whole) {
+ struct task_struct *t;
+
+ min_flt = sig->min_flt;
+ maj_flt = sig->maj_flt;
+ gtime = sig->gtime;
+
+ rcu_read_lock();
+ __for_each_thread(sig, t) {
+ min_flt += t->min_flt;
+ maj_flt += t->maj_flt;
+ gtime += task_gtime(t);
+ }
+ rcu_read_unlock();
+ }
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+
if (whole) {
thread_group_cputime_adjusted(task, &utime, &stime);
} else {
--
2.25.1.362.g51ebf55


2024-01-23 15:43:27

by Oleg Nesterov

Subject: [PATCH 3/3] exit: wait_task_zombie: kill the no longer necessary spin_lock_irq(siglock)

After the recent changes nobody uses siglock to read the values
protected by stats_lock, so we can kill
spin_lock_irq(&current->sighand->siglock) and update the comment.

With this patch, only __exit_signal() and thread_group_start_cputime()
take stats_lock under siglock.
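
On the write side the seqlock alone now provides the needed exclusion;
a minimal sketch of the pairing, matching the wait_task_zombie() hunk
below:

	write_seqlock_irq(&psig->stats_lock);	/* irqs off, seqcount odd */
	psig->cutime += tgutime + sig->cutime;
	/* ... accumulate the remaining psig-> fields ... */
	write_sequnlock_irq(&psig->stats_lock);	/* seqcount even again */

A lockless reader that overlaps this window sees an odd or changed
sequence count and retries, so dropping ->siglock here does not let
readers observe a half-updated snapshot.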

Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/exit.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 3988a02efaef..dfb963d2f862 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1127,17 +1127,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* and nobody can change them.
*
* psig->stats_lock also protects us from our sub-threads
- * which can reap other children at the same time. Until
- * we change k_getrusage()-like users to rely on this lock
- * we have to take ->siglock as well.
+ * which can reap other children at the same time.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&current->sighand->siglock);
- write_seqlock(&psig->stats_lock);
+ write_seqlock_irq(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1160,8 +1157,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- write_sequnlock(&psig->stats_lock);
- spin_unlock_irq(&current->sighand->siglock);
+ write_sequnlock_irq(&psig->stats_lock);
}

if (wo->wo_rusage)
--
2.25.1.362.g51ebf55


2024-01-23 23:48:52

by Dylan Hatch

Subject: Re: [PATCH 1/3] fs/proc: do_task_stat: move thread_group_cputime_adjusted() outside of lock_task_sighand()

On Tue, Jan 23, 2024 at 7:35 AM Oleg Nesterov <[email protected]> wrote:
>
> thread_group_cputime() does its own locking, so we can safely shift
> thread_group_cputime_adjusted(), which does another for_each_thread
> loop, outside of the ->siglock-protected section.
>
> Not only does this remove for_each_thread() from the critical section
> with irqs disabled, it also removes another case where stats_lock is
> taken with siglock held. We want to remove this dependency, so that we
> can then change the users of stats_lock to not disable irqs.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> fs/proc/array.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index ff08a8957552..45ba91863808 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -511,7 +511,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
>
> sigemptyset(&sigign);
> sigemptyset(&sigcatch);
> - cutime = cstime = utime = stime = 0;
> + cutime = cstime = 0;
> cgtime = gtime = 0;
>
> if (lock_task_sighand(task, &flags)) {
> @@ -546,7 +546,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
>
> min_flt += sig->min_flt;
> maj_flt += sig->maj_flt;
> - thread_group_cputime_adjusted(task, &utime, &stime);
> gtime += sig->gtime;
>
> if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
> @@ -562,10 +561,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
>
> if (permitted && (!whole || num_threads < 2))
> wchan = !task_is_running(task);
> - if (!whole) {
> +
> + if (whole) {
> + thread_group_cputime_adjusted(task, &utime, &stime);
> + } else {
> + task_cputime_adjusted(task, &utime, &stime);
> min_flt = task->min_flt;
> maj_flt = task->maj_flt;
> - task_cputime_adjusted(task, &utime, &stime);
> gtime = task_gtime(task);
> }
>
> --
> 2.25.1.362.g51ebf55
>

Signed-off-by: Dylan Hatch <[email protected]>

2024-01-23 23:50:50

by Dylan Hatch

Subject: Re: [PATCH 2/3] fs/proc: do_task_stat: use sig->stats_lock to gather the threads/children stats

On Tue, Jan 23, 2024 at 7:35 AM Oleg Nesterov <[email protected]> wrote:
>
> lock_task_sighand() can trigger a hard lockup. If NR_CPUS threads call
> do_task_stat() at the same time and the process has NR_THREADS threads,
> each caller can spin with irqs disabled for O(NR_CPUS * NR_THREADS)
> time.
>
> Change do_task_stat() to use sig->stats_lock to gather the statistics
> outside of the ->siglock-protected section; in the likely case this
> code will run lockless.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> fs/proc/array.c | 58 +++++++++++++++++++++++++++----------------------
> 1 file changed, 32 insertions(+), 26 deletions(-)
>
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 45ba91863808..34a47fb0c57f 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
> int permitted;
> struct mm_struct *mm;
> unsigned long long start_time;
> - unsigned long cmin_flt = 0, cmaj_flt = 0;
> - unsigned long min_flt = 0, maj_flt = 0;
> - u64 cutime, cstime, utime, stime;
> - u64 cgtime, gtime;
> + unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
> + u64 cutime, cstime, cgtime, utime, stime, gtime;
> unsigned long rsslim = 0;
> unsigned long flags;
> int exit_code = task->exit_code;
> + struct signal_struct *sig = task->signal;
> + unsigned int seq = 1;
>
> state = *get_task_state(task);
> vsize = eip = esp = 0;
> @@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
>
> sigemptyset(&sigign);
> sigemptyset(&sigcatch);
> - cutime = cstime = 0;
> - cgtime = gtime = 0;
>
> if (lock_task_sighand(task, &flags)) {
> - struct signal_struct *sig = task->signal;
> -
> if (sig->tty) {
> struct pid *pgrp = tty_get_pgrp(sig->tty);
> tty_pgrp = pid_nr_ns(pgrp, ns);
> @@ -527,27 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
> num_threads = get_nr_threads(task);
> collect_sigign_sigcatch(task, &sigign, &sigcatch);
>
> - cmin_flt = sig->cmin_flt;
> - cmaj_flt = sig->cmaj_flt;
> - cutime = sig->cutime;
> - cstime = sig->cstime;
> - cgtime = sig->cgtime;
> rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
>
> - /* add up live thread stats at the group level */
> if (whole) {
> - struct task_struct *t;
> -
> - __for_each_thread(sig, t) {
> - min_flt += t->min_flt;
> - maj_flt += t->maj_flt;
> - gtime += task_gtime(t);
> - }
> -
> - min_flt += sig->min_flt;
> - maj_flt += sig->maj_flt;
> - gtime += sig->gtime;
> -
> if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
> exit_code = sig->group_exit_code;
> }
> @@ -562,6 +540,34 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
> if (permitted && (!whole || num_threads < 2))
> wchan = !task_is_running(task);
>
> + do {
> + seq++; /* 2 on the 1st/lockless path, otherwise odd */
> + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
> +
> + cmin_flt = sig->cmin_flt;
> + cmaj_flt = sig->cmaj_flt;
> + cutime = sig->cutime;
> + cstime = sig->cstime;
> + cgtime = sig->cgtime;
> +
> + if (whole) {
> + struct task_struct *t;
> +
> + min_flt = sig->min_flt;
> + maj_flt = sig->maj_flt;
> + gtime = sig->gtime;
> +
> + rcu_read_lock();
> + __for_each_thread(sig, t) {
> + min_flt += t->min_flt;
> + maj_flt += t->maj_flt;
> + gtime += task_gtime(t);
> + }
> + rcu_read_unlock();
> + }
> + } while (need_seqretry(&sig->stats_lock, seq));
> + done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
> +
> if (whole) {
> thread_group_cputime_adjusted(task, &utime, &stime);
> } else {
> --
> 2.25.1.362.g51ebf55
>

Signed-off-by: Dylan Hatch <[email protected]>

2024-01-23 23:52:47

by Dylan Hatch

Subject: Re: [PATCH 3/3] exit: wait_task_zombie: kill the no longer necessary spin_lock_irq(siglock)

On Tue, Jan 23, 2024 at 7:35 AM Oleg Nesterov <[email protected]> wrote:
>
> After the recent changes nobody uses siglock to read the values
> protected by stats_lock, so we can kill
> spin_lock_irq(&current->sighand->siglock) and update the comment.
>
> With this patch, only __exit_signal() and thread_group_start_cputime()
> take stats_lock under siglock.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> kernel/exit.c | 10 +++-------
> 1 file changed, 3 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 3988a02efaef..dfb963d2f862 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1127,17 +1127,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
> * and nobody can change them.
> *
> * psig->stats_lock also protects us from our sub-threads
> - * which can reap other children at the same time. Until
> - * we change k_getrusage()-like users to rely on this lock
> - * we have to take ->siglock as well.
> + * which can reap other children at the same time.
> *
> * We use thread_group_cputime_adjusted() to get times for
> * the thread group, which consolidates times for all threads
> * in the group including the group leader.
> */
> thread_group_cputime_adjusted(p, &tgutime, &tgstime);
> - spin_lock_irq(&current->sighand->siglock);
> - write_seqlock(&psig->stats_lock);
> + write_seqlock_irq(&psig->stats_lock);
> psig->cutime += tgutime + sig->cutime;
> psig->cstime += tgstime + sig->cstime;
> psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
> @@ -1160,8 +1157,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
> psig->cmaxrss = maxrss;
> task_io_accounting_add(&psig->ioac, &p->ioac);
> task_io_accounting_add(&psig->ioac, &sig->ioac);
> - write_sequnlock(&psig->stats_lock);
> - spin_unlock_irq(&current->sighand->siglock);
> + write_sequnlock_irq(&psig->stats_lock);
> }
>
> if (wo->wo_rusage)
> --
> 2.25.1.362.g51ebf55
>

Signed-off-by: Dylan Hatch <[email protected]>