This way thread_group_cputime() doesn't exclude other readers on the
2nd pass.
thread_group_cputime() still needs to disable irqs because stats_lock
nests inside siglock. But once we change the getrusage()-like users to
rely on stats_lock we can remove this dependency, and after that there
will be no need for _irqsave.
And IIUC, this is a bugfix for CONFIG_PREEMPT_RT? Before this patch,
read_seqbegin_or_lock() could spin in __read_seqcount_begin() while the
write_seqlock(stats_lock) section was preempted.
While at it, change the main loop to use __for_each_thread(sig, t).
Signed-off-by: Oleg Nesterov <[email protected]>
---
include/linux/sched/signal.h | 4 +++-
kernel/exit.c | 12 ++++++++----
kernel/fork.c | 3 ++-
kernel/sched/cputime.c | 10 ++++++----
4 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index d7fa3ca2fa53..c7c0928b877d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -182,7 +182,9 @@ struct signal_struct {
* Live threads maintain their own counters and add to these
* in __exit_signal, except for the group leader.
*/
- seqlock_t stats_lock;
+ rwlock_t stats_lock;
+ seqcount_rwlock_t stats_seqc;
+
u64 utime, stime, cutime, cstime;
u64 gtime;
u64 cgtime;
diff --git a/kernel/exit.c b/kernel/exit.c
index f3ba4b97a7d9..8dedb7138f9c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -182,7 +182,8 @@ static void __exit_signal(struct task_struct *tsk)
* see the empty ->thread_head list.
*/
task_cputime(tsk, &utime, &stime);
- write_seqlock(&sig->stats_lock);
+ write_lock(&sig->stats_lock);
+ write_seqcount_begin(&sig->stats_seqc);
sig->utime += utime;
sig->stime += stime;
sig->gtime += task_gtime(tsk);
@@ -196,7 +197,8 @@ static void __exit_signal(struct task_struct *tsk)
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
- write_sequnlock(&sig->stats_lock);
+ write_seqcount_end(&sig->stats_seqc);
+ write_unlock(&sig->stats_lock);
/*
* Do this under ->siglock, we can race with another thread
@@ -1160,7 +1162,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&current->sighand->siglock);
- write_seqlock(&psig->stats_lock);
+ write_lock(&psig->stats_lock);
+ write_seqcount_begin(&psig->stats_seqc);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1183,7 +1186,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
- write_sequnlock(&psig->stats_lock);
+ write_seqcount_end(&psig->stats_seqc);
+ write_unlock(&psig->stats_lock);
spin_unlock_irq(&current->sighand->siglock);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index b9d3aa493bbd..bbd5604053f8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1870,7 +1870,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_HLIST_HEAD(&sig->multiprocess);
- seqlock_init(&sig->stats_lock);
+ rwlock_init(&sig->stats_lock);
+ seqcount_rwlock_init(&sig->stats_seqc, &sig->stats_lock);
prev_cputime_init(&sig->prev_cputime);
#ifdef CONFIG_POSIX_TIMERS
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e6c..bd6a85bd2a49 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -333,12 +333,13 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
nextseq = 0;
do {
seq = nextseq;
- flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ flags = read_seqcount_begin_or_lock_irqsave(&sig->stats_seqc,
+ &sig->stats_lock, &seq);
times->utime = sig->utime;
times->stime = sig->stime;
times->sum_exec_runtime = sig->sum_sched_runtime;
- for_each_thread(tsk, t) {
+ __for_each_thread(sig, t) {
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
@@ -346,8 +347,9 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
}
/* If lockless access failed, take the lock. */
nextseq = 1;
- } while (need_seqretry(&sig->stats_lock, seq));
- done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ } while (need_seqcount_retry(&sig->stats_seqc, seq));
+ done_seqcount_retry_irqrestore(&sig->stats_seqc, &sig->stats_lock,
+ seq, flags);
rcu_read_unlock();
}
--
2.25.1.362.g51ebf55
On Wed, Sep 13, 2023 at 05:50:09PM +0200, Oleg Nesterov wrote:
> This way thread_group_cputime() doesn't exclude other readers on the
> 2nd pass.
>
> thread_group_cputime() still needs to disable irqs because stats_lock
> nests inside siglock. But once we change the getrusage()-like users to
> rely on stats_lock we can remove this dependency, and after that there
> will be no need for _irqsave.
>
> And IIUC, this is a bugfix for CONFIG_PREEMPT_RT? Before this patch,
> read_seqbegin_or_lock() could spin in __read_seqcount_begin() while the
> write_seqlock(stats_lock) section was preempted.
>
> While at it, change the main loop to use __for_each_thread(sig, t).
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
> include/linux/sched/signal.h | 4 +++-
> kernel/exit.c | 12 ++++++++----
> kernel/fork.c | 3 ++-
> kernel/sched/cputime.c | 10 ++++++----
> 4 files changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index d7fa3ca2fa53..c7c0928b877d 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -182,7 +182,9 @@ struct signal_struct {
> * Live threads maintain their own counters and add to these
> * in __exit_signal, except for the group leader.
> */
> - seqlock_t stats_lock;
> + rwlock_t stats_lock;
> + seqcount_rwlock_t stats_seqc;
> +
> u64 utime, stime, cutime, cstime;
> u64 gtime;
> u64 cgtime;
> diff --git a/kernel/exit.c b/kernel/exit.c
> index f3ba4b97a7d9..8dedb7138f9c 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -182,7 +182,8 @@ static void __exit_signal(struct task_struct *tsk)
> * see the empty ->thread_head list.
> */
> task_cputime(tsk, &utime, &stime);
> - write_seqlock(&sig->stats_lock);
> + write_lock(&sig->stats_lock);
> + write_seqcount_begin(&sig->stats_seqc);
> sig->utime += utime;
> sig->stime += stime;
> sig->gtime += task_gtime(tsk);
> @@ -196,7 +197,8 @@ static void __exit_signal(struct task_struct *tsk)
> sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
> sig->nr_threads--;
> __unhash_process(tsk, group_dead);
> - write_sequnlock(&sig->stats_lock);
> + write_seqcount_end(&sig->stats_seqc);
> + write_unlock(&sig->stats_lock);
>
> /*
> * Do this under ->siglock, we can race with another thread
> @@ -1160,7 +1162,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
> */
> thread_group_cputime_adjusted(p, &tgutime, &tgstime);
> spin_lock_irq(&current->sighand->siglock);
> - write_seqlock(&psig->stats_lock);
> + write_lock(&psig->stats_lock);
> + write_seqcount_begin(&psig->stats_seqc);
> psig->cutime += tgutime + sig->cutime;
> psig->cstime += tgstime + sig->cstime;
> psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
> @@ -1183,7 +1186,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
> psig->cmaxrss = maxrss;
> task_io_accounting_add(&psig->ioac, &p->ioac);
> task_io_accounting_add(&psig->ioac, &sig->ioac);
> - write_sequnlock(&psig->stats_lock);
> + write_seqcount_end(&psig->stats_seqc);
> + write_unlock(&psig->stats_lock);
> spin_unlock_irq(&current->sighand->siglock);
> }
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b9d3aa493bbd..bbd5604053f8 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1870,7 +1870,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
> sig->curr_target = tsk;
> init_sigpending(&sig->shared_pending);
> INIT_HLIST_HEAD(&sig->multiprocess);
> - seqlock_init(&sig->stats_lock);
> + rwlock_init(&sig->stats_lock);
> + seqcount_rwlock_init(&sig->stats_seqc, &sig->stats_lock);
> prev_cputime_init(&sig->prev_cputime);
>
> #ifdef CONFIG_POSIX_TIMERS
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index af7952f12e6c..bd6a85bd2a49 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -333,12 +333,13 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
> nextseq = 0;
> do {
> seq = nextseq;
> - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
> + flags = read_seqcount_begin_or_lock_irqsave(&sig->stats_seqc,
> + &sig->stats_lock, &seq);
> times->utime = sig->utime;
> times->stime = sig->stime;
> times->sum_exec_runtime = sig->sum_sched_runtime;
>
> - for_each_thread(tsk, t) {
> + __for_each_thread(sig, t) {
> task_cputime(t, &utime, &stime);
> times->utime += utime;
> times->stime += stime;
> @@ -346,8 +347,9 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
> }
> /* If lockless access failed, take the lock. */
> nextseq = 1;
I think you're right: there is indeed a possible situation here
where write_seqlock() forces all readers to take the lock one after
another.
I really don't know how critical that is in this place.
> - } while (need_seqretry(&sig->stats_lock, seq));
> - done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
> + } while (need_seqcount_retry(&sig->stats_seqc, seq));
> + done_seqcount_retry_irqrestore(&sig->stats_seqc, &sig->stats_lock,
> + seq, flags);
> rcu_read_unlock();
> }
>
> --
> 2.25.1.362.g51ebf55
>
--
Rgrds, legion