If the rcutree.use_softirq is configured, when RCU Stall event
happened, dump status of all rcuc kthreads who due to starvation
prevented grace period ends on CPUs that not report quiescent
state.
Signed-off-by: Zqiang <[email protected]>
---
kernel/rcu/tree_stall.h | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index d7956c03edbd..e6ecc32cfe23 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -348,6 +348,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
}
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+static void rcuc_kthread_dump(struct rcu_data *rdp);
/*
* Dump stacks of all tasks running on stalled CPUs. First try using
* NMIs, but fall back to manual remote stack tracing on architectures
@@ -368,6 +369,7 @@ static void rcu_dump_cpu_stacks(void)
pr_err("Offline CPU %d blocking current GP.\n", cpu);
else if (!trigger_single_cpu_backtrace(cpu))
dump_cpu_task(cpu);
+ rcuc_kthread_dump(per_cpu_ptr(&rcu_data, cpu));
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -471,6 +473,9 @@ static void rcuc_kthread_dump(struct rcu_data *rdp)
unsigned long j;
struct task_struct *rcuc;
+ if (use_softirq)
+ return;
+
rcuc = rdp->rcu_cpu_kthread_task;
if (!rcuc)
return;
@@ -659,9 +664,6 @@ static void print_cpu_stall(unsigned long gps)
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
- if (!use_softirq)
- rcuc_kthread_dump(rdp);
-
rcu_dump_cpu_stacks();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
--
2.25.1
On Wed, Apr 13, 2022 at 03:44:11PM +0800, Zqiang wrote:
> If the rcutree.use_softirq is configured, when RCU Stall event
> happened, dump status of all rcuc kthreads who due to starvation
> prevented grace period ends on CPUs that not report quiescent
> state.
>
> Signed-off-by: Zqiang <[email protected]>
This could be a good improvement! But a few questions and comments below.
Thanx, Paul
> ---
> kernel/rcu/tree_stall.h | 8 +++++---
> 1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
> index d7956c03edbd..e6ecc32cfe23 100644
> --- a/kernel/rcu/tree_stall.h
> +++ b/kernel/rcu/tree_stall.h
> @@ -348,6 +348,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
> }
> #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
>
> +static void rcuc_kthread_dump(struct rcu_data *rdp);
> /*
> * Dump stacks of all tasks running on stalled CPUs. First try using
> * NMIs, but fall back to manual remote stack tracing on architectures
> @@ -368,6 +369,7 @@ static void rcu_dump_cpu_stacks(void)
> pr_err("Offline CPU %d blocking current GP.\n", cpu);
> else if (!trigger_single_cpu_backtrace(cpu))
> dump_cpu_task(cpu);
> + rcuc_kthread_dump(per_cpu_ptr(&rcu_data, cpu));
Was this addition inspired by the chasing of a bug? If so, please let me
know exactly what information was (or would have been) the most helpful.
For example, the starvation information might be more compactly
represented in the information printed by print_cpu_stall_info().
Unless the stack trace was quite valuable, it might be best to omit it.
After all, RCU CPU stall warnings are already infamously heavy on the
text output.
> }
> raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
> }
> @@ -471,6 +473,9 @@ static void rcuc_kthread_dump(struct rcu_data *rdp)
> unsigned long j;
> struct task_struct *rcuc;
>
> + if (use_softirq)
> + return;
> +
> rcuc = rdp->rcu_cpu_kthread_task;
> if (!rcuc)
The checks for use_softirq on the one hand and the checks for non-NULL
rdp->rcu_cpu_kthread_task are a bit "interesting". Not your fault or
problem, of course!
> return;
> @@ -659,9 +664,6 @@ static void print_cpu_stall(unsigned long gps)
> rcu_check_gp_kthread_expired_fqs_timer();
> rcu_check_gp_kthread_starvation();
>
> - if (!use_softirq)
In particular, I am wondering if this "if" is redundant.
> - rcuc_kthread_dump(rdp);
> -
> rcu_dump_cpu_stacks();
>
> raw_spin_lock_irqsave_rcu_node(rnp, flags);
> --
> 2.25.1
>
On Wed, Apr 13, 2022 at 03:44:11PM +0800, Zqiang wrote:
> If the rcutree.use_softirq is configured, when RCU Stall event
> happened, dump status of all rcuc kthreads who due to starvation
> prevented grace period ends on CPUs that not report quiescent state.
>
> Signed-off-by: Zqiang <[email protected]>
>This could be a good improvement! But a few questions and comments below.
>
> Thanx, Paul
> ---
> kernel/rcu/tree_stall.h | 8 +++++---
> 1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index
> d7956c03edbd..e6ecc32cfe23 100644
> --- a/kernel/rcu/tree_stall.h
> +++ b/kernel/rcu/tree_stall.h
> @@ -348,6 +348,7 @@ static int rcu_print_task_stall(struct rcu_node
> *rnp, unsigned long flags) } #endif /* #else #ifdef
> CONFIG_PREEMPT_RCU */
>
> +static void rcuc_kthread_dump(struct rcu_data *rdp);
> /*
> * Dump stacks of all tasks running on stalled CPUs. First try using
> * NMIs, but fall back to manual remote stack tracing on
> architectures @@ -368,6 +369,7 @@ static void rcu_dump_cpu_stacks(void)
> pr_err("Offline CPU %d blocking current GP.\n", cpu);
> else if (!trigger_single_cpu_backtrace(cpu))
> dump_cpu_task(cpu);
> + rcuc_kthread_dump(per_cpu_ptr(&rcu_data, cpu));
>Was this addition inspired by the chasing of a bug? If so, please let me know exactly what information was (or would have been) the most helpful.
>
>For example, the starvation information might be more compactly represented in the information printed by print_cpu_stall_info().
>Unless the stack trace was quite valuable, it might be best to omit it.
>After all, RCU CPU stall warnings are already infamously heavy on the text output.
Is it better to modify it like this?
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index d7956c03edbd..37444ff00787 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -465,27 +465,38 @@ static void print_cpu_stall_info(int cpu)
falsepositive ? " (false positive?)" : "");
}
-static void rcuc_kthread_dump(struct rcu_data *rdp)
+static void rcuc_kthread_dump(void)
{
int cpu;
unsigned long j;
+ unsigned long flags
+ struct rcu_node *rnp;
struct task_struct *rcuc;
+ struct rcu_data *rdp;
- rcuc = rdp->rcu_cpu_kthread_task;
- if (!rcuc)
- return;
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rcuc = rdp->rcu_cpu_kthread_task;
+ if (!rcuc)
+ return;
- cpu = task_cpu(rcuc);
- if (cpu_is_offline(cpu) || idle_cpu(cpu))
- return;
+ cpu = task_cpu(rcuc);
+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
+ return;
- if (!rcu_is_rcuc_kthread_starving(rdp, &j))
- return;
+ if (!rcu_is_rcuc_kthread_starving(rdp, &j))
+ return;
- pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
- sched_show_task(rcuc);
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
+ pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
+ sched_show_task(rcuc);
+ if (!trigger_single_cpu_backtrace(cpu))
+ dump_cpu_task(cpu);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
}
/* Complain about starvation of grace-period kthread. */
@@ -595,6 +606,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
smp_processor_id(), (long)(jiffies - gps),
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
if (ndetected) {
+ if (!use_softirq)
+ rcuc_kthread_dump();
+
rcu_dump_cpu_stacks();
/* Complain about tasks blocking the grace period. */
@@ -660,7 +674,7 @@ static void print_cpu_stall(unsigned long gps)
rcu_check_gp_kthread_starvation();
if (!use_softirq)
- rcuc_kthread_dump(rdp);
+ rcuc_kthread_dump();
rcu_dump_cpu_stacks();
Thanks
Zqiang
> }
> raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
> }
> @@ -471,6 +473,9 @@ static void rcuc_kthread_dump(struct rcu_data *rdp)
> unsigned long j;
> struct task_struct *rcuc;
>
> + if (use_softirq)
> + return;
> +
> rcuc = rdp->rcu_cpu_kthread_task;
> if (!rcuc)
>The checks for use_softirq on the one hand and the checks for non-NULL
>rdp->rcu_cpu_kthread_task are a bit "interesting". Not your fault or
>problem, of course!
Yes the ' if (!use_softirq)' is redundant, the if rcu_cpu_kthread_task Is completely enough
>
> return;
> @@ -659,9 +664,6 @@ static void print_cpu_stall(unsigned long gps)
> rcu_check_gp_kthread_expired_fqs_timer();
> rcu_check_gp_kthread_starvation();
>
> - if (!use_softirq)
>
>In particular, I am wondering if this "if" is redundant.
Yes the ' if (!use_softirq)' is also redundant
>
> - rcuc_kthread_dump(rdp);
> -
> rcu_dump_cpu_stacks();
>
> raw_spin_lock_irqsave_rcu_node(rnp, flags);
> --
> 2.25.1
>
On Wed, Apr 13, 2022 at 03:44:11PM +0800, Zqiang wrote:
> If the rcutree.use_softirq is configured, when RCU Stall event
> happened, dump status of all rcuc kthreads who due to starvation
> prevented grace period ends on CPUs that not report quiescent state.
>
> Signed-off-by: Zqiang <[email protected]>
>This could be a good improvement! But a few questions and comments below.
>
> Thanx, Paul
> ---
> kernel/rcu/tree_stall.h | 8 +++++---
> 1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index
> d7956c03edbd..e6ecc32cfe23 100644
> --- a/kernel/rcu/tree_stall.h
> +++ b/kernel/rcu/tree_stall.h
> @@ -348,6 +348,7 @@ static int rcu_print_task_stall(struct rcu_node
> *rnp, unsigned long flags) } #endif /* #else #ifdef
> CONFIG_PREEMPT_RCU */
>
> +static void rcuc_kthread_dump(struct rcu_data *rdp);
> /*
> * Dump stacks of all tasks running on stalled CPUs. First try using
> * NMIs, but fall back to manual remote stack tracing on
> architectures @@ -368,6 +369,7 @@ static void rcu_dump_cpu_stacks(void)
> pr_err("Offline CPU %d blocking current GP.\n", cpu);
> else if (!trigger_single_cpu_backtrace(cpu))
> dump_cpu_task(cpu);
> + rcuc_kthread_dump(per_cpu_ptr(&rcu_data, cpu));
>Was this addition inspired by the chasing of a bug? If so, please let me know exactly what information was (or would have been) the most helpful.
>
>For example, the starvation information might be more compactly represented in the information printed by print_cpu_stall_info().
>Unless the stack trace was quite valuable, it might be best to omit it.
>After all, RCU CPU stall warnings are already infamously heavy on the text output.
>Is it better to modify it like this?
I'm so sorry, the following modifications are problematic, I'll resend v2
Thanks
Zqiang
>diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index d7956c03edbd..37444ff00787 100644
>--- a/kernel/rcu/tree_stall.h
>+++ b/kernel/rcu/tree_stall.h
>@@ -465,27 +465,38 @@ static void print_cpu_stall_info(int cpu)
> falsepositive ? " (false positive?)" : ""); }
>
>-static void rcuc_kthread_dump(struct rcu_data *rdp)
>+static void rcuc_kthread_dump(void)
> {
> int cpu;
> unsigned long j;
>+ unsigned long flags
>+ struct rcu_node *rnp;
> struct task_struct *rcuc;
>+ struct rcu_data *rdp;
>
>- rcuc = rdp->rcu_cpu_kthread_task;
>- if (!rcuc)
>- return;
>+ rcu_for_each_leaf_node(rnp) {
>+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
>+ for_each_leaf_node_possible_cpu(rnp, cpu)
>+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
>+ rdp = per_cpu_ptr(&rcu_data, cpu);
>+ rcuc = rdp->rcu_cpu_kthread_task;
>+ if (!rcuc)
>+ return;
>
>- cpu = task_cpu(rcuc);
>- if (cpu_is_offline(cpu) || idle_cpu(cpu))
>- return;
>+ cpu = task_cpu(rcuc);
>+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
>+ return;
>
>- if (!rcu_is_rcuc_kthread_starving(rdp, &j))
>- return;
>+ if (!rcu_is_rcuc_kthread_starving(rdp, &j))
>+ return;
>
>- pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
>- sched_show_task(rcuc);
>- if (!trigger_single_cpu_backtrace(cpu))
>- dump_cpu_task(cpu);
>+ pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
>+ sched_show_task(rcuc);
>+ if (!trigger_single_cpu_backtrace(cpu))
>+ dump_cpu_task(cpu);
>+ }
>+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
>+ }
> }
>
>/* Complain about starvation of grace-period kthread. */ @@ -595,6 +606,9 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
> smp_processor_id(), (long)(jiffies - gps),
> (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
> if (ndetected) {
>+ if (!use_softirq)
>+ rcuc_kthread_dump();
>+
> rcu_dump_cpu_stacks();
>
> /* Complain about tasks blocking the grace period. */ @@ -660,7 +674,7 @@ static void print_cpu_stall(unsigned long gps)
> rcu_check_gp_kthread_starvation();
>
> if (!use_softirq)
>- rcuc_kthread_dump(rdp);
>+ rcuc_kthread_dump();
>
> rcu_dump_cpu_stacks();
>
>Thanks
>Zqiang
> }
> raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
> }
> @@ -471,6 +473,9 @@ static void rcuc_kthread_dump(struct rcu_data *rdp)
> unsigned long j;
> struct task_struct *rcuc;
>
> + if (use_softirq)
> + return;
> +
> rcuc = rdp->rcu_cpu_kthread_task;
> if (!rcuc)
>The checks for use_softirq on the one hand and the checks for non-NULL
>rdp->rcu_cpu_kthread_task are a bit "interesting". Not your fault or
>problem, of course!
>Yes the ' if (!use_softirq)' is redundant, the if rcu_cpu_kthread_task Is completely enough
>
> return;
> @@ -659,9 +664,6 @@ static void print_cpu_stall(unsigned long gps)
> rcu_check_gp_kthread_expired_fqs_timer();
> rcu_check_gp_kthread_starvation();
>
> - if (!use_softirq)
>
>In particular, I am wondering if this "if" is redundant.
>Yes the ' if (!use_softirq)' is also redundant
>
> - rcuc_kthread_dump(rdp);
> -
> rcu_dump_cpu_stacks();
>
> raw_spin_lock_irqsave_rcu_node(rnp, flags);
> --
> 2.25.1
>