2020-03-05 08:21:04

by Zhaolong Zhang

[permalink] [raw]
Subject: [PATCH] rcu: Fix the (t=0 jiffies) false positive

Calculate 't' with the previously recorded 'gps' instead of 'gp_start'.

Signed-off-by: Zhaolong Zhang <[email protected]>
---
kernel/rcu/tree_stall.h | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 55f9b84..4223b8b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -371,7 +371,7 @@ static void rcu_check_gp_kthread_starvation(void)
}
}

-static void print_other_cpu_stall(unsigned long gp_seq)
+static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
{
int cpu;
unsigned long flags;
@@ -408,7 +408,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
- smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
+ smp_processor_id(), (long)(jiffies - gps),
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
if (ndetected) {
rcu_dump_cpu_stacks();
@@ -442,7 +442,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
rcu_force_quiescent_state(); /* Kick them all. */
}

-static void print_cpu_stall(void)
+static void print_cpu_stall(unsigned long gps)
{
int cpu;
unsigned long flags;
@@ -467,7 +467,7 @@ static void print_cpu_stall(void)
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
- jiffies - rcu_state.gp_start,
+ jiffies - gps,
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);

rcu_check_gp_kthread_starvation();
@@ -546,7 +546,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {

/* We haven't checked in, so go dump stack. */
- print_cpu_stall();
+ print_cpu_stall(gps);
if (rcu_cpu_stall_ftrace_dump)
rcu_ftrace_dump(DUMP_ALL);

@@ -555,7 +555,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {

/* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(gs2);
+ print_other_cpu_stall(gs2, gps);
if (rcu_cpu_stall_ftrace_dump)
rcu_ftrace_dump(DUMP_ALL);
}
--
1.8.3.1


2020-03-05 23:05:47

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH] rcu: Fix the (t=0 jiffies) false positive

On Thu, Mar 05, 2020 at 03:45:57PM +0800, Zhaolong Zhang wrote:
> Calculate 't' with the previously recorded 'gps' instead of 'gp_start'.
>
> Signed-off-by: Zhaolong Zhang <[email protected]>

Good catch, thank you!

I had to apply this by hand. My guess is that you developed against
mainline rather than the "dev" branch of the -rcu tree:

git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git

Or perhaps your email client corrupted the patch?

Regardless, please check the version below and let me know if I messed
anything up.

Thanx, Paul

------------------------------------------------------------------------

commit 7dd581bbbcc00246fec35eda60add4b32200b0a0
Author: Zhaolong Zhang <[email protected]>
Date: Thu Mar 5 14:56:11 2020 -0800

rcu: Fix the (t=0 jiffies) false positive

It is possible that an over-long grace period will end while the RCU
CPU stall warning message is printing. In this case, the estimate of
the offending grace period's duration can be erroneous due to refetching
of rcu_state.gp_start, which will now be the time of the newly started
grace period. Computation of this duration clearly needs to use the
start time for the old over-long grace period, not the fresh new one.
This commit avoids such errors by causing both print_other_cpu_stall() and
print_cpu_stall() to reuse the value previously fetched by their caller.

Signed-off-by: Zhaolong Zhang <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index b17cd9b..502b4dd 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -371,7 +371,7 @@ static void rcu_check_gp_kthread_starvation(void)
}
}

-static void print_other_cpu_stall(unsigned long gp_seq)
+static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
{
int cpu;
unsigned long flags;
@@ -408,7 +408,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
- smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
+ smp_processor_id(), (long)(jiffies - gps),
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
if (ndetected) {
rcu_dump_cpu_stacks();
@@ -442,7 +442,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
rcu_force_quiescent_state(); /* Kick them all. */
}

-static void print_cpu_stall(void)
+static void print_cpu_stall(unsigned long gps)
{
int cpu;
unsigned long flags;
@@ -467,7 +467,7 @@ static void print_cpu_stall(void)
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
- jiffies - rcu_state.gp_start,
+ jiffies - gps,
(long)rcu_seq_current(&rcu_state.gp_seq), totqlen);

rcu_check_gp_kthread_starvation();
@@ -546,7 +546,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {

/* We haven't checked in, so go dump stack. */
- print_cpu_stall();
+ print_cpu_stall(gps);
if (rcu_cpu_stall_ftrace_dump)
rcu_ftrace_dump(DUMP_ALL);

@@ -555,7 +555,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {

/* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(gs2);
+ print_other_cpu_stall(gs2, gps);
if (rcu_cpu_stall_ftrace_dump)
rcu_ftrace_dump(DUMP_ALL);
}

2020-03-06 05:08:24

by Zhaolong Zhang

[permalink] [raw]
Subject: Re: [PATCH] rcu: Fix the (t=0 jiffies) false positive

Hi Paul,

Sorry for the inconvenience.
I created my patch based on mainline' commit 776e49e8ddb5169e6477fd33a396e9c7b2eb7400

Thank you for the professional description and hand applying.
I will send future rcu related patches based on your "dev" branch of linux-rcu tree.

Regards,
Zhaolong


At 2020-03-06 07:05:15, "Paul E. McKenney" <[email protected]> wrote:
>On Thu, Mar 05, 2020 at 03:45:57PM +0800, Zhaolong Zhang wrote:
>> Calculate 't' with the previously recorded 'gps' instead of 'gp_start'.
>>
>> Signed-off-by: Zhaolong Zhang <[email protected]>
>
>Good catch, thank you!
>
>I had to apply this by hand. My guess is that you developed against
>mainline rather than the "dev" branch of the -rcu tree:
>
>git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
>
>Or perhaps your email client corrupted the patch?
>
>Regardless, please check the version below and let me know if I messed
>anything up.
>
> Thanx, Paul
>
>------------------------------------------------------------------------
>
>commit 7dd581bbbcc00246fec35eda60add4b32200b0a0
>Author: Zhaolong Zhang <[email protected]>
>Date: Thu Mar 5 14:56:11 2020 -0800
>
> rcu: Fix the (t=0 jiffies) false positive
>
> It is possible that an over-long grace period will end while the RCU
> CPU stall warning message is printing. In this case, the estimate of
> the offending grace period's duration can be erroneous due to refetching
> of rcu_state.gp_start, which will now be the time of the newly started
> grace period. Computation of this duration clearly needs to use the
> start time for the old over-long grace period, not the fresh new one.
> This commit avoids such errors by causing both print_other_cpu_stall() and
> print_cpu_stall() to reuse the value previously fetched by their caller.
>
> Signed-off-by: Zhaolong Zhang <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
>
>diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
>index b17cd9b..502b4dd 100644
>--- a/kernel/rcu/tree_stall.h
>+++ b/kernel/rcu/tree_stall.h
>@@ -371,7 +371,7 @@ static void rcu_check_gp_kthread_starvation(void)
> }
> }
>
>-static void print_other_cpu_stall(unsigned long gp_seq)
>+static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
> {
> int cpu;
> unsigned long flags;
>@@ -408,7 +408,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
> for_each_possible_cpu(cpu)
> totqlen += rcu_get_n_cbs_cpu(cpu);
> pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
>- smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
>+ smp_processor_id(), (long)(jiffies - gps),
> (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
> if (ndetected) {
> rcu_dump_cpu_stacks();
>@@ -442,7 +442,7 @@ static void print_other_cpu_stall(unsigned long gp_seq)
> rcu_force_quiescent_state(); /* Kick them all. */
> }
>
>-static void print_cpu_stall(void)
>+static void print_cpu_stall(unsigned long gps)
> {
> int cpu;
> unsigned long flags;
>@@ -467,7 +467,7 @@ static void print_cpu_stall(void)
> for_each_possible_cpu(cpu)
> totqlen += rcu_get_n_cbs_cpu(cpu);
> pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
>- jiffies - rcu_state.gp_start,
>+ jiffies - gps,
> (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
>
> rcu_check_gp_kthread_starvation();
>@@ -546,7 +546,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
> cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
>
> /* We haven't checked in, so go dump stack. */
>- print_cpu_stall();
>+ print_cpu_stall(gps);
> if (rcu_cpu_stall_ftrace_dump)
> rcu_ftrace_dump(DUMP_ALL);
>
>@@ -555,7 +555,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
> cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
>
> /* They had a few time units to dump stack, so complain. */
>- print_other_cpu_stall(gs2);
>+ print_other_cpu_stall(gs2, gps);
> if (rcu_cpu_stall_ftrace_dump)
> rcu_ftrace_dump(DUMP_ALL);
> }