Hello,
syzbot found the following issue on:
HEAD commit: e73f0f0e Linux 5.14-rc1
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=172196d2300000
kernel config: https://syzkaller.appspot.com/x/.config?x=f5e73542d774430b
dashboard link: https://syzkaller.appspot.com/bug?extid=e08a83a1940ec3846cd5
compiler: Debian clang version 11.0.1-2
Unfortunately, I don't have any reproducer for this issue yet.
IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: [email protected]
==================================================================
BUG: KCSAN: data-race in call_rcu / rcu_gp_kthread
write to 0xffffffff837328a0 of 8 bytes by task 11 on cpu 0:
rcu_gp_fqs kernel/rcu/tree.c:1949 [inline]
rcu_gp_fqs_loop kernel/rcu/tree.c:2010 [inline]
rcu_gp_kthread+0xd78/0xec0 kernel/rcu/tree.c:2169
kthread+0x262/0x280 kernel/kthread.c:319
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
read to 0xffffffff837328a0 of 8 bytes by task 30193 on cpu 1:
__call_rcu_core kernel/rcu/tree.c:2946 [inline]
__call_rcu kernel/rcu/tree.c:3062 [inline]
call_rcu+0x4b0/0x6c0 kernel/rcu/tree.c:3109
file_free fs/file_table.c:58 [inline]
__fput+0x43e/0x4e0 fs/file_table.c:298
____fput+0x11/0x20 fs/file_table.c:313
task_work_run+0xae/0x130 kernel/task_work.c:164
get_signal+0x156c/0x15e0 kernel/signal.c:2581
arch_do_signal_or_restart+0x2a/0x220 arch/x86/kernel/signal.c:865
handle_signal_work kernel/entry/common.c:148 [inline]
exit_to_user_mode_loop kernel/entry/common.c:172 [inline]
exit_to_user_mode_prepare+0x109/0x190 kernel/entry/common.c:209
__syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline]
syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:302
do_syscall_64+0x49/0x90 arch/x86/entry/common.c:86
entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0x0000000000000f57 -> 0x0000000000000f58
Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 30193 Comm: syz-executor.5 Tainted: G W 5.14.0-rc1-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
==================================================================
---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at [email protected].
syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
[-Bcc: fsdevel]
[+Cc: Paul, [email protected]]
On Mon, 12 Jul 2021 at 18:09, syzbot
<[email protected]> wrote:
>
> Hello,
>
> syzbot found the following issue on:
>
> HEAD commit: e73f0f0e Linux 5.14-rc1
> git tree: upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=172196d2300000
> kernel config: https://syzkaller.appspot.com/x/.config?x=f5e73542d774430b
> dashboard link: https://syzkaller.appspot.com/bug?extid=e08a83a1940ec3846cd5
> compiler: Debian clang version 11.0.1-2
>
> Unfortunately, I don't have any reproducer for this issue yet.
>
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: [email protected]
>
> ==================================================================
> BUG: KCSAN: data-race in call_rcu / rcu_gp_kthread
>
> write to 0xffffffff837328a0 of 8 bytes by task 11 on cpu 0:
> rcu_gp_fqs kernel/rcu/tree.c:1949 [inline]
> rcu_gp_fqs_loop kernel/rcu/tree.c:2010 [inline]
> rcu_gp_kthread+0xd78/0xec0 kernel/rcu/tree.c:2169
> kthread+0x262/0x280 kernel/kthread.c:319
> ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
>
> read to 0xffffffff837328a0 of 8 bytes by task 30193 on cpu 1:
> __call_rcu_core kernel/rcu/tree.c:2946 [inline]
> __call_rcu kernel/rcu/tree.c:3062 [inline]
> call_rcu+0x4b0/0x6c0 kernel/rcu/tree.c:3109
> file_free fs/file_table.c:58 [inline]
> __fput+0x43e/0x4e0 fs/file_table.c:298
> ____fput+0x11/0x20 fs/file_table.c:313
> task_work_run+0xae/0x130 kernel/task_work.c:164
> get_signal+0x156c/0x15e0 kernel/signal.c:2581
> arch_do_signal_or_restart+0x2a/0x220 arch/x86/kernel/signal.c:865
> handle_signal_work kernel/entry/common.c:148 [inline]
> exit_to_user_mode_loop kernel/entry/common.c:172 [inline]
> exit_to_user_mode_prepare+0x109/0x190 kernel/entry/common.c:209
> __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline]
> syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:302
> do_syscall_64+0x49/0x90 arch/x86/entry/common.c:86
> entry_SYSCALL_64_after_hwframe+0x44/0xae
>
> value changed: 0x0000000000000f57 -> 0x0000000000000f58
>
> Reported by Kernel Concurrency Sanitizer on:
> CPU: 1 PID: 30193 Comm: syz-executor.5 Tainted: G W 5.14.0-rc1-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> ==================================================================
>
>
> ---
> This report is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at [email protected].
>
> syzbot will keep track of this issue. See:
> https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller-bugs" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
> To view this discussion on the web visit https://groups.google.com/d/msgid/syzkaller-bugs/00000000000080403805c6ef586d%40google.com.
On Mon, Jul 12, 2021 at 06:14:37PM +0200, Marco Elver wrote:
> [-Bcc: fsdevel]
> [+Cc: Paul, [email protected]]
>
> On Mon, 12 Jul 2021 at 18:09, syzbot
> <[email protected]> wrote:
> >
> > Hello,
> >
> > syzbot found the following issue on:
> >
> > HEAD commit: e73f0f0e Linux 5.14-rc1
> > git tree: upstream
> > console output: https://syzkaller.appspot.com/x/log.txt?x=172196d2300000
> > kernel config: https://syzkaller.appspot.com/x/.config?x=f5e73542d774430b
> > dashboard link: https://syzkaller.appspot.com/bug?extid=e08a83a1940ec3846cd5
> > compiler: Debian clang version 11.0.1-2
> >
> > Unfortunately, I don't have any reproducer for this issue yet.
> >
> > IMPORTANT: if you fix the issue, please add the following tag to the commit:
> > Reported-by: [email protected]
> >
> > ==================================================================
> > BUG: KCSAN: data-race in call_rcu / rcu_gp_kthread
> >
> > write to 0xffffffff837328a0 of 8 bytes by task 11 on cpu 0:
> > rcu_gp_fqs kernel/rcu/tree.c:1949 [inline]
> > rcu_gp_fqs_loop kernel/rcu/tree.c:2010 [inline]
> > rcu_gp_kthread+0xd78/0xec0 kernel/rcu/tree.c:2169
> > kthread+0x262/0x280 kernel/kthread.c:319
> > ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
> >
> > read to 0xffffffff837328a0 of 8 bytes by task 30193 on cpu 1:
> > __call_rcu_core kernel/rcu/tree.c:2946 [inline]
> > __call_rcu kernel/rcu/tree.c:3062 [inline]
> > call_rcu+0x4b0/0x6c0 kernel/rcu/tree.c:3109
> > file_free fs/file_table.c:58 [inline]
> > __fput+0x43e/0x4e0 fs/file_table.c:298
> > ____fput+0x11/0x20 fs/file_table.c:313
> > task_work_run+0xae/0x130 kernel/task_work.c:164
> > get_signal+0x156c/0x15e0 kernel/signal.c:2581
> > arch_do_signal_or_restart+0x2a/0x220 arch/x86/kernel/signal.c:865
> > handle_signal_work kernel/entry/common.c:148 [inline]
> > exit_to_user_mode_loop kernel/entry/common.c:172 [inline]
> > exit_to_user_mode_prepare+0x109/0x190 kernel/entry/common.c:209
> > __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline]
> > syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:302
> > do_syscall_64+0x49/0x90 arch/x86/entry/common.c:86
> > entry_SYSCALL_64_after_hwframe+0x44/0xae
> >
> > value changed: 0x0000000000000f57 -> 0x0000000000000f58
> >
> > Reported by Kernel Concurrency Sanitizer on:
> > CPU: 1 PID: 30193 Comm: syz-executor.5 Tainted: G W 5.14.0-rc1-syzkaller #0
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> > ==================================================================
> >
> >
> > ---
> > This report is generated by a bot. It may contain errors.
> > See https://goo.gl/tpsmEJ for more information about syzbot.
> > syzbot engineers can be reached at [email protected].
> >
> > syzbot will keep track of this issue. See:
> > https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
Good catch! And yes, this would be hard to reproduce.
How about as shown below?
Thanx, Paul
------------------------------------------------------------------------
commit 43e0f01f3b6f510dbe31d02a8f4c909c45deff04
Author: Paul E. McKenney <[email protected]>
Date: Tue Jul 20 06:16:27 2021 -0700
rcu: Mark accesses to rcu_state.n_force_qs
This commit marks accesses to the rcu_state.n_force_qs. These data
races are hard to make happen, but syzkaller was equal to the task.
Reported-by: [email protected]
Signed-off-by: Paul E. McKenney <[email protected]>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a7379c44a2366..245bca7cdf6ee 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1913,7 +1913,7 @@ static void rcu_gp_fqs(bool first_time)
struct rcu_node *rnp = rcu_get_root();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
- rcu_state.n_force_qs++;
+ WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
if (first_time) {
/* Collect dyntick-idle snapshots. */
force_qs_rnp(dyntick_save_progress_counter);
@@ -2556,7 +2556,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
} else if (count < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = count;
@@ -2904,10 +2904,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
} else {
/* Give the grace period a kick. */
rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
- if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
+ if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
rcu_force_quiescent_state();
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
@@ -4134,7 +4134,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
rcu_dynticks_eqs_online();
On Tue, 20 Jul 2021 at 15:18, Paul E. McKenney <[email protected]> wrote:
[...]
> Good catch! And yes, this would be hard to reproduce.
>
> How about as shown below?
Acked-by: Marco Elver <[email protected]>
I was merely a little surprised syzbot was able to exercise RCU in a
way that resulted in a data race your torture runs hadn't found yet
(or perhaps it did and missed?).
Thanks,
-- Marco
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit 43e0f01f3b6f510dbe31d02a8f4c909c45deff04
> Author: Paul E. McKenney <[email protected]>
> Date: Tue Jul 20 06:16:27 2021 -0700
>
> rcu: Mark accesses to rcu_state.n_force_qs
>
> This commit marks accesses to the rcu_state.n_force_qs. These data
> races are hard to make happen, but syzkaller was equal to the task.
>
> Reported-by: [email protected]
> Signed-off-by: Paul E. McKenney <[email protected]>
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index a7379c44a2366..245bca7cdf6ee 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -1913,7 +1913,7 @@ static void rcu_gp_fqs(bool first_time)
> struct rcu_node *rnp = rcu_get_root();
>
> WRITE_ONCE(rcu_state.gp_activity, jiffies);
> - rcu_state.n_force_qs++;
> + WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
> if (first_time) {
> /* Collect dyntick-idle snapshots. */
> force_qs_rnp(dyntick_save_progress_counter);
> @@ -2556,7 +2556,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
> if (count == 0 && rdp->qlen_last_fqs_check != 0) {
> rdp->qlen_last_fqs_check = 0;
> - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> } else if (count < rdp->qlen_last_fqs_check - qhimark)
> rdp->qlen_last_fqs_check = count;
>
> @@ -2904,10 +2904,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
> } else {
> /* Give the grace period a kick. */
> rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
> - if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
> + if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
> rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
> rcu_force_quiescent_state();
> - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
> }
> }
> @@ -4134,7 +4134,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
> /* Set up local state, ensuring consistent view of global state. */
> raw_spin_lock_irqsave_rcu_node(rnp, flags);
> rdp->qlen_last_fqs_check = 0;
> - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> rdp->blimit = blimit;
> rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
> rcu_dynticks_eqs_online();
On Tue, Jul 20, 2021 at 10:16 PM Marco Elver <[email protected]> wrote:
>
> On Tue, 20 Jul 2021 at 15:18, Paul E. McKenney <[email protected]> wrote:
> [...]
> > Good catch! And yes, this would be hard to reproduce.
> >
> > How about as shown below?
>
> Acked-by: Marco Elver <[email protected]>
>
> I was merely a little surprised syzbot was able to exercise RCU in a
> way that resulted in a data race your torture runs hadn't found yet
> (or perhaps it did and missed?).
I think rcu_state.n_force_qs is used to give grace period a kick. In worst
case, the data race will cause the grace period to miss at most one
kick, but the grace
period will get kicked soon because of too many callbacks, I guess that's
why rcu torture will not find this data race.
Many thanks
Zhouyi
>
> Thanks,
> -- Marco
>
> > Thanx, Paul
> >
> > ------------------------------------------------------------------------
> >
> > commit 43e0f01f3b6f510dbe31d02a8f4c909c45deff04
> > Author: Paul E. McKenney <[email protected]>
> > Date: Tue Jul 20 06:16:27 2021 -0700
> >
> > rcu: Mark accesses to rcu_state.n_force_qs
> >
> > This commit marks accesses to the rcu_state.n_force_qs. These data
> > races are hard to make happen, but syzkaller was equal to the task.
> >
> > Reported-by: [email protected]
> > Signed-off-by: Paul E. McKenney <[email protected]>
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index a7379c44a2366..245bca7cdf6ee 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -1913,7 +1913,7 @@ static void rcu_gp_fqs(bool first_time)
> > struct rcu_node *rnp = rcu_get_root();
> >
> > WRITE_ONCE(rcu_state.gp_activity, jiffies);
> > - rcu_state.n_force_qs++;
> > + WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
> > if (first_time) {
> > /* Collect dyntick-idle snapshots. */
> > force_qs_rnp(dyntick_save_progress_counter);
> > @@ -2556,7 +2556,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> > /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
> > if (count == 0 && rdp->qlen_last_fqs_check != 0) {
> > rdp->qlen_last_fqs_check = 0;
> > - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> > + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> > } else if (count < rdp->qlen_last_fqs_check - qhimark)
> > rdp->qlen_last_fqs_check = count;
> >
> > @@ -2904,10 +2904,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
> > } else {
> > /* Give the grace period a kick. */
> > rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
> > - if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
> > + if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
> > rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
> > rcu_force_quiescent_state();
> > - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> > + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> > rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
> > }
> > }
> > @@ -4134,7 +4134,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
> > /* Set up local state, ensuring consistent view of global state. */
> > raw_spin_lock_irqsave_rcu_node(rnp, flags);
> > rdp->qlen_last_fqs_check = 0;
> > - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> > + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> > rdp->blimit = blimit;
> > rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
> > rcu_dynticks_eqs_online();
On Tue, Jul 20, 2021 at 04:10:10PM +0200, Marco Elver wrote:
> On Tue, 20 Jul 2021 at 15:18, Paul E. McKenney <[email protected]> wrote:
> [...]
> > Good catch! And yes, this would be hard to reproduce.
> >
> > How about as shown below?
>
> Acked-by: Marco Elver <[email protected]>
I will apply on the next rebase, thank you!
> I was merely a little surprised syzbot was able to exercise RCU in a
> way that resulted in a data race your torture runs hadn't found yet
> (or perhaps it did and missed?).
My KCSAN runs are necessarily quite short because I do a normal, KASAN,
and KCSAN variant of each scenario of each torture test, with the
constraint that it all run overnight.
So there are probably more to find. ;-)
Thanx, Paul
> Thanks,
> -- Marco
>
> > Thanx, Paul
> >
> > ------------------------------------------------------------------------
> >
> > commit 43e0f01f3b6f510dbe31d02a8f4c909c45deff04
> > Author: Paul E. McKenney <[email protected]>
> > Date: Tue Jul 20 06:16:27 2021 -0700
> >
> > rcu: Mark accesses to rcu_state.n_force_qs
> >
> > This commit marks accesses to the rcu_state.n_force_qs. These data
> > races are hard to make happen, but syzkaller was equal to the task.
> >
> > Reported-by: [email protected]
> > Signed-off-by: Paul E. McKenney <[email protected]>
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index a7379c44a2366..245bca7cdf6ee 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -1913,7 +1913,7 @@ static void rcu_gp_fqs(bool first_time)
> > struct rcu_node *rnp = rcu_get_root();
> >
> > WRITE_ONCE(rcu_state.gp_activity, jiffies);
> > - rcu_state.n_force_qs++;
> > + WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
> > if (first_time) {
> > /* Collect dyntick-idle snapshots. */
> > force_qs_rnp(dyntick_save_progress_counter);
> > @@ -2556,7 +2556,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> > /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
> > if (count == 0 && rdp->qlen_last_fqs_check != 0) {
> > rdp->qlen_last_fqs_check = 0;
> > - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> > + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> > } else if (count < rdp->qlen_last_fqs_check - qhimark)
> > rdp->qlen_last_fqs_check = count;
> >
> > @@ -2904,10 +2904,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
> > } else {
> > /* Give the grace period a kick. */
> > rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
> > - if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
> > + if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
> > rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
> > rcu_force_quiescent_state();
> > - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> > + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> > rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
> > }
> > }
> > @@ -4134,7 +4134,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
> > /* Set up local state, ensuring consistent view of global state. */
> > raw_spin_lock_irqsave_rcu_node(rnp, flags);
> > rdp->qlen_last_fqs_check = 0;
> > - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> > + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> > rdp->blimit = blimit;
> > rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
> > rcu_dynticks_eqs_online();
On Wed, Jul 21, 2021 at 12:19:02AM +0800, Zhouyi Zhou wrote:
> On Tue, Jul 20, 2021 at 10:16 PM Marco Elver <[email protected]> wrote:
> >
> > On Tue, 20 Jul 2021 at 15:18, Paul E. McKenney <[email protected]> wrote:
> > [...]
> > > Good catch! And yes, this would be hard to reproduce.
> > >
> > > How about as shown below?
> >
> > Acked-by: Marco Elver <[email protected]>
> >
> > I was merely a little surprised syzbot was able to exercise RCU in a
> > way that resulted in a data race your torture runs hadn't found yet
> > (or perhaps it did and missed?).
>
> I think rcu_state.n_force_qs is used to give grace period a kick. In worst
> case, the data race will cause the grace period to miss at most one
> kick, but the grace
> period will get kicked soon because of too many callbacks, I guess that's
> why rcu torture will not find this data race.
In theory, rcutorture -could- find it, but it would require a call_rcu()
or similar just at the beginning of the grace period. This is a tough
target to hit.
Thanx, Paul
> Many thanks
> Zhouyi
> >
> > Thanks,
> > -- Marco
> >
> > > Thanx, Paul
> > >
> > > ------------------------------------------------------------------------
> > >
> > > commit 43e0f01f3b6f510dbe31d02a8f4c909c45deff04
> > > Author: Paul E. McKenney <[email protected]>
> > > Date: Tue Jul 20 06:16:27 2021 -0700
> > >
> > > rcu: Mark accesses to rcu_state.n_force_qs
> > >
> > > This commit marks accesses to the rcu_state.n_force_qs. These data
> > > races are hard to make happen, but syzkaller was equal to the task.
> > >
> > > Reported-by: [email protected]
> > > Signed-off-by: Paul E. McKenney <[email protected]>
> > >
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index a7379c44a2366..245bca7cdf6ee 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -1913,7 +1913,7 @@ static void rcu_gp_fqs(bool first_time)
> > > struct rcu_node *rnp = rcu_get_root();
> > >
> > > WRITE_ONCE(rcu_state.gp_activity, jiffies);
> > > - rcu_state.n_force_qs++;
> > > + WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
> > > if (first_time) {
> > > /* Collect dyntick-idle snapshots. */
> > > force_qs_rnp(dyntick_save_progress_counter);
> > > @@ -2556,7 +2556,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
> > > /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
> > > if (count == 0 && rdp->qlen_last_fqs_check != 0) {
> > > rdp->qlen_last_fqs_check = 0;
> > > - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> > > + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> > > } else if (count < rdp->qlen_last_fqs_check - qhimark)
> > > rdp->qlen_last_fqs_check = count;
> > >
> > > @@ -2904,10 +2904,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
> > > } else {
> > > /* Give the grace period a kick. */
> > > rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
> > > - if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
> > > + if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
> > > rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
> > > rcu_force_quiescent_state();
> > > - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> > > + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> > > rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
> > > }
> > > }
> > > @@ -4134,7 +4134,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
> > > /* Set up local state, ensuring consistent view of global state. */
> > > raw_spin_lock_irqsave_rcu_node(rnp, flags);
> > > rdp->qlen_last_fqs_check = 0;
> > > - rdp->n_force_qs_snap = rcu_state.n_force_qs;
> > > + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
> > > rdp->blimit = blimit;
> > > rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
> > > rcu_dynticks_eqs_online();