Hello!
This series provides miscellaneous fixes:
1. Make non-production PREEMPT cond_resched() help Tasks RCU.
2.	Inline rcu_preempt_do_callbacks() into its sole caller, courtesy
of Byungchul Park.
3. Don't allocate rcu_nocb_mask if no one needs it, for example, in
the case where there are no no-CBs CPUs.
4. Call wake_nocb_leader_defer() with 'FORCE' when nocb_q_count is
high, courtesy of Byungchul Park.
5. Remove deprecated RCU debugfs tracing code, courtesy of Byungchul
Park.
6. Rename cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs() in
order to reflect its actual current use case.
7. Eliminate unused cond_resched_softirq() macro.
8. Move __rcu_read_lock() and __rcu_read_unlock() to tree_plugin.h.
9. Update rcu_bind_gp_kthread() header comment.
10. Declare rcu_eqs_special_set() in public header, courtesy of
Yury Norov.
11. Add cleanup_srcu_struct_quiesced().
12. Avoid flush dependency in delete controller flow, which is
included in this series due to its need for the shiny new
cleanup_srcu_struct_quiesced() function.
13. Exclude near-simultaneous RCU CPU stall warnings.
14. Add leaf-node macros to simplify code.
15. Ensure whatisRCU.txt actually says what RCU is, courtesy of
Paul Gortmaker.
16-21: Debugging commits that may or may not be submitted for inclusion,
depending on how helpful they prove to be.
22: Use the proper lockdep annotation in dump_blkd_tasks(),
courtesy of Boqun Feng.
Thanx, Paul
------------------------------------------------------------------------
Documentation/RCU/whatisRCU.txt | 2
drivers/nvme/host/core.c | 2
include/linux/rcupdate.h | 4 -
include/linux/rcutree.h | 1
include/linux/sched.h | 14 ++--
include/linux/srcu.h | 36 ++++++++++++
kernel/rcu/rcu.h | 11 ++-
kernel/rcu/rcuperf.c | 2
kernel/rcu/rcutorture.c | 7 ++
kernel/rcu/srcutiny.c | 9 ++-
kernel/rcu/srcutree.c | 30 +++++-----
kernel/rcu/tree.c | 76 ++++++++++++++------------
kernel/rcu/tree.h | 8 --
kernel/rcu/tree_exp.h | 13 +---
kernel/rcu/tree_plugin.h | 116 +++++++++++++++++++++++++++++++---------
kernel/rcu/update.c | 50 -----------------
kernel/sched/core.c | 14 ----
kernel/softirq.c | 3 -
kernel/torture.c | 2
kernel/trace/trace_benchmark.c | 4 -
20 files changed, 232 insertions(+), 172 deletions(-)
In CONFIG_PREEMPT=y kernels, cond_resched() is a complete no-op, and
thus cannot help advance Tasks-RCU grace periods. However, such grace
periods are only an issue in non-production benchmarking runs of the
Linux kernel. This commit therefore makes cond_resched() invoke
rcu_note_voluntary_context_switch() for kernels implementing Tasks RCU
even in CONFIG_PREEMPT=y kernels.
Reported-by: Steven Rostedt <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
include/linux/sched.h | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b3d697f3b573..fe8f7178a22d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1617,6 +1617,12 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
*/
#ifndef CONFIG_PREEMPT
extern int _cond_resched(void);
+#elif defined(CONFIG_TRACEPOINT_BENCHMARK)
+static inline int _cond_resched(void)
+{
+ rcu_note_voluntary_context_switch(current);
+ return 0;
+}
#else
static inline int _cond_resched(void) { return 0; }
#endif
--
2.5.2
From: Byungchul Park <[email protected]>
The rcu_preempt_do_callbacks() function was introduced in commit
09223371dea ("rcu: Use softirq to address performance regression"), where it
was necessary to handle kernel builds both containing and not containing
RCU-preempt. Since then, various changes (most notably f8b7fc6b51
("rcu: use softirq instead of kthreads except when RCU_BOOST=y")) have
resulted in this function being invoked only from rcu_kthread_do_work(),
which is present only in kernels containing RCU-preempt, which in turn
means that the rcu_preempt_do_callbacks() function is no longer needed.
This commit therefore inlines rcu_preempt_do_callbacks() into its
sole remaining caller and also removes the rcu_state_p and rcu_data_p
indirection for added clarity.
Signed-off-by: Byungchul Park <[email protected]>
Reviewed-by: Steven Rostedt (VMware) <[email protected]>
[ paulmck: Remove the rcu_state_p and rcu_data_p indirection. ]
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree.h | 1 -
kernel/rcu/tree_plugin.h | 11 +----------
2 files changed, 1 insertion(+), 11 deletions(-)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f491ab4f2e8e..3a0dc30100e8 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -438,7 +438,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
#ifdef CONFIG_RCU_BOOST
-static void rcu_preempt_do_callbacks(void);
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 84fbee4686d3..6c5df18bbf2f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -685,15 +685,6 @@ static void rcu_preempt_check_callbacks(void)
t->rcu_read_unlock_special.b.need_qs = true;
}
-#ifdef CONFIG_RCU_BOOST
-
-static void rcu_preempt_do_callbacks(void)
-{
- rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
@@ -1140,7 +1131,7 @@ static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
- rcu_preempt_do_callbacks();
+ rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
}
static void rcu_cpu_kthread_setup(unsigned int cpu)
--
2.5.2
From: Nitzan Carmi <[email protected]>
The nvme_delete_ctrl() function queues a work item on a MEM_RECLAIM
queue (nvme_delete_wq), which eventually calls cleanup_srcu_struct(),
which in turn flushes a delayed work from an !MEM_RECLAIM queue. This
is unsafe as we might trigger deadlocks under severe memory pressure.
Since we don't ever invoke call_srcu(), it is safe to use the shiny new
_quiesced() version of srcu cleanup, thus avoiding that flush dependency.
This commit makes that change.
Signed-off-by: Nitzan Carmi <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
Reviewed-by: Max Gurtovoy <[email protected]>
---
drivers/nvme/host/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9df4f71e58ca..c3cea8a29843 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -349,7 +349,7 @@ static void nvme_free_ns_head(struct kref *ref)
nvme_mpath_remove_disk(head);
ida_simple_remove(&head->subsys->ns_ida, head->instance);
list_del_init(&head->entry);
- cleanup_srcu_struct(&head->srcu);
+ cleanup_srcu_struct_quiesced(&head->srcu);
kfree(head);
}
--
2.5.2
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree_plugin.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cdfd39f58368..0dbf813f371b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -697,7 +697,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
struct task_struct *t;
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
- WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+ dump_blkd_tasks(rnp, 10);
if (rcu_preempt_has_tasks(rnp)) {
rnp->gp_tasks = rnp->blkd_tasks.next;
t = container_of(rnp->gp_tasks, struct task_struct,
--
2.5.2
The WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp()) in
rcu_gp_cleanup() triggers (inexplicably, of course) every so often.
This commit therefore extracts more information.
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree.c | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 08b72dd02d88..6ad2ac34558f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2147,7 +2147,22 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
raw_spin_lock_irq_rcu_node(rnp);
- WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+#ifdef CONFIG_PREEMPT_RCU
+ if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) {
+ int i;
+ struct task_struct *t;
+
+ pr_info("%s: grp: %d-%d level: %d ->gp_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->gp_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks));
+ pr_cont("\t->blkd_tasks");
+ i = 0;
+ list_for_each_entry(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" %p", t);
+ if (++i >= 10)
+ break;
+ }
+ pr_cont("\n");
+ }
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->completed, rsp->gpnum);
rdp = this_cpu_ptr(rsp->rda);
--
2.5.2
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree_plugin.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0dbf813f371b..429f9358d0b6 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -852,7 +852,7 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
struct list_head *lhp;
lockdep_assert_held(&rnp->lock);
- pr_info("%s: grp: %d-%d level: %d ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks));
+	pr_info("%s: grp: %d-%d level: %d ->qsmask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks));
pr_cont("\t->blkd_tasks");
i = 0;
list_for_each(lhp, &rnp->blkd_tasks) {
--
2.5.2
Commit e31d28b6ab8f ("trace: Eliminate cond_resched_rcu_qs() in favor
of cond_resched()") substituted cond_resched() for the earlier call
to cond_resched_rcu_qs(). However, the new-age cond_resched() does
not do anything to help RCU-tasks grace periods because (1) RCU-tasks
is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a
complete no-op when preemption is enabled. This situation results
in hangs when running the trace benchmarks.
A number of potential fixes were discussed on LKML
(https://lkml.kernel.org/r/[email protected]),
including making cond_resched() not be a no-op; making cond_resched()
not be a no-op, but only when running tracing benchmarks; reverting
the aforementioned commit (which works because cond_resched_rcu_qs()
does provide an RCU-tasks quiescent state); and adding a call to the
scheduler/RCU rcu_note_voluntary_context_switch() function. All were
deemed unsatisfactory, either due to added cond_resched() overhead or
due to magic functions inviting cargo culting.
This commit renames cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs(),
which provides a clear hint as to what this function is doing and
why and where it should be used, and then replaces the call to
cond_resched() with cond_resched_tasks_rcu_qs() in the trace benchmark's
benchmark_event_kthread() function.
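As a purely illustrative sketch (the loop and its helper below are hypothetical,
not part of this patch), the renamed macro is intended for long-running in-kernel
loops that must keep Tasks-RCU grace periods advancing:
	while (!kthread_should_stop()) {
		do_one_unit_of_work();		/* hypothetical helper */
		cond_resched_tasks_rcu_qs();	/* maybe reschedule, report Tasks-RCU QS */
	}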
Reported-by: Steven Rostedt <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
include/linux/rcupdate.h | 4 ++--
kernel/rcu/rcuperf.c | 2 +-
kernel/rcu/tree.c | 20 ++++++++++----------
kernel/rcu/tree_plugin.h | 4 ++--
kernel/rcu/update.c | 2 +-
kernel/torture.c | 2 +-
kernel/trace/trace_benchmark.c | 4 ++--
7 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 36360d07f25b..19d235fefdb9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -188,13 +188,13 @@ static inline void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
/**
- * cond_resched_rcu_qs - Report potential quiescent states to RCU
+ * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
*
* This macro resembles cond_resched(), except that it is defined to
* report potential quiescent states to RCU-tasks even if the cond_resched()
* machinery were to be shut off, as some advocate for PREEMPT kernels.
*/
-#define cond_resched_rcu_qs() \
+#define cond_resched_tasks_rcu_qs() \
do { \
if (!cond_resched()) \
rcu_note_voluntary_context_switch_lite(current); \
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 777e7a6a0292..e232846516b3 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -369,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void)
*/
static void rcu_perf_wait_shutdown(void)
{
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters)
return;
while (!torture_must_stop())
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2a734692a581..c4db0e20b035 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1234,10 +1234,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
}
/*
- * Has this CPU encountered a cond_resched_rcu_qs() since the
- * beginning of the grace period? For this to be the case,
- * the CPU has to have noticed the current grace period. This
- * might not be the case for nohz_full CPUs looping in the kernel.
+ * Has this CPU encountered a cond_resched() since the beginning
+ * of the grace period? For this to be the case, the CPU has to
+ * have noticed the current grace period. This might not be the
+ * case for nohz_full CPUs looping in the kernel.
*/
jtsq = jiffies_till_sched_qs;
ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
@@ -2049,7 +2049,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq_rcu_node(rnp);
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
}
@@ -2151,7 +2151,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq_rcu_node(rnp);
rcu_nocb_gp_cleanup(sq);
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
rcu_gp_slow(rsp, gp_cleanup_delay);
}
@@ -2202,7 +2202,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
@@ -2247,7 +2247,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum),
TPS("fqsend"));
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
ret = 0; /* Force full wait till next FQS. */
j = jiffies_till_next_fqs;
@@ -2260,7 +2260,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
} else {
/* Deal with stray signal. */
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
WARN_ON(signal_pending(current));
trace_rcu_grace_period(rsp->name,
@@ -2782,7 +2782,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 12774c4fb546..dfc42c3d52d4 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1598,7 +1598,7 @@ static int rcu_oom_notify(struct notifier_block *self,
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
}
/* Unconditionally decrement: no need to wake ourselves up. */
@@ -2227,7 +2227,7 @@ static int rcu_nocb_kthread(void *arg)
cl++;
c++;
local_bh_enable();
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
list = next;
}
trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 68fa19a5e7bd..e401960c7f51 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -624,7 +624,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
* grace period has elapsed, in other words after all currently
* executing rcu-tasks read-side critical sections have elapsed. These
* read-side critical sections are delimited by calls to schedule(),
- * cond_resched_rcu_qs(), idle execution, userspace execution, calls
+ * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
* to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
*
* This is a very specialized primitive, intended only for a few uses in
diff --git a/kernel/torture.c b/kernel/torture.c
index 37b94012a3f8..3de1efbecd6a 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -574,7 +574,7 @@ void stutter_wait(const char *title)
{
int spt;
- cond_resched_rcu_qs();
+ cond_resched_tasks_rcu_qs();
spt = READ_ONCE(stutter_pause_test);
for (; spt; spt = READ_ONCE(stutter_pause_test)) {
if (spt == 1) {
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 22fee766081b..80e0b2aca703 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg)
* wants to run, schedule in, but if the CPU is idle,
* we'll keep burning cycles.
*
- * Note the _rcu_qs() version of cond_resched() will
+ * Note the tasks_rcu_qs() version of cond_resched() will
* notify synchronize_rcu_tasks() that this thread has
* passed a quiescent state for rcu_tasks. Otherwise
* this thread will never voluntarily schedule which would
* block synchronize_rcu_tasks() indefinitely.
*/
- cond_resched();
+ cond_resched_tasks_rcu_qs();
}
return 0;
--
2.5.2
The __rcu_read_lock() and __rcu_read_unlock() functions were moved
to kernel/rcu/update.c in order to implement tiny preemptible RCU.
However, tiny preemptible RCU was removed from the kernel a long time
ago, so this commit belatedly moves them back into the only remaining
preemptible-RCU code.
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree_plugin.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
kernel/rcu/update.c | 48 ------------------------------------------------
2 files changed, 44 insertions(+), 48 deletions(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dfc42c3d52d4..ee24a5de0503 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -384,6 +384,50 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
}
/*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+ current->rcu_read_lock_nesting++;
+ barrier(); /* critical section after entry code. */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+ struct task_struct *t = current;
+
+ if (t->rcu_read_lock_nesting != 1) {
+ --t->rcu_read_lock_nesting;
+ } else {
+ barrier(); /* critical section before exit code. */
+ t->rcu_read_lock_nesting = INT_MIN;
+ barrier(); /* assign before ->rcu_read_unlock_special load */
+ if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
+ rcu_read_unlock_special(t);
+ barrier(); /* ->rcu_read_unlock_special load before assign */
+ t->rcu_read_lock_nesting = 0;
+ }
+#ifdef CONFIG_PROVE_LOCKING
+ {
+ int rrln = READ_ONCE(t->rcu_read_lock_nesting);
+
+ WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+ }
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
* Advance a ->blkd_tasks-list pointer to the next entry, instead
* returning NULL if at the end of the list.
*/
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e401960c7f51..4c230a60ece4 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -226,54 +226,6 @@ core_initcall(rcu_set_runtime_mode);
#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
-#ifdef CONFIG_PREEMPT_RCU
-
-/*
- * Preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
- current->rcu_read_lock_nesting++;
- barrier(); /* critical section after entry code. */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-/*
- * Preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
- struct task_struct *t = current;
-
- if (t->rcu_read_lock_nesting != 1) {
- --t->rcu_read_lock_nesting;
- } else {
- barrier(); /* critical section before exit code. */
- t->rcu_read_lock_nesting = INT_MIN;
- barrier(); /* assign before ->rcu_read_unlock_special load */
- if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
- rcu_read_unlock_special(t);
- barrier(); /* ->rcu_read_unlock_special load before assign */
- t->rcu_read_lock_nesting = 0;
- }
-#ifdef CONFIG_PROVE_LOCKING
- {
- int rrln = READ_ONCE(t->rcu_read_lock_nesting);
-
- WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
- }
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
--
2.5.2
The current cleanup_srcu_struct() flushes work, which prevents it
from being invoked from some workqueue contexts, as well as from
atomic (non-blocking) contexts. This patch therefore introduces a
cleanup_srcu_struct_quiesced(), which can be invoked only after all
activity on the specified srcu_struct has completed. This restriction
allows cleanup_srcu_struct_quiesced() to be invoked from workqueue
contexts as well as from atomic contexts.
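As a hedged usage sketch (the srcu_struct and teardown function below are
hypothetical, not part of this patch), a user that has queued callbacks via
call_srcu() would first drain them with srcu_barrier(), after which the quiesced
variant is safe even where flushing workqueues is not an option:
	static struct srcu_struct my_srcu;	/* hypothetical example structure */
	static void my_teardown(void)
	{
		srcu_barrier(&my_srcu);			/* wait for any call_srcu() callbacks */
		cleanup_srcu_struct_quiesced(&my_srcu);	/* no flushing, so safe here */
	}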
Suggested-by: Christoph Hellwig <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
Tested-by: Nitzan Carmi <[email protected]>
---
include/linux/srcu.h | 36 +++++++++++++++++++++++++++++++++++-
kernel/rcu/rcutorture.c | 7 ++++++-
kernel/rcu/srcutiny.c | 9 ++++++---
kernel/rcu/srcutree.c | 30 +++++++++++++++++-------------
4 files changed, 64 insertions(+), 18 deletions(-)
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 33c1c698df09..91494d7e8e41 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -69,11 +69,45 @@ struct srcu_struct { };
void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
void (*func)(struct rcu_head *head));
-void cleanup_srcu_struct(struct srcu_struct *sp);
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced);
int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
void synchronize_srcu(struct srcu_struct *sp);
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+static inline void cleanup_srcu_struct(struct srcu_struct *sp)
+{
+ _cleanup_srcu_struct(sp, false);
+}
+
+/**
+ * cleanup_srcu_struct_quiesced - deconstruct a quiesced sleep-RCU structure
+ * @sp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory. Also,
+ * all grace-period processing must have completed.
+ *
+ * "Completed" means that the last synchronize_srcu() and
+ * synchronize_srcu_expedited() calls must have returned before the call
+ * to cleanup_srcu_struct_quiesced(). It also means that the callback
+ * from the last call_srcu() must have been invoked before the call to
+ * cleanup_srcu_struct_quiesced(), but you can use srcu_barrier() to help
+ * with this last. Violating these rules will get you a WARN_ON() splat
+ * (with high probability, anyway), and will also cause the srcu_struct
+ * to be leaked.
+ */
+static inline void cleanup_srcu_struct_quiesced(struct srcu_struct *sp)
+{
+ _cleanup_srcu_struct(sp, true);
+}
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 680c96d8c00f..f0e1d44459f8 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -593,7 +593,12 @@ static void srcu_torture_init(void)
static void srcu_torture_cleanup(void)
{
- cleanup_srcu_struct(&srcu_ctld);
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ if (torture_random(&rand) & 0x800)
+ cleanup_srcu_struct(&srcu_ctld);
+ else
+ cleanup_srcu_struct_quiesced(&srcu_ctld);
srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
}
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 76ac5f50b2c7..622792abe41a 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
* Must invoke this after you are finished using a given srcu_struct that
* was initialized via init_srcu_struct(), else you leak memory.
*/
-void cleanup_srcu_struct(struct srcu_struct *sp)
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
{
WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
- flush_work(&sp->srcu_work);
+ if (quiesced)
+ WARN_ON(work_pending(&sp->srcu_work));
+ else
+ flush_work(&sp->srcu_work);
WARN_ON(sp->srcu_gp_running);
WARN_ON(sp->srcu_gp_waiting);
WARN_ON(sp->srcu_cb_head);
WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
/*
* Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index fb560fca9ef4..b4123d7a2cec 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
return SRCU_INTERVAL;
}
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
+/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
{
int cpu;
if (WARN_ON(!srcu_get_delay(sp)))
- return; /* Leakage unless caller handles error. */
+ return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(sp)))
- return; /* Leakage unless caller handles error. */
- flush_delayed_work(&sp->work);
+ return; /* Just leak it! */
+ if (quiesced) {
+ if (WARN_ON(delayed_work_pending(&sp->work)))
+ return; /* Just leak it! */
+ } else {
+ flush_delayed_work(&sp->work);
+ }
for_each_possible_cpu(cpu)
- flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+ if (quiesced) {
+ if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
+ return; /* Just leak it! */
+ } else {
+ flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+ }
if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
WARN_ON(srcu_readers_active(sp))) {
pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
@@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
free_percpu(sp->sda);
sp->sda = NULL;
}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
/*
* Counts the new reader in the appropriate per-CPU element of the
--
2.5.2
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree_plugin.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a2ae6b49f900..cdfd39f58368 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -851,7 +851,7 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
struct list_head *lhp;
lockdep_assert_held(&rnp->lock);
- pr_info("%s: grp: %d-%d level: %d ->gp_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->gp_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks));
+ pr_info("%s: grp: %d-%d level: %d ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks));
pr_cont("\t->blkd_tasks");
i = 0;
list_for_each(lhp, &rnp->blkd_tasks) {
--
2.5.2
From: Byungchul Park <[email protected]>
If an excessive number of callbacks have been queued, but the NOCB
leader kthread's wakeup must be deferred, then we should wake up the
leader unconditionally once it is safe to do so.
This was handled correctly in commit fbce7497ee ("rcu: Parallelize and
economize NOCB kthread wakeups"), but then commit 8be6e1b15c ("rcu:
Use timer as backstop for NOCB deferred wakeups") passed RCU_NOCB_WAKE
instead of the correct RCU_NOCB_WAKE_FORCE to wake_nocb_leader_defer().
As an interesting aside, RCU_NOCB_WAKE_FORCE is never passed to anything,
which should have been taken as a hint. ;-)
This commit therefore passes RCU_NOCB_WAKE_FORCE instead of RCU_NOCB_WAKE
to wake_nocb_leader_defer() when a callback is queued onto a NOCB CPU
that already has an excessive number of callbacks pending.
Signed-off-by: Byungchul Park <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree_plugin.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3566e4f6dfcc..12774c4fb546 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1957,7 +1957,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
TPS("WakeOvfIsDeferred"));
}
rdp->qlen_last_fqs_check = LONG_MAX / 2;
--
2.5.2
There is a two-jiffy delay between the time that a CPU will self-report
an RCU CPU stall warning and the time that some other CPU will report a
warning on behalf of the first CPU. This has worked well in the past,
but on busy systems, it is possible for the two warnings to overlap,
which makes interpreting them extremely difficult.
This commit therefore uses a cmpxchg-based timing decision that
allows only one report in a given one-minute period (assuming default
stall-warning Kconfig parameters). This approach will of course fail
if you are seeing minute-long vCPU preemption, but in that case the
overlapping RCU CPU stall warnings are the least of your worries.
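Condensed from the diff below (sketch only, reusing its local variables), the idea
is that a reporting CPU must first win a cmpxchg() race to advance ->jiffies_stall,
so at most one CPU reports per stall-warning interval:
	jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
	if (cmpxchg(&rsp->jiffies_stall, js, jn) == js)
		print_cpu_stall(rsp);	/* This CPU won the race, so it reports. */
	/* CPUs losing the race skip the report until ->jiffies_stall next expires. */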
Reported-by: Dmitry Vyukov <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c4db0e20b035..19d9475d74f2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1429,8 +1429,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
- WRITE_ONCE(rsp->jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/*
@@ -1481,6 +1479,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
sched_show_task(current);
}
}
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
+ WRITE_ONCE(rsp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
rcu_check_gp_kthread_starvation(rsp);
@@ -1525,6 +1527,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ /* Rewrite if needed in case of slow consoles. */
if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
WRITE_ONCE(rsp->jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
@@ -1548,6 +1551,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
unsigned long gpnum;
unsigned long gps;
unsigned long j;
+ unsigned long jn;
unsigned long js;
struct rcu_node *rnp;
@@ -1586,14 +1590,17 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
if (rcu_gp_in_progress(rsp) &&
- (READ_ONCE(rnp->qsmask) & rdp->grpmask)) {
+ (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+ cmpxchg(&rsp->jiffies_stall, js, jn) == js) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
} else if (rcu_gp_in_progress(rsp) &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
+ ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
+ cmpxchg(&rsp->jiffies_stall, js, jn) == js) {
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(rsp, gpnum);
--
2.5.2
The ->gp_tasks pointer must not be set unless the current rcu_node structure
is waiting on at least one quiescent state.
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree.c | 2 ++
kernel/rcu/tree.h | 1 +
kernel/rcu/tree_plugin.h | 6 +++++-
3 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3bfd1519d378..d275c49aa0e8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2419,6 +2419,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
+ rnp->completedqs = rnp->gpnum;
mask = rnp->grpmask;
if (rnp->parent == NULL) {
@@ -4035,6 +4036,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
&rcu_fqs_class[i], fqs[i]);
rnp->gpnum = rsp->gpnum;
rnp->completed = rsp->completed;
+ rnp->completedqs = rsp->completed;
rnp->qsmask = 0;
rnp->qsmaskinit = 0;
rnp->grplo = j * cpustride;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 23ae7c00d3c5..f03da0b091fb 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -79,6 +79,7 @@ struct rcu_node {
unsigned long completed; /* Last GP completed for this node. */
/* This will either be equal to or one */
/* behind the root rcu_node's gpnum. */
+ unsigned long completedqs; /* All QSes done for this node. */
unsigned long qsmask; /* CPUs or groups that need to switch in */
/* order for current grace period to proceed.*/
/* In leaf rcu_node, each bit corresponds to */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 429f9358d0b6..8014a09a5c9b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -260,8 +260,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* ->exp_tasks pointers, respectively, to reference the newly
* blocked tasks.
*/
- if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD))
+ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
rnp->gp_tasks = &t->rcu_node_entry;
+ WARN_ON_ONCE(rnp->completedqs == rnp->gpnum);
+ }
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
@@ -535,6 +537,8 @@ void rcu_read_unlock_special(struct task_struct *t)
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
+ WARN_ON_ONCE(rnp->completedqs == rnp->gpnum &&
+ (!empty_norm || rnp->qsmask));
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
--
2.5.2
From: Boqun Feng <[email protected]>
Sparse reported this:
| kernel/rcu/tree_plugin.h:814:9: warning: incorrect type in argument 1 (different modifiers)
| kernel/rcu/tree_plugin.h:814:9: expected struct lockdep_map const *lock
| kernel/rcu/tree_plugin.h:814:9: got struct lockdep_map [noderef] *<noident>
This is caused by using vanilla lockdep annotations on rcu_node::lock,
and that requires accessing ->lock of rcu_node directly. However, we need
to keep rcu_node::lock __private to avoid breaking its extra ordering
guarantee. And we have a dedicated lockdep annotation for
rcu_node::lock, so use it.
Signed-off-by: Boqun Feng <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree_plugin.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8014a09a5c9b..52bd92acbe6b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -855,7 +855,7 @@ static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
int i;
struct list_head *lhp;
- lockdep_assert_held(&rnp->lock);
+ raw_lockdep_assert_held_rcu_node(rnp);
pr_info("%s: grp: %d-%d level: %d ->qamask %#lx ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->qsmask, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks));
pr_cont("\t->blkd_tasks");
i = 0;
--
2.5.2
The header comment for rcu_bind_gp_kthread() refers to sysidle, which
is no longer with us. However, it is still important to bind RCU's
grace-period kthreads to the housekeeping CPU(s), so rather than remove
rcu_bind_gp_kthread(), this commit updates the comment.
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree_plugin.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ee24a5de0503..d37b9bb3f481 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2622,8 +2622,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
}
/*
- * Bind the grace-period kthread for the sysidle flavor of RCU to the
- * timekeeping CPU.
+ * Bind the RCU grace-period kthreads to the housekeeping CPU.
*/
static void rcu_bind_gp_kthread(void)
{
--
2.5.2
From: Byungchul Park <[email protected]>
Commit ae91aa0adb14 ("rcu: Remove debugfs tracing") removed the
RCU debugfs tracing code, but did not remove the no-longer-used
->exp_workdone{0,1,2,3} fields in the rcu_data structure. This commit
therefore removes these fields along with the code that uselessly
updates them.
Signed-off-by: Byungchul Park <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree.h | 4 ----
kernel/rcu/tree_exp.h | 13 +++++--------
2 files changed, 5 insertions(+), 12 deletions(-)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 3a0dc30100e8..5fd374c71404 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -224,10 +224,6 @@ struct rcu_data {
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
- atomic_long_t exp_workdone0; /* # done by workqueue. */
- atomic_long_t exp_workdone1; /* # done by others #1. */
- atomic_long_t exp_workdone2; /* # done by others #2. */
- atomic_long_t exp_workdone3; /* # done by others #3. */
int exp_dynticks_snap; /* Double-check need for IPI. */
/* 6) Callback offloading. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index f72eefab8543..f512dd4e57a8 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -248,14 +248,12 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
}
/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
- unsigned long s)
+static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s)
{
if (rcu_exp_gp_seq_done(rsp, s)) {
trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
/* Ensure test happens before caller kfree(). */
smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(stat);
return true;
}
return false;
@@ -289,7 +287,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
* promoting locality and is not strictly needed for correctness.
*/
for (; rnp != NULL; rnp = rnp->parent) {
- if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+ if (sync_exp_work_done(rsp, s))
return true;
/* Work not done, either wait here or go up. */
@@ -302,8 +300,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
rnp->grplo, rnp->grphi,
TPS("wait"));
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
- sync_exp_work_done(rsp,
- &rdp->exp_workdone2, s));
+ sync_exp_work_done(rsp, s));
return true;
}
rnp->exp_seq_rq = s; /* Followers can wait on us. */
@@ -313,7 +310,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
}
mutex_lock(&rsp->exp_mutex);
fastpath:
- if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+ if (sync_exp_work_done(rsp, s)) {
mutex_unlock(&rsp->exp_mutex);
return true;
}
@@ -633,7 +630,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
rnp = rcu_get_root(rsp);
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
- sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
+ sync_exp_work_done(rsp, s));
smp_mb(); /* Workqueue actions happen before return. */
/* Let the next expedited grace period start. */
--
2.5.2
From: Paul Gortmaker <[email protected]>
It came to my attention that the file "whatisRCU.txt" does not
manage to actually ever spell out what RCU is.
This might not be an issue for a lot of people, but we have to
assume the consumers of these documents are starting from ground
zero; otherwise they'd not be reading the docs.
Signed-off-by: Paul Gortmaker <[email protected]>
---
Documentation/RCU/whatisRCU.txt | 2 ++
1 file changed, 2 insertions(+)
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index a27fbfb0efb8..65eb856526b7 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -1,3 +1,5 @@
+What is RCU? -- "Read, Copy, Update"
+
Please note that the "What is RCU?" LWN series is an excellent place
to start learning about RCU:
--
2.5.2
The cond_resched_softirq() macro is not used anywhere in mainline, so
this commit simplifies the kernel by eliminating it.
Signed-off-by: Paul E. McKenney <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
include/linux/sched.h | 8 --------
kernel/sched/core.c | 14 --------------
kernel/softirq.c | 3 +--
3 files changed, 1 insertion(+), 24 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fe8f7178a22d..9ae6943b7310 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1613,7 +1613,6 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
* explicit rescheduling in places that are safe. The return
* value indicates whether a reschedule was done in fact.
* cond_resched_lock() will drop the spinlock before scheduling,
- * cond_resched_softirq() will enable bhs before scheduling.
*/
#ifndef CONFIG_PREEMPT
extern int _cond_resched(void);
@@ -1639,13 +1638,6 @@ extern int __cond_resched_lock(spinlock_t *lock);
__cond_resched_lock(lock); \
})
-extern int __cond_resched_softirq(void);
-
-#define cond_resched_softirq() ({ \
- ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
- __cond_resched_softirq(); \
-})
-
static inline void cond_resched_rcu(void)
{
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e10aaeebfcc..6a09e6af64b9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5012,20 +5012,6 @@ int __cond_resched_lock(spinlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_lock);
-int __sched __cond_resched_softirq(void)
-{
- BUG_ON(!in_softirq());
-
- if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
- local_bh_enable();
- preempt_schedule_common();
- local_bh_disable();
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL(__cond_resched_softirq);
-
/**
* yield - yield the current processor to other threads.
*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 177de3640c78..03981f1c39ea 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt)
}
/*
- * Special-case - softirqs can safely be enabled in
- * cond_resched_softirq(), or by __do_softirq(),
+ * Special-case - softirqs can safely be enabled by __do_softirq(),
* without processing still-pending softirqs:
*/
void _local_bh_enable(void)
--
2.5.2
From: Yury Norov <[email protected]>
Because rcu_eqs_special_set() is declared only in internal header
kernel/rcu/tree.h and stubbed in include/linux/rcutiny.h, it is
inaccessible outside of the RCU implementation. This patch therefore
moves the rcu_eqs_special_set() declaration to include/linux/rcutree.h,
which allows it to be used in non-rcu kernel code.
Signed-off-by: Yury Norov <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
---
include/linux/rcutree.h | 1 +
kernel/rcu/tree.h | 1 -
2 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index fd996cdf1833..448f20f27396 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -74,6 +74,7 @@ static inline void synchronize_rcu_bh_expedited(void)
void rcu_barrier(void);
void rcu_barrier_bh(void);
void rcu_barrier_sched(void);
+bool rcu_eqs_special_set(int cpu);
unsigned long get_state_synchronize_rcu(void);
void cond_synchronize_rcu(unsigned long oldstate);
unsigned long get_state_synchronize_sched(void);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 5fd374c71404..0b3a90ebe225 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -404,7 +404,6 @@ extern struct rcu_state rcu_preempt_state;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
-bool rcu_eqs_special_set(int cpu);
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
--
2.5.2
This commit adds rcu_first_leaf_node() that returns a pointer to
the first leaf rcu_node structure in the specified RCU flavor and an
rcu_is_leaf_node() that returns true iff the specified rcu_node structure
is a leaf. This commit also uses these macros where appropriate.
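For example (sketch lifted from the hunks below), the open-coded level test and
first-leaf computation become:
	WARN_ON_ONCE(!rcu_is_leaf_node(rnp));	/* was: rnp->level != rcu_num_lvls - 1 */
	rnp = rcu_first_leaf_node(rsp);		/* was: rsp->level[rcu_num_lvls - 1] */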
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/rcu.h | 11 ++++++++---
kernel/rcu/tree.c | 4 ++--
kernel/rcu/tree_plugin.h | 4 ++--
3 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7a693e31184a..5b5bb9ee2e20 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -270,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
}
}
+/* Returns first leaf rcu_node of the specified RCU flavor. */
+#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1])
+
+/* Is this rcu_node a leaf? */
+#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1)
+
/*
* Do a full breadth-first scan of the rcu_node structures for the
* specified rcu_state structure.
@@ -284,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* rcu_node tree with but one rcu_node structure, this loop is a no-op.
*/
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
- for ((rnp) = &(rsp)->node[0]; \
- (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
+	for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rnp); (rnp)++)
/*
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -294,7 +299,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* It is still a leaf node, even if it is also the root node.
*/
#define rcu_for_each_leaf_node(rsp, rnp) \
- for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+ for ((rnp) = rcu_first_leaf_node(rsp); \
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 19d9475d74f2..08b72dd02d88 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2405,7 +2405,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
- WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
rcu_preempt_blocked_readers_cgp(rnp));
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
@@ -4063,7 +4063,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
init_swait_queue_head(&rsp->gp_wq);
init_swait_queue_head(&rsp->expedited_wq);
- rnp = rsp->level[rcu_num_lvls - 1];
+ rnp = rcu_first_leaf_node(rsp);
for_each_possible_cpu(i) {
while (i > rnp->grphi)
rnp++;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d37b9bb3f481..b999032e9466 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -182,7 +182,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
raw_lockdep_assert_held_rcu_node(rnp);
WARN_ON_ONCE(rdp->mynode != rnp);
- WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
/*
* Decide where to queue the newly blocked task. In theory,
@@ -533,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp = t->rcu_blocked_node;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
- WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+ WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
--
2.5.2
Signed-off-by: Paul E. McKenney <[email protected]>
---
kernel/rcu/tree.c | 18 ++----------------
kernel/rcu/tree.h | 1 +
kernel/rcu/tree_plugin.h | 29 +++++++++++++++++++++++++++++
3 files changed, 32 insertions(+), 16 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6ad2ac34558f..3bfd1519d378 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2147,22 +2147,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
raw_spin_lock_irq_rcu_node(rnp);
-#ifdef CONFIG_PREEMPT_RCU
- if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) {
- int i;
- struct task_struct *t;
-
- pr_info("%s: grp: %d-%d level: %d ->gp_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->gp_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks));
- pr_cont("\t->blkd_tasks");
- i = 0;
- list_for_each_entry(t, &rnp->blkd_tasks, rcu_node_entry) {
- pr_cont(" %p", t);
- if (++i >= 10)
- break;
- }
- pr_cont("\n");
- }
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
+ if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
+ dump_blkd_tasks(rnp, 10);
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->completed, rsp->gpnum);
rdp = this_cpu_ptr(rsp->rda);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0b3a90ebe225..23ae7c00d3c5 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -428,6 +428,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_preempt_check_callbacks(void);
void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void __init __rcu_init_preempt(void);
+static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static void invoke_rcu_callbacks_kthread(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index b999032e9466..a2ae6b49f900 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -841,6 +841,27 @@ void exit_rcu(void)
__rcu_read_unlock();
}
+/*
+ * Dump the blocked-tasks state, but limit the list dump to the
+ * specified number of elements.
+ */
+static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
+{
+ int i;
+ struct list_head *lhp;
+
+ lockdep_assert_held(&rnp->lock);
+ pr_info("%s: grp: %d-%d level: %d ->gp_tasks %p ->exp_tasks %p &->blkd_tasks: %p offset: %u\n", __func__, rnp->grplo, rnp->grphi, rnp->level, rnp->gp_tasks, rnp->exp_tasks, &rnp->blkd_tasks, (unsigned int)offsetof(typeof(*rnp), blkd_tasks));
+ pr_cont("\t->blkd_tasks");
+ i = 0;
+ list_for_each(lhp, &rnp->blkd_tasks) {
+ pr_cont(" %p", lhp);
+ if (++i >= 10)
+ break;
+ }
+ pr_cont("\n");
+}
+
#else /* #ifdef CONFIG_PREEMPT_RCU */
static struct rcu_state *const rcu_state_p = &rcu_sched_state;
@@ -949,6 +970,14 @@ void exit_rcu(void)
{
}
+/*
+ * Dump the guaranteed-empty blocked-tasks state. Trust but verify.
+ */
+static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
+{
+ WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
+}
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
--
2.5.2
Commit 44c65ff2e3b0 ("rcu: Eliminate NOCBs CPU-state Kconfig options")
made allocation of rcu_nocb_mask depend only on the rcu_nocbs=,
nohz_full=, or isolcpus= kernel boot parameters. However, it failed
to change the initial value of rcu_init_nohz()'s local variable
need_rcu_nocb_mask to false, which can result in useless allocation
of an all-zero rcu_nocb_mask. This commit therefore fixes this bug by
changing the initial value of need_rcu_nocb_mask to false.
While we are in the area, also correct the error message that is printed
when someone specifies that can-never-exist CPUs should be NOCBs CPUs.
Reported-by: Byungchul Park <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
Acked-by: Byungchul Park <[email protected]>
---
kernel/rcu/tree_plugin.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6c5df18bbf2f..3566e4f6dfcc 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2283,7 +2283,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
void __init rcu_init_nohz(void)
{
int cpu;
- bool need_rcu_nocb_mask = true;
+ bool need_rcu_nocb_mask = false;
struct rcu_state *rsp;
#if defined(CONFIG_NO_HZ_FULL)
@@ -2306,7 +2306,7 @@ void __init rcu_init_nohz(void)
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
cpumask_and(rcu_nocb_mask, cpu_possible_mask,
rcu_nocb_mask);
}
--
2.5.2
On Sun, Apr 22, 2018 at 07:32:06PM -0700, Paul E. McKenney wrote:
> In CONFIG_PREEMPT=y kernels, cond_resched() is a complete no-op, and
> thus cannot help advance Tasks-RCU grace periods. However, such grace
> periods are only an issue in non-production benchmarking runs of the
> Linux kernel. This commit therefore makes cond_resched() invoke
> rcu_note_voluntary_context_switch() for kernels implementing Tasks RCU
> even in CONFIG_PREEMPT=y kernels.
I'm confused.. why is having this conditional on TRACEPOINT_BENCHMARK a
sane idea?
>
> Reported-by: Steven Rostedt <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
> ---
> include/linux/sched.h | 6 ++++++
> 1 file changed, 6 insertions(+)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index b3d697f3b573..fe8f7178a22d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1617,6 +1617,12 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
> */
> #ifndef CONFIG_PREEMPT
> extern int _cond_resched(void);
> +#elif defined(CONFIG_TRACEPOINT_BENCHMARK)
> +static inline int _cond_resched(void)
> +{
> + rcu_note_voluntary_context_switch(current);
> + return 0;
> +}
> #else
> static inline int _cond_resched(void) { return 0; }
> #endif
> --
> 2.5.2
>
On Sun, Apr 22, 2018 at 07:32:12PM -0700, Paul E. McKenney wrote:
> The cond_resched_softirq() macro is not used anywhere in mainline, so
> this commit simplifies the kernel by eliminating it.
>
> Signed-off-by: Paul E. McKenney <[email protected]>
> Cc: Ingo Molnar <[email protected]>
Fair enough,
Acked-by: Peter Zijlstra (Intel) <[email protected]>
On Sun, Apr 22, 2018 at 07:32:11PM -0700, Paul E. McKenney wrote:
> Commit e31d28b6ab8f ("trace: Eliminate cond_resched_rcu_qs() in favor
> of cond_resched()") substituted cond_resched() for the earlier call
> to cond_resched_rcu_qs(). However, the new-age cond_resched() does
> not do anything to help RCU-tasks grace periods because (1) RCU-tasks
> is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a
> complete no-op when preemption is enabled. This situation results
> in hangs when running the trace benchmarks.
But a few patches earlier you tried to address that..
On Mon, Apr 23, 2018 at 10:53:47AM +0200, Peter Zijlstra wrote:
> On Sun, Apr 22, 2018 at 07:32:11PM -0700, Paul E. McKenney wrote:
> > Commit e31d28b6ab8f ("trace: Eliminate cond_resched_rcu_qs() in favor
> > of cond_resched()") substituted cond_resched() for the earlier call
> > to cond_resched_rcu_qs(). However, the new-age cond_resched() does
> > not do anything to help RCU-tasks grace periods because (1) RCU-tasks
> > is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a
> > complete no-op when preemption is enabled. This situation results
> > in hangs when running the trace benchmarks.
>
> But a few patches earlier you tried to address that..
Yes. And all this patch does is change the name to better match
what this primitive actually does.
I could merge the two patches, if that would help.
Thanx, Paul
On Mon, Apr 23, 2018 at 10:51:27AM +0200, Peter Zijlstra wrote:
> On Sun, Apr 22, 2018 at 07:32:06PM -0700, Paul E. McKenney wrote:
> > In CONFIG_PREEMPT=y kernels, cond_resched() is a complete no-op, and
> > thus cannot help advance Tasks-RCU grace periods. However, such grace
> > periods are only an issue in non-production benchmarking runs of the
> > Linux kernel. This commit therefore makes cond_resched() invoke
> > rcu_note_voluntary_context_switch() for kernels implementing Tasks RCU
> > even in CONFIG_PREEMPT=y kernels.
>
> I'm confused.. why is having this conditional on TRACEPOINT_BENCHMARK a
> sane idea?
Because the TRACEPOINT_BENCHMARK tests are insane, a similar
level of insanity is required to make things work. Plus having this
be unconditional would not be good for performance, as 0day has been
telling me frequently over the past couple of years.
All that aside, I am very open to ideas. What would you suggest?
Thanx, Paul
> > Reported-by: Steven Rostedt <[email protected]>
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > ---
> > include/linux/sched.h | 6 ++++++
> > 1 file changed, 6 insertions(+)
> >
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index b3d697f3b573..fe8f7178a22d 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1617,6 +1617,12 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
> > */
> > #ifndef CONFIG_PREEMPT
> > extern int _cond_resched(void);
> > +#elif defined(CONFIG_TRACEPOINT_BENCHMARK)
> > +static inline int _cond_resched(void)
> > +{
> > + rcu_note_voluntary_context_switch(current);
> > + return 0;
> > +}
> > #else
> > static inline int _cond_resched(void) { return 0; }
> > #endif
> > --
> > 2.5.2
> >
>
On Mon, Apr 23, 2018 at 05:32:23AM -0700, Paul E. McKenney wrote:
> On Mon, Apr 23, 2018 at 10:53:47AM +0200, Peter Zijlstra wrote:
> > On Sun, Apr 22, 2018 at 07:32:11PM -0700, Paul E. McKenney wrote:
> > > Commit e31d28b6ab8f ("trace: Eliminate cond_resched_rcu_qs() in favor
> > > of cond_resched()") substituted cond_resched() for the earlier call
> > > to cond_resched_rcu_qs(). However, the new-age cond_resched() does
> > > not do anything to help RCU-tasks grace periods because (1) RCU-tasks
> > > is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a
> > > complete no-op when preemption is enabled. This situation results
> > > in hangs when running the trace benchmarks.
> >
> > But a few patches earlier you tried to address that..
>
> Yes. And all this patch does is change the name to better match
> what this primitive actually does.
>
> I could merge the two patches, if that would help.
I understood the earlier patch was actually changing cond_resched() to
also work for task-rcu, which resulted in it actually not being a no-op
for PREEMPT anymore.
But then here you say it doesn't because it's a no-op.
It's not about the rename.. It is about one patch changing PREEMPT
cond_resched to not be a no-op and you here justifying things because
PREEMPT cond_resched is a no-op.
On Mon, Apr 23, 2018 at 02:48:55PM +0200, Peter Zijlstra wrote:
> On Mon, Apr 23, 2018 at 05:32:23AM -0700, Paul E. McKenney wrote:
> > On Mon, Apr 23, 2018 at 10:53:47AM +0200, Peter Zijlstra wrote:
> > > On Sun, Apr 22, 2018 at 07:32:11PM -0700, Paul E. McKenney wrote:
> > > > Commit e31d28b6ab8f ("trace: Eliminate cond_resched_rcu_qs() in favor
> > > > of cond_resched()") substituted cond_resched() for the earlier call
> > > > to cond_resched_rcu_qs(). However, the new-age cond_resched() does
> > > > not do anything to help RCU-tasks grace periods because (1) RCU-tasks
> > > > is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a
> > > > complete no-op when preemption is enabled. This situation results
> > > > in hangs when running the trace benchmarks.
> > >
> > > But a few patches earlier you tried to address that..
> >
> > Yes. And all this patch does is change the name to better match
> > what this primitive actually does.
> >
> > I could merge the two patches, if that would help.
>
> I understood the earlier patch was actually changing cond_resched() to
> also work for task-rcu, which resulted in it actually not being a no-op
> for PREEMPT anymore.
>
> But then here you say it doesn't because it's a no-op.
>
> It's not about the rename.. It is about one patch changing PREEMPT
> cond_resched to not be a no-op and you here justifying things because
> PREEMPT cond_resched is a no-op.
So would it help if I instead said in the commit log that it is a
no-op in production builds of the Linux kernel?
Thanx, Paul
On Mon, Apr 23, 2018 at 1:54 AM Peter Zijlstra <[email protected]> wrote:
> On Sun, Apr 22, 2018 at 07:32:12PM -0700, Paul E. McKenney wrote:
> > The cond_resched_softirq() macro is not used anywhere in mainline, so
> > this commit simplifies the kernel by eliminating it.
> >
> > Signed-off-by: Paul E. McKenney <[email protected]>
> > Cc: Ingo Molnar <[email protected]>
> Fair enough,
> Acked-by: Peter Zijlstra (Intel) <[email protected]>
Yes, I suggested this removal in
https://www.spinics.net/lists/netdev/msg375161.html
Reviewed-by: Eric Dumazet <[email protected]>
Thanks Paul.
On Mon, 23 Apr 2018 05:40:00 -0700
"Paul E. McKenney" <[email protected]> wrote:
> > I'm confused.. why is having this conditional on TRACEPOINT_BENCHMARK a
> > sane idea?
>
> Because the TRACEPOINT_BENCHMARK tests are insane, a similar
> level of insanity is required to make things work. Plus having this
> be unconditional would not be good for performance, as 0day has been
> telling me frequently over the past couple of years.
Just for some context. The tracepoint benchmark (which should never be
enabled in any production machine), will start a thread when the
benchmark trace event is enabled. This thread will never exit (until
the trace event is disabled), and does a benchmark loop and constantly
calls "cond_resched()" to allow other tasks to run. The point is, this
thread will never have a quiescent state for task_rcu, unless we tell
rcu that cond_resched() is a quiescent state. But this is only required
because the tracepoint benchmark has this nasty thread, that is only
used for debugging and benchmarking the tracepoint (during development).
I also suggested having a direct call into RCU from the thread to tell
RCU that it entered a quiescent state, but Paul didn't like that idea
as it caused the tracepoint benchmark to call too deep into RCU
internals.
http://lkml.kernel.org/r/[email protected]
-- Steve
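To make the above concrete, here is a minimal sketch (not the actual
kernel/trace/trace_benchmark.c code; do_one_benchmark_pass() is a
hypothetical stand-in) of the kind of never-exiting benchmark kthread being
described, using the post-rename cond_resched_tasks_rcu_qs() to report the
Tasks-RCU quiescent state that a bare cond_resched() cannot provide under
CONFIG_PREEMPT=y:

/*
 * Illustrative sketch only: a benchmark kthread that loops until the
 * trace event is disabled and reports a Tasks-RCU quiescent state on
 * each pass.
 */
static int benchmark_kthread_sketch(void *arg)
{
	while (!kthread_should_stop()) {
		do_one_benchmark_pass();	/* hypothetical benchmark body */

		/*
		 * Voluntarily yield and, unlike plain cond_resched() on
		 * CONFIG_PREEMPT=y, also tell Tasks RCU that this is a
		 * quiescent state.
		 */
		cond_resched_tasks_rcu_qs();
	}
	return 0;
}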
On Mon, Apr 23, 2018 at 05:40:00AM -0700, Paul E. McKenney wrote:
> On Mon, Apr 23, 2018 at 10:51:27AM +0200, Peter Zijlstra wrote:
> > On Sun, Apr 22, 2018 at 07:32:06PM -0700, Paul E. McKenney wrote:
> > > In CONFIG_PREEMPT=y kernels, cond_resched() is a complete no-op, and
> > > thus cannot help advance Tasks-RCU grace periods. However, such grace
> > > periods are only an issue in non-production benchmarking runs of the
> > > Linux kernel. This commit therefore makes cond_resched() invoke
> > > rcu_note_voluntary_context_switch() for kernels implementing Tasks RCU
> > > even in CONFIG_PREEMPT=y kernels.
> >
> > I'm confused.. why is having this conditional on TRACEPOINT_BENCHMARK a
> > sane idea?
>
> Because the TRACEPOINT_BENCHMARK tests are insane, a similar
> level of insanity is required to make things work. Plus having this
> be unconditional would not be good for performance, as 0day has been
> telling me frequently over the past couple of years.
>
> All that aside, I am very open to ideas. What would you suggest?
Dunno; Steve, how insane is that benchmark? Is it at all possible for an
actual user to cause something like that?
Thing is, I find it very dodgy to have cond_resched() behaviour depend
on a benchmark config.
Either we should always have that (and somehow fix the performance
issues) or we should not and then have the tracepoint crud not be
insane, possibly adding a few of those cond_resched_trace_rcu_qs()
things from the later patch.
On Mon, Apr 23, 2018 at 09:47:42AM -0400, Steven Rostedt wrote:
> On Mon, 23 Apr 2018 05:40:00 -0700
> "Paul E. McKenney" <[email protected]> wrote:
>
> > > I'm confused.. why is having this conditional on TRACEPOINT_BENCHMARK a
> > > sane idea?
> >
> > Because the TRACEPOINT_BENCHMARK tests are insane, a similar
> > level of insanity is required to make things work. Plus having this
> > be unconditional would not be good for performance, as 0day has been
> > telling me frequently over the past couple of years.
>
> Just for some context. The tracepoint benchmark (which should never be
> enabled in any production machine), will start a thread when the
> benchmark trace event is enabled. This thread will never exit (until
> the trace event is disabled), and does a benchmark loop and constantly
> calls "cond_resched()" to allow other tasks to run. The point is, this
> thread will never have a quiescent state for task_rcu, unless we tell
> rcu that cond_resched() is a quiescent state. But this is only required
> because the tracepoint benchmark has this nasty thread, that is only
> used for debugging and benchmarking the tracepoint (during development).
>
> I also suggested having a direct call into RCU from the thread to tell
> RCU that it entered a quiescent state, but Paul didn't like that idea
> as it caused the tracepoint benchmark to call too deep into RCU
> internals.
>
> http://lkml.kernel.org/r/[email protected]
That thread using cond_resched_task_rcu_qs() seems like a _lot_ better
than having cond_resched() semantics change depending on random
!scheduler config parameters.
On Mon, 23 Apr 2018 16:10:38 +0200
Peter Zijlstra <[email protected]> wrote:
> > http://lkml.kernel.org/r/[email protected]
>
> That thread using cond_resched_task_rcu_qs() seems like a _lot_ better
> than having cond_resched() semantics change depending on random
> !scheduler config parameters.
Yeah, I agree. Not sure why Paul didn't push it. Maybe because I never
replied to that final email and he forgot?
Paul?
-- Steve
On Mon, Apr 23, 2018 at 10:35:29AM -0400, Steven Rostedt wrote:
> On Mon, 23 Apr 2018 16:10:38 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > > http://lkml.kernel.org/r/[email protected]
> >
> > That thread using cond_resched_task_rcu_qs() seems like a _lot_ better
> > than having cond_resched() semantics change depending on random
> > !scheduler config parameters.
>
> Yeah, I agree. Not sure why Paul didn't push it. Maybe because I never
> replied to that final email and he forgot?
>
> Paul?
Yeah, I have been a bit event-driven of late. So the thought is to keep
cond_resched() as-is and use cond_resched_tasks_rcu_qs() (that is, the
post-rename name) for the stress tests instead of the current cond_resched().
Or did I lose the thread?
Thanx, Paul
On Mon, Apr 23, 2018 at 01:25:06PM +0000, Eric Dumazet wrote:
> On Mon, Apr 23, 2018 at 1:54 AM Peter Zijlstra <[email protected]> wrote:
>
> > On Sun, Apr 22, 2018 at 07:32:12PM -0700, Paul E. McKenney wrote:
> > > The cond_resched_softirq() macro is not used anywhere in mainline, so
> > > this commit simplifies the kernel by eliminating it.
> > >
> > > Signed-off-by: Paul E. McKenney <[email protected]>
> > > Cc: Ingo Molnar <[email protected]>
>
> > Fair enough,
>
> > Acked-by: Peter Zijlstra (Intel) <[email protected]>
>
> Yes, I suggested this removal in
> https://www.spinics.net/lists/netdev/msg375161.html
>
> Reviewed-by: Eric Dumazet <[email protected]>
>
> Thanks Paul.
I added Peter's Acked-by, Eric's Reviewed-by, and on the strength of
the above URL, a Suggested-by from Eric. Thank you both!
Thanx, Paul
On Mon, Apr 23, 2018 at 08:47:07AM -0700, Paul E. McKenney wrote:
> On Mon, Apr 23, 2018 at 10:35:29AM -0400, Steven Rostedt wrote:
> > On Mon, 23 Apr 2018 16:10:38 +0200
> > Peter Zijlstra <[email protected]> wrote:
> >
> > > > http://lkml.kernel.org/r/[email protected]
> > >
> > > That thread using cond_resched_task_rcu_qs() seems like a _lot_ better
> > > than having cond_resched() semantics change depending on random
> > > !scheduler config parameters.
> >
> > Yeah, I agree. Not sure why Paul didn't push it. Maybe because I never
> > replied to that final email and he forgot?
> >
> > Paul?
>
> Yeah, I have been a bit event-driven of late. So the thought is to keep
> cond_resched() as-is and use cond_resched_tasks_rcu_qs() (that is, the
> post-rename name) for the stress tests instead of the current cond_resched().
> Or did I lose the thread?
Actually, I already use cond_resched_tasks_rcu_qs() in
benchmark_event_kthread(), so all I need to do is drop this commit.
Which I have done, thanks everyone for the review!
Thanx, Paul
On Sun, Apr 22, 2018 at 07:32:18PM -0700, Paul E. McKenney wrote:
> There is a two-jiffy delay between the time that a CPU will self-report
> an RCU CPU stall warning and the time that some other CPU will report a
> warning on behalf of the first CPU. This has worked well in the past,
> but on busy systems, it is possible for the two warnings to overlap,
> which makes interpreting them extremely difficult.
>
> This commit therefore uses a cmpxchg-based timing decision that
> allows only one report in a given one-minute period (assuming default
> stall-warning Kconfig parameters). This approach will of course fail
> if you are seeing minute-long vCPU preemption, but in that case the
> overlapping RCU CPU stall warnings are the least of your worries.
>
> Reported-by: Dmitry Vyukov <[email protected]>
> Signed-off-by: Paul E. McKenney <[email protected]>
And later testing showed that this commit had the unfortunate side-effect
of completely suppressing other-CPU reporting of RCU CPU stalls. The
patch below includes a fix, and this patch has been kicked out of the
queue for the next merge window in favor of the one following.
Thanx, Paul
------------------------------------------------------------------------
commit ed569311d8d655a72f93310dbf479ca84daa736f
Author: Paul E. McKenney <[email protected]>
Date: Mon Apr 9 11:04:46 2018 -0700
rcu: Exclude near-simultaneous RCU CPU stall warnings
There is a two-jiffy delay between the time that a CPU will self-report
an RCU CPU stall warning and the time that some other CPU will report a
warning on behalf of the first CPU. This has worked well in the past,
but on busy systems, it is possible for the two warnings to overlap,
which makes interpreting them extremely difficult.
This commit therefore uses a cmpxchg-based timing decision that
allows only one report in a given one-minute period (assuming default
stall-warning Kconfig parameters). This approach will of course fail
if you are seeing minute-long vCPU preemption, but in that case the
overlapping RCU CPU stall warnings are the least of your worries.
Reported-by: Dmitry Vyukov <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 35efe85c35b4..f066269d5b91 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1368,7 +1368,6 @@ static inline void panic_on_rcu_stall(void)
static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
int cpu;
- long delta;
unsigned long flags;
unsigned long gpa;
unsigned long j;
@@ -1381,18 +1380,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
if (rcu_cpu_stall_suppress)
return;
- /* Only let one CPU complain about others per time interval. */
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- delta = jiffies - READ_ONCE(rsp->jiffies_stall);
- if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- WRITE_ONCE(rsp->jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
/*
* OK, time to rat on our buddy...
* See Documentation/RCU/stallwarn.txt for info on how to debug
@@ -1441,6 +1428,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
sched_show_task(current);
}
}
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
+ WRITE_ONCE(rsp->jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
rcu_check_gp_kthread_starvation(rsp);
@@ -1485,6 +1476,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ /* Rewrite if needed in case of slow consoles. */
if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
WRITE_ONCE(rsp->jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
@@ -1508,6 +1500,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
unsigned long gpnum;
unsigned long gps;
unsigned long j;
+ unsigned long jn;
unsigned long js;
struct rcu_node *rnp;
@@ -1546,14 +1539,17 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
if (rcu_gp_in_progress(rsp) &&
- (READ_ONCE(rnp->qsmask) & rdp->grpmask)) {
+ (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+ cmpxchg(&rsp->jiffies_stall, js, jn) == js) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
} else if (rcu_gp_in_progress(rsp) &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
+ ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
+ cmpxchg(&rsp->jiffies_stall, js, jn) == js) {
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(rsp, gpnum);
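The key change in the hunks above: whichever caller wins a cmpxchg() that
advances ->jiffies_stall from the value it sampled gets to emit the report,
so at most one stall report is issued per stall-warning interval. A minimal
standalone sketch of that pattern (simplified, not the exact tree.c code):

/*
 * Simplified sketch of the cmpxchg-based exclusion used above: only
 * the caller that successfully advances jiffies_stall from the value
 * it sampled (js) goes on to report the stall; racing callers lose
 * the cmpxchg() and return without printing.
 */
static bool claim_stall_report(unsigned long *jiffies_stall, unsigned long js)
{
	unsigned long jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;

	/* At most one caller per sampled js value sees this succeed. */
	return cmpxchg(jiffies_stall, js, jn) == js;
}

The later "Rewrite if needed in case of slow consoles" hunks then push
->jiffies_stall further out after the console output completes, so a slow
console cannot immediately trigger a second report.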
On Sun, Apr 22, 2018 at 07:32:21PM -0700, Paul E. McKenney wrote:
> The WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp()) in
> rcu_gp_cleanup() triggers (inexplicably, of course) every so often.
> This commit therefore extracts more information.
>
> Signed-off-by: Paul E. McKenney <[email protected]>
This commit and the remaining ones in this series are deferred from
the next merge window to the one following, where modified versions
are actually able to find a problem.
Thanx, Paul