2007-01-25 02:11:16

by Paul E. McKenney

[permalink] [raw]
Subject: [RFC PATCH -rt 0/2] RCU priority boosting that survives semi-vicious testing

Hello!

This is an update of the patch at http://lkml.org/lkml/2007/01/15/219.
This series contains (1) the RCU-boost patch itself and (2) some
rcutorture modifications to test it more thoroughly. This version is
getting quite close.

There is still no OOM tie-in. As before, dynamic adjustment of the
booster's priority will dynamically adjust the boostees.

Thanx, Paul


2007-01-25 02:15:06

by Paul E. McKenney

[permalink] [raw]
Subject: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing

Here is the priority-boosting piece of the set. Very similar to the
earlier patch (http://lkml.org/lkml/2007/01/15/219).

Signed-off-by: Paul E. McKenney <[email protected]>
---

include/linux/init_task.h | 12 +
include/linux/rcupdate.h | 12 +
include/linux/rcupreempt.h | 19 +
include/linux/sched.h | 16 +
init/main.c | 1
kernel/Kconfig.preempt | 32 ++
kernel/fork.c | 6
kernel/rcupreempt.c | 536 +++++++++++++++++++++++++++++++++++++++++++++
kernel/rtmutex.c | 9
kernel/sched.c | 5
10 files changed, 645 insertions(+), 3 deletions(-)

diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/include/linux/init_task.h linux-2.6.20-rc4-rt1-rcub/include/linux/init_task.h
--- linux-2.6.20-rc4-rt1/include/linux/init_task.h 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/include/linux/init_task.h 2007-01-09 11:01:12.000000000 -0800
@@ -87,6 +87,17 @@ extern struct nsproxy init_nsproxy;
.siglock = __SPIN_LOCK_UNLOCKED(sighand.siglock), \
}

+#ifdef CONFIG_PREEMPT_RCU_BOOST
+#define INIT_RCU_BOOST_PRIO .rcu_prio = MAX_PRIO,
+#define INIT_PREEMPT_RCU_BOOST(tsk) \
+ .rcub_rbdp = NULL, \
+ .rcub_state = RCU_BOOST_IDLE, \
+ .rcub_entry = LIST_HEAD_INIT(tsk.rcub_entry),
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+#define INIT_RCU_BOOST_PRIO
+#define INIT_PREEMPT_RCU_BOOST(tsk)
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
extern struct group_info init_groups;

/*
@@ -143,6 +154,7 @@ extern struct group_info init_groups;
.pi_lock = RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
INIT_TRACE_IRQFLAGS \
INIT_LOCKDEP \
+ INIT_PREEMPT_RCU_BOOST(tsk) \
}


diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/include/linux/rcupdate.h linux-2.6.20-rc4-rt1-rcub/include/linux/rcupdate.h
--- linux-2.6.20-rc4-rt1/include/linux/rcupdate.h 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/include/linux/rcupdate.h 2007-01-09 11:01:12.000000000 -0800
@@ -227,6 +227,18 @@ extern void rcu_barrier(void);
extern void rcu_init(void);
extern void rcu_advance_callbacks(int cpu, int user);
extern void rcu_check_callbacks(int cpu, int user);
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+extern void init_rcu_boost_late(void);
+extern void __rcu_preempt_boost(void);
+#define rcu_preempt_boost() \
+ do { \
+ if (unlikely(current->rcu_read_lock_nesting > 0)) \
+ __rcu_preempt_boost(); \
+ } while (0)
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+#define init_rcu_boost_late()
+#define rcu_preempt_boost()
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */

#endif /* __KERNEL__ */
#endif /* __LINUX_RCUPDATE_H */
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/include/linux/rcupreempt.h linux-2.6.20-rc4-rt1-rcub/include/linux/rcupreempt.h
--- linux-2.6.20-rc4-rt1/include/linux/rcupreempt.h 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/include/linux/rcupreempt.h 2007-01-09 11:01:12.000000000 -0800
@@ -42,6 +42,25 @@
#include <linux/cpumask.h>
#include <linux/seqlock.h>

+#ifdef CONFIG_PREEMPT_RCU_BOOST
+/*
+ * Task state with respect to being RCU-boosted. This state is changed
+ * by the task itself in response to the following three events:
+ * 1. Preemption (or block on lock) while in RCU read-side critical section.
+ * 2. Outermost rcu_read_unlock() for blocked RCU read-side critical section.
+ *
+ * The RCU-boost task also updates the state when boosting priority.
+ */
+enum rcu_boost_state {
+ RCU_BOOST_IDLE = 0, /* Not yet blocked if in RCU read-side. */
+ RCU_BOOST_BLOCKED = 1, /* Blocked from RCU read-side. */
+ RCU_BOOSTED = 2, /* Boosting complete. */
+};
+
+#define N_RCU_BOOST_STATE (RCU_BOOSTED + 1)
+
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
#define rcu_qsctr_inc(cpu)
#define rcu_bh_qsctr_inc(cpu)
#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/include/linux/sched.h linux-2.6.20-rc4-rt1-rcub/include/linux/sched.h
--- linux-2.6.20-rc4-rt1/include/linux/sched.h 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/include/linux/sched.h 2007-01-09 11:01:12.000000000 -0800
@@ -699,6 +699,14 @@ struct signal_struct {
#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))

+#ifdef CONFIG_PREEMPT_RCU_BOOST
+#define set_rcu_prio(p, prio) ((p)->rcu_prio = (prio))
+#define get_rcu_prio(p) ((p)->rcu_prio)
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+#define set_rcu_prio(p, prio) do { } while (0)
+#define get_rcu_prio(p) MAX_PRIO
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
/*
* Some day this will be a full-fledged user tracking system..
*/
@@ -982,6 +990,9 @@ struct task_struct {
#endif
int load_weight; /* for niceness load balancing purposes */
int prio, static_prio, normal_prio;
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+ int rcu_prio;
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
struct list_head run_list;
struct prio_array *array;

@@ -1003,6 +1014,11 @@ struct task_struct {
atomic_t *rcu_flipctr1;
atomic_t *rcu_flipctr2;
#endif
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+ struct rcu_boost_dat *rcub_rbdp;
+ enum rcu_boost_state rcub_state;
+ struct list_head rcub_entry;
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/init/main.c linux-2.6.20-rc4-rt1-rcub/init/main.c
--- linux-2.6.20-rc4-rt1/init/main.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/init/main.c 2007-01-09 11:01:12.000000000 -0800
@@ -712,6 +712,7 @@ static void __init do_basic_setup(void)
init_workqueues();
usermodehelper_init();
driver_init();
+ init_rcu_boost_late();

#ifdef CONFIG_SYSCTL
sysctl_init();
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/Kconfig.preempt linux-2.6.20-rc4-rt1-rcub/kernel/Kconfig.preempt
--- linux-2.6.20-rc4-rt1/kernel/Kconfig.preempt 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/Kconfig.preempt 2007-01-09 11:01:12.000000000 -0800
@@ -176,3 +176,35 @@ config RCU_TRACE

Say Y here if you want to enable RCU tracing
Say N if you are unsure.
+
+config PREEMPT_RCU_BOOST
+ bool "Enable priority boosting of RCU read-side critical sections"
+ depends on PREEMPT_RCU
+ default n
+ help
+ This option permits priority boosting of RCU read-side critical
+ sections that have been preempted in order to prevent indefinite
+ delay of grace periods in face of runaway non-realtime processes.
+
+ Say N if you are unsure.
+
+config PREEMPT_RCU_BOOST_STATS
+ bool "Enable RCU priority-boosting statistic printing"
+ depends on PREEMPT_RCU_BOOST
+ default n
+ help
+ This option enables debug printk()s of RCU boost statistics,
+ which are normally only used to debug RCU priority boost
+ implementations.
+
+ Say N if you are unsure.
+
+config PREEMPT_RCU_BOOST_STATS_INTERVAL
+ int "RCU priority-boosting statistic printing interval (seconds)"
+ depends on PREEMPT_RCU_BOOST_STATS
+ default 100
+ range 10 86400
+ help
+ This option controls the timing of debug printk()s of RCU boost
+ statistics, which are normally only used to debug RCU priority
+ boost implementations.
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/fork.c linux-2.6.20-rc4-rt1-rcub/kernel/fork.c
--- linux-2.6.20-rc4-rt1/kernel/fork.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/fork.c 2007-01-09 11:01:12.000000000 -0800
@@ -1129,6 +1129,12 @@ static struct task_struct *copy_process(
p->softirq_context = 0;
#endif
p->pagefault_disabled = 0;
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+ p->rcu_prio = MAX_PRIO;
+ p->rcub_rbdp = NULL;
+ p->rcub_state = RCU_BOOST_IDLE;
+ INIT_LIST_HEAD(&p->rcub_entry);
+#endif
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcupreempt.c linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c
--- linux-2.6.20-rc4-rt1/kernel/rcupreempt.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c 2007-01-23 15:41:33.000000000 -0800
@@ -49,6 +49,7 @@
#include <linux/byteorder/swabb.h>
#include <linux/cpumask.h>
#include <linux/rcupreempt_trace.h>
+#include <linux/kthread.h>

/*
* PREEMPT_RCU data structures.
@@ -80,6 +81,534 @@ static struct rcu_ctrlblk rcu_ctrlblk =
static DEFINE_PER_CPU(atomic_t [2], rcu_flipctr) =
{ ATOMIC_INIT(0), ATOMIC_INIT(0) };

+#ifndef CONFIG_PREEMPT_RCU_BOOST
+static inline void init_rcu_boost_early(void) { }
+static inline void rcu_read_unlock_unboost(void) { }
+#else /* #ifndef CONFIG_PREEMPT_RCU_BOOST */
+
+/* Defines possible event indices for ->rbs_stats[] (first index). */
+
+#define RCU_BOOST_DAT_BLOCK 0
+#define RCU_BOOST_DAT_BOOST 1
+#define RCU_BOOST_DAT_UNLOCK 2
+#define N_RCU_BOOST_DAT_EVENTS 3
+
+/* RCU-boost per-CPU array element. */
+
+struct rcu_boost_dat {
+ raw_spinlock_t rbs_mutex;
+ struct list_head rbs_toboost;
+ struct list_head rbs_boosted;
+ long rbs_blocked;
+ long rbs_boost_attempt;
+ long rbs_boost;
+ long rbs_unlock;
+ long rbs_unboosted;
+#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
+ long rbs_stats[N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE + 1];
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+};
+#define RCU_BOOST_ELEMENTS 4
+
+int rcu_boost_idx = -1; /* invalid value in case someone uses RCU early. */
+DEFINE_PER_CPU(struct rcu_boost_dat, rcu_boost_dat[RCU_BOOST_ELEMENTS]);
+static struct task_struct *rcu_boost_task = NULL;
+
+#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
+
+/*
+ * Function to increment indicated ->rbs_stats[] element.
+ */
+static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp,
+ int event,
+ enum rcu_boost_state oldstate)
+{
+ if (oldstate >= RCU_BOOST_IDLE &&
+ oldstate <= RCU_BOOSTED) {
+ rbdp->rbs_stats[event][oldstate]++;
+ } else {
+ rbdp->rbs_stats[event][N_RCU_BOOST_STATE]++;
+ }
+}
+
+#define rcu_boost_dat_stat_block(rbdp, oldstate) \
+ rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BLOCK, oldstate)
+#define rcu_boost_dat_stat_boost(rbdp, oldstate) \
+ rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BOOST, oldstate)
+#define rcu_boost_dat_stat_unlock(rbdp, oldstate) \
+ rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_UNLOCK, oldstate)
+
+/*
+ * Prefix for kprint() strings for periodic statistics messages.
+ */
+static char *rcu_boost_state_event[] = {
+ "block: ",
+ "boost: ",
+ "unlock: ",
+};
+
+/*
+ * Indicators for numbers in kprint() strings. "!" indicates a state-event
+ * pair that should not happen, while "?" indicates a state that should
+ * not happen.
+ */
+static char *rcu_boost_state_error[] = {
+ /*ibBe*/
+ " ?", /* block */
+ "! ?", /* boost */
+ "? ?", /* unlock */
+};
+
+/*
+ * Print out RCU booster task statistics at the specified interval.
+ */
+static void rcu_boost_dat_stat_print(void)
+{
+ char buf[N_RCU_BOOST_STATE * (sizeof(long) * 3 + 2) + 2];
+ int cpu;
+ int event;
+ int i;
+ static time_t lastprint = 0;
+ struct rcu_boost_dat *rbdp;
+ int state;
+ struct rcu_boost_dat sum;
+
+ /* Wait a graceful interval between printk spamming. */
+
+ if (xtime.tv_sec - lastprint <
+ CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL)
+ return;
+
+ /* Sum up the state/event-independent counters. */
+
+ sum.rbs_blocked = 0;
+ sum.rbs_boost_attempt = 0;
+ sum.rbs_boost = 0;
+ sum.rbs_unlock = 0;
+ sum.rbs_unboosted = 0;
+ for_each_possible_cpu(cpu)
+ for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
+ rbdp = per_cpu(rcu_boost_dat, cpu);
+ sum.rbs_blocked += rbdp[i].rbs_blocked;
+ sum.rbs_boost_attempt += rbdp[i].rbs_boost_attempt;
+ sum.rbs_boost += rbdp[i].rbs_boost;
+ sum.rbs_unlock += rbdp[i].rbs_unlock;
+ sum.rbs_unboosted += rbdp[i].rbs_unboosted;
+ }
+
+ /* Sum up the state/event-dependent counters. */
+
+ for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++)
+ for (state = 0; state <= N_RCU_BOOST_STATE; state++) {
+ sum.rbs_stats[event][state] = 0;
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
+ sum.rbs_stats[event][state]
+ += per_cpu(rcu_boost_dat,
+ cpu)[i].rbs_stats[event][state];
+ }
+ }
+ }
+
+ /* Print them out! */
+
+ printk(KERN_ALERT
+ "rcu_boost_dat: idx=%d "
+ "b=%ld ul=%ld ub=%ld boost: a=%ld b=%ld\n",
+ rcu_boost_idx,
+ sum.rbs_blocked, sum.rbs_unlock, sum.rbs_unboosted,
+ sum.rbs_boost_attempt, sum.rbs_boost);
+ for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) {
+ i = 0;
+ for (state = 0; state <= N_RCU_BOOST_STATE; state++) {
+ i += sprintf(&buf[i], " %ld%c",
+ sum.rbs_stats[event][state],
+ rcu_boost_state_error[event][state]);
+ }
+ printk(KERN_ALERT "rcu_boost_dat %s %s\n",
+ rcu_boost_state_event[event], buf);
+ }
+
+ /* Go away and don't come back for awhile. */
+
+ lastprint = xtime.tv_sec;
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+
+#define rcu_boost_dat_stat_block(rbdp, oldstate)
+#define rcu_boost_dat_stat_boost(rbdp, oldstate)
+#define rcu_boost_dat_stat_unlock(rbdp, oldstate)
+#define rcu_boost_dat_stat_print()
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+
+/*
+ * Initialize RCU-boost state. This happens early in the boot process,
+ * when the scheduler does not yet exist. So don't try to use it.
+ */
+static void init_rcu_boost_early(void)
+{
+ struct rcu_boost_dat *rbdp;
+ int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ rbdp = per_cpu(rcu_boost_dat, cpu);
+ for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
+ rbdp[i].rbs_mutex =
+ RAW_SPIN_LOCK_UNLOCKED(rbdp[i].rbs_mutex);
+ INIT_LIST_HEAD(&rbdp[i].rbs_toboost);
+ INIT_LIST_HEAD(&rbdp[i].rbs_boosted);
+ rbdp[i].rbs_blocked = 0;
+ rbdp[i].rbs_boost_attempt = 0;
+ rbdp[i].rbs_boost = 0;
+ rbdp[i].rbs_unlock = 0;
+ rbdp[i].rbs_unboosted = 0;
+#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
+ {
+ int j, k;
+
+ for (j = 0; j < N_RCU_BOOST_DAT_EVENTS; j++)
+ for (k = 0; k <= N_RCU_BOOST_STATE; k++)
+ rbdp[i].rbs_stats[j][k] = 0;
+ }
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+ }
+ smp_wmb();
+ rcu_boost_idx = 0;
+ }
+}
+
+/*
+ * Return the current boost index for boosting target tasks.
+ * May only be invoked by the booster task, so guaranteed to
+ * already be initialized.
+ */
+static inline int rcu_boost_idx_boosting(void)
+{
+ return (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
+}
+
+/*
+ * Return the list on which the calling task should add itself, or
+ * NULL if too early during initialization.
+ */
+static inline struct rcu_boost_dat *rcu_rbd_new(void)
+{
+ int cpu = raw_smp_processor_id(); /* locks used, so preemption OK. */
+ int idx = rcu_boost_idx;
+
+ smp_read_barrier_depends(); barrier();
+ if (unlikely(idx < 0))
+ return (NULL);
+ return &per_cpu(rcu_boost_dat, cpu)[idx];
+}
+
+/*
+ * Return the list from which to boost target tasks.
+ * May only be invoked by the booster task, so guaranteed to
+ * already be initialized.
+ */
+static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu)
+{
+ int idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
+
+ return &per_cpu(rcu_boost_dat, cpu)[idx];
+}
+
+#define PREEMPT_RCU_BOOSTER_PRIO 49 /* Match curr_irq_prio manually. */
+ /* Administrators can always adjust */
+ /* via the /proc interface. */
+
+/*
+ * Boost the specified task from an RCU viewpoint.
+ * Boost the target task to a priority just a bit less-favored than
+ * that of the RCU-boost task, but boost to a realtime priority even
+ * if the RCU-boost task is running at a non-realtime priority.
+ * We check the priority of the RCU-boost task each time we boost
+ * in case the sysadm manually changes the priority.
+ */
+static void rcu_boost_prio(struct task_struct *taskp)
+{
+ unsigned long oldirq;
+ int rcuprio;
+
+ spin_lock_irqsave(&current->pi_lock, oldirq);
+ rcuprio = rt_mutex_getprio(current) + 1;
+ if (rcuprio >= MAX_USER_RT_PRIO)
+ rcuprio = MAX_USER_RT_PRIO - 1;
+ spin_unlock_irqrestore(&current->pi_lock, oldirq);
+ spin_lock_irqsave(&taskp->pi_lock, oldirq);
+ if (taskp->rcu_prio != rcuprio) {
+ taskp->rcu_prio = rcuprio;
+ if (taskp->rcu_prio != taskp->prio)
+ rt_mutex_setprio(taskp, taskp->rcu_prio);
+ }
+ spin_unlock_irqrestore(&taskp->pi_lock, oldirq);
+}
+
+/*
+ * Unboost the specified task from an RCU viewpoint.
+ */
+static void rcu_unboost_prio(struct task_struct *taskp)
+{
+ int nprio;
+ unsigned long oldirq;
+
+ spin_lock_irqsave(&taskp->pi_lock, oldirq);
+ taskp->rcu_prio = MAX_PRIO;
+ nprio = rt_mutex_getprio(taskp);
+ if (nprio > taskp->prio)
+ rt_mutex_setprio(taskp, nprio);
+ spin_unlock_irqrestore(&taskp->pi_lock, oldirq);
+}
+
+/*
+ * Boost all of the RCU-reader tasks on the specified list.
+ */
+static void rcu_boost_one_reader_list(struct rcu_boost_dat *rbdp)
+{
+ LIST_HEAD(list);
+ unsigned long oldirq;
+ struct task_struct *taskp;
+
+ /*
+ * Splice both lists onto a local list. We will still
+ * need to hold the lock when manipulating the local list
+ * because tasks can remove themselves at any time.
+ * The reason for splicing the rbs_boosted list is that
+ * our priority may have changed, so reboosting may be
+ * required.
+ */
+
+ spin_lock_irqsave(&rbdp->rbs_mutex, oldirq);
+ list_splice_init(&rbdp->rbs_toboost, &list);
+ list_splice_init(&rbdp->rbs_boosted, &list);
+ while (!list_empty(&list)) {
+
+ /*
+ * Pause for a bit before boosting each task.
+ * @@@FIXME: reduce/eliminate pausing in case of OOM.
+ */
+
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+ schedule_timeout_uninterruptible(1);
+ spin_lock_irqsave(&rbdp->rbs_mutex, oldirq);
+
+ /*
+ * All tasks might have removed themselves while
+ * we were waiting. Recheck list emptiness.
+ */
+
+ if (list_empty(&list))
+ break;
+
+ /* Remove first task in local list, count the attempt. */
+
+ taskp = list_entry(list.next, typeof(*taskp), rcub_entry);
+ list_del_init(&taskp->rcub_entry);
+ rbdp->rbs_boost_attempt++;
+
+ /* Ignore tasks in unexpected states. */
+
+ if (taskp->rcub_state == RCU_BOOST_IDLE) {
+ list_add_tail(&taskp->rcub_entry, &rbdp->rbs_toboost);
+ rcu_boost_dat_stat_boost(rbdp, taskp->rcub_state);
+ continue;
+ }
+
+ /* Boost the task's priority. */
+
+ rcu_boost_prio(taskp);
+ rbdp->rbs_boost++;
+ rcu_boost_dat_stat_boost(rbdp, taskp->rcub_state);
+ taskp->rcub_state = RCU_BOOSTED;
+ list_add_tail(&taskp->rcub_entry, &rbdp->rbs_boosted);
+ }
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+}
+
+/*
+ * Priority-boost tasks stuck in RCU read-side critical sections as
+ * needed (presumably rarely).
+ */
+static int rcu_booster(void *arg)
+{
+ int cpu;
+ struct sched_param sp;
+
+ sp.sched_priority = PREEMPT_RCU_BOOSTER_PRIO;
+ sched_setscheduler(current, SCHED_RR, &sp);
+ current->flags |= PF_NOFREEZE;
+
+ do {
+
+ /* Advance the lists of tasks. */
+
+ rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS;
+ for_each_possible_cpu(cpu) {
+
+ /*
+ * Boost all sufficiently aged readers.
+ * Readers must first be preempted or block
+ * on a mutex in an RCU read-side critical section,
+ * then remain in that critical section for
+ * RCU_BOOST_ELEMENTS-1 time intervals.
+ * So most of the time we should end up doing
+ * nothing.
+ */
+
+ rcu_boost_one_reader_list(rcu_rbd_boosting(cpu));
+
+ /*
+ * Large SMP systems may need to sleep sometimes
+ * in this loop. Or have multiple RCU-boost tasks.
+ */
+ }
+
+ /*
+ * Sleep to allow any unstalled RCU read-side critical
+ * sections to age out of the list. @@@ FIXME: reduce,
+ * adjust, or eliminate in case of OOM.
+ */
+
+ schedule_timeout_uninterruptible(HZ / 100);
+
+ /* Print stats if enough time has passed. */
+
+ rcu_boost_dat_stat_print();
+
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+/*
+ * Perform the portions of RCU-boost initialization that require the
+ * scheduler to be up and running.
+ */
+void init_rcu_boost_late(void)
+{
+ int i;
+
+ /* Spawn RCU-boost task. */
+
+ printk(KERN_ALERT "Starting RCU priority booster\n");
+ rcu_boost_task = kthread_run(rcu_booster, NULL, "RCU Prio Booster");
+ if (IS_ERR(rcu_boost_task)) {
+ i = PTR_ERR(rcu_boost_task);
+ printk(KERN_ALERT
+ "Unable to create RCU Priority Booster, errno %d\n", -i);
+
+ /*
+ * Continue running, but tasks permanently blocked
+ * in RCU read-side critical sections will be able
+ * to stall grace-period processing, potentially
+ * OOMing the machine.
+ */
+
+ rcu_boost_task = NULL;
+ }
+}
+
+/*
+ * Update task's RCU-boost state to reflect blocking in RCU read-side
+ * critical section, so that the RCU-boost task can find it in case it
+ * later needs its priority boosted.
+ */
+void __rcu_preempt_boost(void)
+{
+ struct rcu_boost_dat *rbdp;
+ unsigned long oldirq;
+
+ /* Identify list to place task on for possible later boosting. */
+
+ local_irq_save(oldirq);
+ rbdp = rcu_rbd_new();
+ if (rbdp == NULL) {
+ local_irq_restore(oldirq);
+ printk("Preempted RCU read-side critical section too early.\n");
+ return;
+ }
+ spin_lock(&rbdp->rbs_mutex);
+ rbdp->rbs_blocked++;
+
+ /*
+ * Update state. We hold the lock and aren't yet on the list,
+ * so the booster cannot mess with us yet.
+ */
+
+ rcu_boost_dat_stat_block(rbdp, current->rcub_state);
+ if (current->rcub_state != RCU_BOOST_IDLE) {
+
+ /*
+ * We have been here before, so just update stats.
+ * It may seem strange to do all this work just to
+ * accumulate statistics, but this is such a
+ * low-probability code path that we shouldn't care.
+ * If it becomes a problem, it can be fixed.
+ */
+
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+ return;
+ }
+ current->rcub_state = RCU_BOOST_BLOCKED;
+
+ /* Now add ourselves to the list so that the booster can find use. */
+
+ list_add_tail(&current->rcub_entry, &rbdp->rbs_toboost);
+ current->rcub_rbdp = rbdp;
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+}
+
+/*
+ * Do the list-removal and priority-unboosting "heavy lifting" when
+ * required.
+ */
+static void __rcu_read_unlock_unboost(void)
+{
+ unsigned long oldirq;
+ struct rcu_boost_dat *rbdp;
+
+ /* Identify the list structure and acquire the corresponding lock. */
+
+ rbdp = current->rcub_rbdp;
+ spin_lock_irqsave(&rbdp->rbs_mutex, oldirq);
+
+ /* Remove task from the list it was on. */
+
+ list_del_init(&current->rcub_entry);
+ rbdp->rbs_unlock++;
+ current->rcub_rbdp = NULL;
+
+ /* Record stats, unboost if needed, and update state. */
+
+ rcu_boost_dat_stat_unlock(rbdp, current->rcub_state);
+ if (current->rcub_state == RCU_BOOSTED) {
+ rcu_unboost_prio(current);
+ rbdp->rbs_unboosted++;
+ }
+ current->rcub_state = RCU_BOOST_IDLE;
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+}
+
+/*
+ * Do any state changes and unboosting needed for rcu_read_unlock().
+ * Pass any complex work on to __rcu_read_unlock_unboost().
+ * The vast majority of the time, no work will be needed, as preemption
+ * and blocking within RCU read-side critical sections is comparatively
+ * rare.
+ */
+static inline void rcu_read_unlock_unboost(void)
+{
+
+ if (unlikely(current->rcub_state != RCU_BOOST_IDLE))
+ __rcu_read_unlock_unboost();
+}
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU_BOOST */
+
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
@@ -155,6 +684,7 @@ void __rcu_read_unlock(void)
atomic_dec(current->rcu_flipctr2);
current->rcu_flipctr2 = NULL;
}
+ rcu_read_unlock_unboost();
}

local_irq_restore(oldirq);
@@ -345,6 +875,11 @@ int notrace rcu_pending(int cpu)
rcu_data.nextlist != NULL);
}

+/*
+ * Initialize RCU. This is called very early in boot, so is restricted
+ * to very simple operations. Don't even think about messing with anything
+ * that involves the scheduler, as it doesn't exist yet.
+ */
void __init __rcu_init(void)
{
/*&&&&*/printk("WARNING: experimental RCU implementation.\n");
@@ -356,6 +891,7 @@ void __init __rcu_init(void)
rcu_data.waittail = &rcu_data.waitlist;
rcu_data.donelist = NULL;
rcu_data.donetail = &rcu_data.donelist;
+ init_rcu_boost_early();
tasklet_init(&rcu_data.rcu_tasklet, rcu_process_callbacks, 0UL);
}

diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rtmutex.c linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c
--- linux-2.6.20-rc4-rt1/kernel/rtmutex.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c 2007-01-09 11:01:12.000000000 -0800
@@ -105,11 +105,14 @@ static inline void init_lists(struct rt_
*/
int rt_mutex_getprio(struct task_struct *task)
{
+ int prio = task->normal_prio;
+
+ if (get_rcu_prio(task) < prio)
+ prio = get_rcu_prio(task);
if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
+ return prio;

- return min(task_top_pi_waiter(task)->pi_list_entry.prio,
- task->normal_prio);
+ return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio);
}

/*
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/sched.c linux-2.6.20-rc4-rt1-rcub/kernel/sched.c
--- linux-2.6.20-rc4-rt1/kernel/sched.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/sched.c 2007-01-09 11:01:12.000000000 -0800
@@ -1962,6 +1962,7 @@ void fastcall sched_fork(struct task_str
* Make sure we do not leak PI boosting priority to the child:
*/
p->prio = current->normal_prio;
+ set_rcu_prio(p, MAX_PRIO);

INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
@@ -2044,6 +2045,7 @@ void fastcall wake_up_new_task(struct ta
else {
p->prio = current->prio;
p->normal_prio = current->normal_prio;
+ set_rcu_prio(p, MAX_PRIO);
__activate_task_after(p, current, rq);
}
set_need_resched();
@@ -3986,6 +3988,8 @@ void __sched __schedule(void)
}
profile_hit(SCHED_PROFILING, __builtin_return_address(0));

+ rcu_preempt_boost();
+
preempt_disable(); // FIXME: disable irqs here
prev = current;
release_kernel_lock(prev);
@@ -5725,6 +5729,7 @@ void __cpuinit init_idle(struct task_str
idle->sleep_avg = 0;
idle->array = NULL;
idle->prio = idle->normal_prio = MAX_PRIO;
+ set_rcu_prio(idle, MAX_PRIO);
idle->state = TASK_RUNNING;
idle->cpus_allowed = cpumask_of_cpu(cpu);
set_task_cpu(idle, cpu);

2007-01-25 02:23:53

by Paul E. McKenney

[permalink] [raw]
Subject: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture

This patch adds an optional preemption kernel thread to the rcutorture
tests. This thread sets itself to a low RT priority and chews up
CPU in 10-second bursts, verifying that grace periods progress during
this 10-second interval. This has thus far passed about 30 hours of
RCU torture testing on a 4-CPU (a pair of 2-CPU dies) 64-bit Xeon
system.

I am experimenting with more-vicious tests, but extra viciousness thus
far comes at the expense of grotesque code.

Signed-off-by: Paul E. McKenney <[email protected]>
---

rcutorture.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 84 insertions(+), 1 deletion(-)

diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcutorture.c linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c
--- linux-2.6.20-rc4-rt1/kernel/rcutorture.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c 2007-01-23 11:27:49.000000000 -0800
@@ -194,6 +194,8 @@ struct rcu_torture_ops {
int (*completed)(void);
void (*deferredfree)(struct rcu_torture *p);
void (*sync)(void);
+ void (*preemptstart)(void);
+ void (*preemptend)(void);
int (*stats)(char *page);
char *name;
};
@@ -258,6 +260,71 @@ static void rcu_torture_deferred_free(st
call_rcu(&p->rtort_rcu, rcu_torture_cb);
}

+#ifndef CONFIG_PREEMPT_RCU_BOOST
+static void rcu_preempt_start(void) { }
+static void rcu_preempt_end(void) { }
+static int rcu_preempt_stats(char *page) { return 0; }
+#else /* #ifndef CONFIG_PREEMPT_RCU_BOOST */
+
+static struct task_struct *rcu_preeempt_task;
+static long rcu_torture_preempt_errors = 0;
+
+static int rcu_torture_preempt(void *arg)
+{
+ int completedstart;
+ time_t gcstart;
+ struct sched_param sp;
+
+ sp.sched_priority = MAX_RT_PRIO - 1;
+ sched_setscheduler(current, SCHED_RR, &sp);
+ current->flags |= PF_NOFREEZE;
+
+ do {
+ completedstart = rcu_torture_completed();
+ gcstart = xtime.tv_sec;
+ while ((xtime.tv_sec - gcstart < 10) &&
+ (rcu_torture_completed() == completedstart))
+ cond_resched();
+ if (rcu_torture_completed() == completedstart)
+ rcu_torture_preempt_errors++;
+ schedule_timeout_interruptible(shuffle_interval * HZ);
+ } while (!kthread_should_stop());
+ return NULL;
+}
+
+static void rcu_preempt_start(void)
+{
+ rcu_preeempt_task = kthread_run(rcu_torture_preempt, NULL,
+ "rcu_torture_preempt");
+ if (IS_ERR(rcu_preeempt_task)) {
+ VERBOSE_PRINTK_ERRSTRING("Failed to create preempter");
+ rcu_preeempt_task = NULL;
+ }
+}
+
+static void rcu_preempt_end(void)
+{
+ if (rcu_preeempt_task != NULL) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_preempt task");
+ kthread_stop(rcu_preeempt_task);
+ }
+ rcu_preeempt_task = NULL;
+}
+
+static int rcu_preempt_stats(char *page) {
+ int cnt = 0;
+
+ cnt += sprintf(&page[cnt],
+ "Preemption stalls: %ld\n", rcu_torture_preempt_errors);
+ return (cnt);
+}
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU_BOOST */
+
+static void rcu_preemptstart(void)
+{
+
+}
+
static struct rcu_torture_ops rcu_ops = {
.init = NULL,
.cleanup = NULL,
@@ -267,7 +334,9 @@ static struct rcu_torture_ops rcu_ops =
.completed = rcu_torture_completed,
.deferredfree = rcu_torture_deferred_free,
.sync = synchronize_rcu,
- .stats = NULL,
+ .preemptstart = rcu_preempt_start,
+ .preemptend = rcu_preempt_end,
+ .stats = rcu_preempt_stats,
.name = "rcu"
};

@@ -306,6 +375,8 @@ static struct rcu_torture_ops rcu_sync_o
.completed = rcu_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu,
+ .preemptstart = NULL,
+ .preemptend = NULL,
.stats = NULL,
.name = "rcu_sync"
};
@@ -370,6 +441,8 @@ static struct rcu_torture_ops rcu_bh_ops
.completed = rcu_bh_torture_completed,
.deferredfree = rcu_bh_torture_deferred_free,
.sync = rcu_bh_torture_synchronize,
+ .preemptstart = NULL,
+ .preemptend = NULL,
.stats = NULL,
.name = "rcu_bh"
};
@@ -383,6 +456,8 @@ static struct rcu_torture_ops rcu_bh_syn
.completed = rcu_bh_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = rcu_bh_torture_synchronize,
+ .preemptstart = NULL,
+ .preemptend = NULL,
.stats = NULL,
.name = "rcu_bh_sync"
};
@@ -464,6 +539,8 @@ static struct rcu_torture_ops srcu_ops =
.completed = srcu_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = srcu_torture_synchronize,
+ .preemptstart = NULL,
+ .preemptend = NULL,
.stats = srcu_torture_stats,
.name = "srcu"
};
@@ -502,6 +579,8 @@ static struct rcu_torture_ops sched_ops
.completed = sched_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = sched_torture_synchronize,
+ .preemptstart = NULL,
+ .preemptend = NULL,
.stats = NULL,
.name = "sched"
};
@@ -856,6 +935,8 @@ rcu_torture_cleanup(void)
kthread_stop(stats_task);
}
stats_task = NULL;
+ if (cur_ops->preemptend != NULL)
+ cur_ops->preemptend();

/* Wait for all RCU callbacks to fire. */
rcu_barrier();
@@ -997,6 +1078,8 @@ rcu_torture_init(void)
goto unwind;
}
}
+ if (cur_ops->preemptstart != NULL)
+ cur_ops->preemptstart();
return 0;

unwind:

2007-01-25 08:47:10

by Josh Triplett

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture

Paul E. McKenney wrote:
> This patch adds an optional preemption kernel thread to the rcutorture
> tests. This thread sets itself to a low RT priority and chews up
> CPU in 10-second bursts, verifying that grace periods progress during
> this 10-second interval. This has thus far passed about 30 hours of
> RCU torture testing on a 4-CPU (a pair of 2-CPU dies) 64-bit Xeon
> system.
>
> I am experimenting with more-vicious tests, but extra viciousness thus
> far comes at the expense of grotesque code.

Overall, the new feature seems like a good idea, and it should exercise the
new RCU boosting code. Some comments below.

One major item: this new test feature really needs a new module parameter to
enable or disable it.

> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcutorture.c linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c
> --- linux-2.6.20-rc4-rt1/kernel/rcutorture.c 2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c 2007-01-23 11:27:49.000000000 -0800
> @@ -194,6 +194,8 @@ struct rcu_torture_ops {
> int (*completed)(void);
> void (*deferredfree)(struct rcu_torture *p);
> void (*sync)(void);
> + void (*preemptstart)(void);
> + void (*preemptend)(void);
> int (*stats)(char *page);
> char *name;
> };
> @@ -258,6 +260,71 @@ static void rcu_torture_deferred_free(st
> call_rcu(&p->rtort_rcu, rcu_torture_cb);
> }
>
> +#ifndef CONFIG_PREEMPT_RCU_BOOST
> +static void rcu_preempt_start(void) { }
> +static void rcu_preempt_end(void) { }
> +static int rcu_preempt_stats(char *page) { return 0; }
> +#else /* #ifndef CONFIG_PREEMPT_RCU_BOOST */
> +
> +static struct task_struct *rcu_preeempt_task;
> +static long rcu_torture_preempt_errors = 0;

Might as well make this an unsigned long; negative values wouldn't make sense.

> +static int rcu_torture_preempt(void *arg)
> +{
> + int completedstart;
> + time_t gcstart;
> + struct sched_param sp;
> +
> + sp.sched_priority = MAX_RT_PRIO - 1;
> + sched_setscheduler(current, SCHED_RR, &sp);
> + current->flags |= PF_NOFREEZE;
> +
> + do {
> + completedstart = rcu_torture_completed();
> + gcstart = xtime.tv_sec;
> + while ((xtime.tv_sec - gcstart < 10) &&
> + (rcu_torture_completed() == completedstart))
> + cond_resched();
> + if (rcu_torture_completed() == completedstart)
> + rcu_torture_preempt_errors++;
> + schedule_timeout_interruptible(shuffle_interval * HZ);

Why call schedule_timeout_interruptible here without actually handling
interruptions? So that you can send it a signal to cause the shuffle early?

> + } while (!kthread_should_stop());
> + return NULL;
> +}
> +
> +static void rcu_preempt_start(void)
> +{
> + rcu_preeempt_task = kthread_run(rcu_torture_preempt, NULL,
> + "rcu_torture_preempt");
> + if (IS_ERR(rcu_preeempt_task)) {
> + VERBOSE_PRINTK_ERRSTRING("Failed to create preempter");

This ought to include the errno value, PTR_ERR(rcu_preempt_task).

> + rcu_preeempt_task = NULL;
> + }
> +}
> +
> +static void rcu_preempt_end(void)
> +{
> + if (rcu_preeempt_task != NULL) {

if (rcu_preempt_task) would work just as well here.

> + VERBOSE_PRINTK_STRING("Stopping rcu_preempt task");
> + kthread_stop(rcu_preeempt_task);
> + }
> + rcu_preeempt_task = NULL;
> +}
> +
> +static int rcu_preempt_stats(char *page) {
> + int cnt = 0;
> +
> + cnt += sprintf(&page[cnt],
> + "Preemption stalls: %ld\n", rcu_torture_preempt_errors);
> + return (cnt);
> +}

How about just:
return sprintf(page, ...);
?

Also, if you decide to make rcu_torture_preempt_errors an unsigned long as
suggested above, this should use %lu.

> +#endif /* #else #ifndef CONFIG_PREEMPT_RCU_BOOST */
> +
> +static void rcu_preemptstart(void)
> +{
> +
> +}
> +

This looks like a bit of stray code.

> static struct rcu_torture_ops rcu_ops = {
> .init = NULL,
> .cleanup = NULL,
> @@ -267,7 +334,9 @@ static struct rcu_torture_ops rcu_ops =
> .completed = rcu_torture_completed,
> .deferredfree = rcu_torture_deferred_free,
> .sync = synchronize_rcu,
> - .stats = NULL,
> + .preemptstart = rcu_preempt_start,
> + .preemptend = rcu_preempt_end,
> + .stats = rcu_preempt_stats,
> .name = "rcu"
> };
>
> @@ -306,6 +375,8 @@ static struct rcu_torture_ops rcu_sync_o
> .completed = rcu_torture_completed,
> .deferredfree = rcu_sync_torture_deferred_free,
> .sync = synchronize_rcu,
> + .preemptstart = NULL,
> + .preemptend = NULL,
> .stats = NULL,
> .name = "rcu_sync"
> };

Much like other common structures such as struct file_operations, no need to
explicitly specify members as NULL here; any member you don't specify will get
a NULL value. That avoids the need to update every use of this structure
whenever you add a new member used by only some of them.

> @@ -370,6 +441,8 @@ static struct rcu_torture_ops rcu_bh_ops
> .completed = rcu_bh_torture_completed,
> .deferredfree = rcu_bh_torture_deferred_free,
> .sync = rcu_bh_torture_synchronize,
> + .preemptstart = NULL,
> + .preemptend = NULL,
> .stats = NULL,
> .name = "rcu_bh"
> };

Likewise.

> @@ -383,6 +456,8 @@ static struct rcu_torture_ops rcu_bh_syn
> .completed = rcu_bh_torture_completed,
> .deferredfree = rcu_sync_torture_deferred_free,
> .sync = rcu_bh_torture_synchronize,
> + .preemptstart = NULL,
> + .preemptend = NULL,
> .stats = NULL,
> .name = "rcu_bh_sync"
> };

Likewise.

> @@ -464,6 +539,8 @@ static struct rcu_torture_ops srcu_ops =
> .completed = srcu_torture_completed,
> .deferredfree = rcu_sync_torture_deferred_free,
> .sync = srcu_torture_synchronize,
> + .preemptstart = NULL,
> + .preemptend = NULL,
> .stats = srcu_torture_stats,
> .name = "srcu"
> };

Likewise.

> @@ -502,6 +579,8 @@ static struct rcu_torture_ops sched_ops
> .completed = sched_torture_completed,
> .deferredfree = rcu_sync_torture_deferred_free,
> .sync = sched_torture_synchronize,
> + .preemptstart = NULL,
> + .preemptend = NULL,
> .stats = NULL,
> .name = "sched"

Likewise.

> @@ -856,6 +935,8 @@ rcu_torture_cleanup(void)
> kthread_stop(stats_task);
> }
> stats_task = NULL;
> + if (cur_ops->preemptend != NULL)

if (cur_ops->preemptend) would work as well.

> @@ -997,6 +1078,8 @@ rcu_torture_init(void)
> goto unwind;
> }
> }
> + if (cur_ops->preemptstart != NULL)

Likewise.

- Josh Triplett

2007-01-25 09:36:21

by Josh Triplett

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing

Overall, this code looks sensible to me. Some comments on the patch below.

Paul E. McKenney wrote:
> --- linux-2.6.20-rc4-rt1/include/linux/rcupreempt.h 2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcub/include/linux/rcupreempt.h 2007-01-09 11:01:12.000000000 -0800
[...]
> +enum rcu_boost_state {
> + RCU_BOOST_IDLE = 0, /* Not yet blocked if in RCU read-side. */
> + RCU_BOOST_BLOCKED = 1, /* Blocked from RCU read-side. */
> + RCU_BOOSTED = 2, /* Boosting complete. */
> +};
> +
> +#define N_RCU_BOOST_STATE (RCU_BOOSTED + 1)

Here, you define the three possible states, the number of states. But
later...

> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcupreempt.c linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c
> --- linux-2.6.20-rc4-rt1/kernel/rcupreempt.c 2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c 2007-01-23 15:41:33.000000000 -0800
> @@ -49,6 +49,7 @@
[...]
> +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> + long rbs_stats[N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE + 1];
> +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */

...you declare this array to have space for one more than that, and then...

> +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> +
> +/*
> + * Function to increment indicated ->rbs_stats[] element.
> + */
> +static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp,
> + int event,
> + enum rcu_boost_state oldstate)
> +{
> + if (oldstate >= RCU_BOOST_IDLE &&
> + oldstate <= RCU_BOOSTED) {
> + rbdp->rbs_stats[event][oldstate]++;
> + } else {
> + rbdp->rbs_stats[event][N_RCU_BOOST_STATE]++;

...you use the last element as what looks like an "unknown" counter. How
about instead defining the enum to have an "unknown" state, defining
N_RCU_BOOST_STATE accordingly, and using N_RCU_BOOST_STATE without the +1 in
rbs_stats? I think that would make the code much more self-documenting, and
less error-prone.

> +#define rcu_boost_dat_stat_block(rbdp, oldstate) \
> + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BLOCK, oldstate)
> +#define rcu_boost_dat_stat_boost(rbdp, oldstate) \
> + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BOOST, oldstate)
> +#define rcu_boost_dat_stat_unlock(rbdp, oldstate) \
> + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_UNLOCK, oldstate)
> +
> +/*
> + * Prefix for kprint() strings for periodic statistics messages.
> + */
> +static char *rcu_boost_state_event[] = {
> + "block: ",
> + "boost: ",
> + "unlock: ",
> +};

I don't know for sure, but I *think* that GCC may sometimes generate more
efficient code if you use a char [][], rather than a char *[].

> +
> +/*
> + * Indicators for numbers in kprint() strings. "!" indicates a state-event
> + * pair that should not happen, while "?" indicates a state that should
> + * not happen.
> + */
> +static char *rcu_boost_state_error[] = {
> + /*ibBe*/
> + " ?", /* block */
> + "! ?", /* boost */
> + "? ?", /* unlock */
> +};

Likewise. Also, the columns here don't seem to line up, probably because the
comment uses spaces and the other lines use a tab.

> +static void rcu_boost_dat_stat_print(void)
> +{
> + char buf[N_RCU_BOOST_STATE * (sizeof(long) * 3 + 2) + 2];

This expression really begs for some constants in place of the magic numbers,
or failing that, a comment.

> + int cpu;
> + int event;
> + int i;
> + static time_t lastprint = 0;
> + struct rcu_boost_dat *rbdp;
> + int state;
> + struct rcu_boost_dat sum;
> +
> + /* Wait a graceful interval between printk spamming. */
> +
> + if (xtime.tv_sec - lastprint <
> + CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL)
> + return;

How about printk_ratelimit? It takes a parameter for the rate. (On the other
hand, it also takes a spinlock, which you might not want here, even in the
stats code.)

> + /* Print them out! */
> +
> + printk(KERN_ALERT
> + "rcu_boost_dat: idx=%d "
> + "b=%ld ul=%ld ub=%ld boost: a=%ld b=%ld\n",
> + rcu_boost_idx,
> + sum.rbs_blocked, sum.rbs_unlock, sum.rbs_unboosted,
> + sum.rbs_boost_attempt, sum.rbs_boost);
> + for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) {
> + i = 0;
> + for (state = 0; state <= N_RCU_BOOST_STATE; state++) {
> + i += sprintf(&buf[i], " %ld%c",
> + sum.rbs_stats[event][state],
> + rcu_boost_state_error[event][state]);
> + }
> + printk(KERN_ALERT "rcu_boost_dat %s %s\n",
> + rcu_boost_state_event[event], buf);
> + }

Most or all of these counters could likely become unsigned, since negative
values don't make sense for them.

> + /* Go away and don't come back for awhile. */
> +
> + lastprint = xtime.tv_sec;
> +}
> +

> +/*
> + * Return the current boost index for boosting target tasks.
> + * May only be invoked by the booster task, so guaranteed to
> + * already be initialized.
> + */
> +static inline int rcu_boost_idx_boosting(void)
> +{
> + return (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
> +}

Quoted for later reference.

> +/*
> + * Initialize RCU-boost state. This happens early in the boot process,
> + * when the scheduler does not yet exist. So don't try to use it.
> + */
> +static void init_rcu_boost_early(void)
> +{
> + struct rcu_boost_dat *rbdp;
> + int cpu;
> + int i;
> +
> + for_each_possible_cpu(cpu) {
> + rbdp = per_cpu(rcu_boost_dat, cpu);
> + for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
> + rbdp[i].rbs_mutex =
> + RAW_SPIN_LOCK_UNLOCKED(rbdp[i].rbs_mutex);
> + INIT_LIST_HEAD(&rbdp[i].rbs_toboost);
> + INIT_LIST_HEAD(&rbdp[i].rbs_boosted);
> + rbdp[i].rbs_blocked = 0;
> + rbdp[i].rbs_boost_attempt = 0;
> + rbdp[i].rbs_boost = 0;
> + rbdp[i].rbs_unlock = 0;
> + rbdp[i].rbs_unboosted = 0;
> +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> + {
> + int j, k;
> +
> + for (j = 0; j < N_RCU_BOOST_DAT_EVENTS; j++)
> + for (k = 0; k <= N_RCU_BOOST_STATE; k++)
> + rbdp[i].rbs_stats[j][k] = 0;
> + }
> +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
> + }
> + smp_wmb();

Barrier without an accompanying comment. You want to make sure the structure
gets initialized before setting rcu_boost_idx and thus allowing boosting,
right?

> + rcu_boost_idx = 0;
> + }
> +}

> +/*
> + * Return the list on which the calling task should add itself, or
> + * NULL if too early during initialization.
> + */
> +static inline struct rcu_boost_dat *rcu_rbd_new(void)
> +{
> + int cpu = raw_smp_processor_id(); /* locks used, so preemption OK. */
> + int idx = rcu_boost_idx;
> +
> + smp_read_barrier_depends(); barrier();

Barriers without accompanying comments, though in this case they seem fairly
obvious.

> + if (unlikely(idx < 0))
> + return (NULL);
> + return &per_cpu(rcu_boost_dat, cpu)[idx];
> +}
> +
> +/*
> + * Return the list from which to boost target tasks.
> + * May only be invoked by the booster task, so guaranteed to
> + * already be initialized.
> + */
> +static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu)
> +{
> + int idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);

You defined rcu_boost_idx_boosting for exactly this expression.

> + return &per_cpu(rcu_boost_dat, cpu)[idx];

For that matter, since you wrapped it in the function, you could just call it
directly in this array index operation.

> +}

> +/*
> + * Priority-boost tasks stuck in RCU read-side critical sections as
> + * needed (presumably rarely).
> + */
> +static int rcu_booster(void *arg)
> +{
> + int cpu;
> + struct sched_param sp;
> +
> + sp.sched_priority = PREEMPT_RCU_BOOSTER_PRIO;
> + sched_setscheduler(current, SCHED_RR, &sp);
> + current->flags |= PF_NOFREEZE;
> +
> + do {
> +
> + /* Advance the lists of tasks. */
> +
> + rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS;
> + for_each_possible_cpu(cpu) {
> +
> + /*
> + * Boost all sufficiently aged readers.
> + * Readers must first be preempted or block
> + * on a mutex in an RCU read-side critical section,
> + * then remain in that critical section for
> + * RCU_BOOST_ELEMENTS-1 time intervals.
> + * So most of the time we should end up doing
> + * nothing.
> + */
> +
> + rcu_boost_one_reader_list(rcu_rbd_boosting(cpu));
> +
> + /*
> + * Large SMP systems may need to sleep sometimes
> + * in this loop. Or have multiple RCU-boost tasks.
> + */

The idea of multiple RCU-boost tasks gives me an idea: what about having
per-CPU boost queues, and using that to eliminate the need to lock the queues?
It might not do quite as good a job of scheduling, but it would eliminate
locking overhead *and* solve the problem you describe here.

> + }
> +
> + /*
> + * Sleep to allow any unstalled RCU read-side critical
> + * sections to age out of the list. @@@ FIXME: reduce,
> + * adjust, or eliminate in case of OOM.
> + */
> +
> + schedule_timeout_uninterruptible(HZ / 100);
> +
> + /* Print stats if enough time has passed. */
> +
> + rcu_boost_dat_stat_print();
> +
> + } while (!kthread_should_stop());
> +
> + return 0;
> +}
> +
> +/*
> + * Perform the portions of RCU-boost initialization that require the
> + * scheduler to be up and running.
> + */
> +void init_rcu_boost_late(void)
> +{
> + int i;
> +
> + /* Spawn RCU-boost task. */
> +
> + printk(KERN_ALERT "Starting RCU priority booster\n");

Judging by other initialization code, I think this belongs at KERN_INFO or
similar.

> + rcu_boost_task = kthread_run(rcu_booster, NULL, "RCU Prio Booster");
> + if (IS_ERR(rcu_boost_task)) {
> + i = PTR_ERR(rcu_boost_task);
> + printk(KERN_ALERT
> + "Unable to create RCU Priority Booster, errno %d\n", -i);

No need for a temporary here.

> +
> + /*
> + * Continue running, but tasks permanently blocked
> + * in RCU read-side critical sections will be able
> + * to stall grace-period processing, potentially
> + * OOMing the machine.
> + */
> +
> + rcu_boost_task = NULL;
> + }
> +}
> +
> +/*
> + * Update task's RCU-boost state to reflect blocking in RCU read-side
> + * critical section, so that the RCU-boost task can find it in case it
> + * later needs its priority boosted.
> + */
> +void __rcu_preempt_boost(void)
> +{
> + struct rcu_boost_dat *rbdp;
> + unsigned long oldirq;
> +
> + /* Identify list to place task on for possible later boosting. */
> +
> + local_irq_save(oldirq);
> + rbdp = rcu_rbd_new();
> + if (rbdp == NULL) {
> + local_irq_restore(oldirq);
> + printk("Preempted RCU read-side critical section too early.\n");

This printk lacks a priority.

> + return;
> + }
> + spin_lock(&rbdp->rbs_mutex);
> + rbdp->rbs_blocked++;
> +
> + /*
> + * Update state. We hold the lock and aren't yet on the list,
> + * so the booster cannot mess with us yet.
> + */
> +
> + rcu_boost_dat_stat_block(rbdp, current->rcub_state);
> + if (current->rcub_state != RCU_BOOST_IDLE) {
> +
> + /*
> + * We have been here before, so just update stats.
> + * It may seem strange to do all this work just to
> + * accumulate statistics, but this is such a
> + * low-probability code path that we shouldn't care.
> + * If it becomes a problem, it can be fixed.
> + */
> +
> + spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
> + return;
> + }
> + current->rcub_state = RCU_BOOST_BLOCKED;
> +
> + /* Now add ourselves to the list so that the booster can find use. */

Typo: s/use/us/

> +
> + list_add_tail(&current->rcub_entry, &rbdp->rbs_toboost);
> + current->rcub_rbdp = rbdp;
> + spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
> +}

> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rtmutex.c linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c
> --- linux-2.6.20-rc4-rt1/kernel/rtmutex.c 2007-01-09 10:59:54.000000000 -0800
> +++ linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c 2007-01-09 11:01:12.000000000 -0800
> @@ -105,11 +105,14 @@ static inline void init_lists(struct rt_
> */
> int rt_mutex_getprio(struct task_struct *task)
> {
> + int prio = task->normal_prio;
> +
> + if (get_rcu_prio(task) < prio)
> + prio = get_rcu_prio(task);

int prio = min(task->normal_prio, get_rcu_prio(task)); ?

> if (likely(!task_has_pi_waiters(task)))
> - return task->normal_prio;
> + return prio;
>
> - return min(task_top_pi_waiter(task)->pi_list_entry.prio,
> - task->normal_prio);
> + return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio);
> }

- Josh Triplett


Attachments:
signature.asc (252.00 B)
OpenPGP digital signature

2007-01-25 18:03:31

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture

On Thu, Jan 25, 2007 at 12:47:04AM -0800, Josh Triplett wrote:
> Paul E. McKenney wrote:
> > This patch adds an optional preemption kernel thread to the rcutorture
> > tests. This thread sets itself to a low RT priority and chews up
> > CPU in 10-second bursts, verifying that grace periods progress during
> > this 10-second interval. This has thus far passed about 30 hours of
> > RCU torture testing on a 4-CPU (a pair of 2-CPU dies) 64-bit Xeon
> > system.
> >
> > I am experimenting with more-vicious tests, but extra viciousness thus
> > far comes at the expense of grotesque code.
>
> Overall, the new feature seems like a good idea, and it should exercise the
> new RCU boosting code. Some comments below.

Thank you for the review!!!

> One major item: this new test feature really needs a new module parameter to
> enable or disable it.

CONFIG_PREEMPT_RCU_BOOST is the parameter -- if not set, then no test.
This parameter is provided by the accompanying RCU-boost patch.

> > diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcutorture.c linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c
> > --- linux-2.6.20-rc4-rt1/kernel/rcutorture.c 2007-01-09 10:59:54.000000000 -0800
> > +++ linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c 2007-01-23 11:27:49.000000000 -0800
> > @@ -194,6 +194,8 @@ struct rcu_torture_ops {
> > int (*completed)(void);
> > void (*deferredfree)(struct rcu_torture *p);
> > void (*sync)(void);
> > + void (*preemptstart)(void);
> > + void (*preemptend)(void);
> > int (*stats)(char *page);
> > char *name;
> > };
> > @@ -258,6 +260,71 @@ static void rcu_torture_deferred_free(st
> > call_rcu(&p->rtort_rcu, rcu_torture_cb);
> > }
> >
> > +#ifndef CONFIG_PREEMPT_RCU_BOOST
> > +static void rcu_preempt_start(void) { }
> > +static void rcu_preempt_end(void) { }
> > +static int rcu_preempt_stats(char *page) { return 0; }
> > +#else /* #ifndef CONFIG_PREEMPT_RCU_BOOST */
> > +
> > +static struct task_struct *rcu_preeempt_task;
> > +static long rcu_torture_preempt_errors = 0;
>
> Might as well make this an unsigned long; negative values wouldn't make sense.

Good point, fixed.

> > +static int rcu_torture_preempt(void *arg)
> > +{
> > + int completedstart;
> > + time_t gcstart;
> > + struct sched_param sp;
> > +
> > + sp.sched_priority = MAX_RT_PRIO - 1;
> > + sched_setscheduler(current, SCHED_RR, &sp);
> > + current->flags |= PF_NOFREEZE;
> > +
> > + do {
> > + completedstart = rcu_torture_completed();
> > + gcstart = xtime.tv_sec;
> > + while ((xtime.tv_sec - gcstart < 10) &&
> > + (rcu_torture_completed() == completedstart))
> > + cond_resched();
> > + if (rcu_torture_completed() == completedstart)
> > + rcu_torture_preempt_errors++;
> > + schedule_timeout_interruptible(shuffle_interval * HZ);
>
> Why call schedule_timeout_interruptible here without actually handling
> interruptions? So that you can send it a signal to cause the shuffle early?

It allows you to kill the process in order to get the module unload to
happen more quickly in case someone specified an overly long interval.
But now that you mention this, a simple one-second sleep is probably
appropriate here.

> > + } while (!kthread_should_stop());
> > + return NULL;
> > +}
> > +
> > +static void rcu_preempt_start(void)
> > +{
> > + rcu_preeempt_task = kthread_run(rcu_torture_preempt, NULL,
> > + "rcu_torture_preempt");
> > + if (IS_ERR(rcu_preeempt_task)) {
> > + VERBOSE_PRINTK_ERRSTRING("Failed to create preempter");
>
> This ought to include the errno value, PTR_ERR(rcu_preempt_task).

Good point -- what I should do is return this value so that
rcu_torture_init() can return it, failing the module-load process
and unwinding.

> > + rcu_preeempt_task = NULL;
> > + }
> > +}
> > +
> > +static void rcu_preempt_end(void)
> > +{
> > + if (rcu_preeempt_task != NULL) {
>
> if (rcu_preempt_task) would work just as well here.

True, but was being consistent with usage elsewhere in this file.

> > + VERBOSE_PRINTK_STRING("Stopping rcu_preempt task");
> > + kthread_stop(rcu_preeempt_task);
> > + }
> > + rcu_preeempt_task = NULL;
> > +}
> > +
> > +static int rcu_preempt_stats(char *page) {
> > + int cnt = 0;
> > +
> > + cnt += sprintf(&page[cnt],
> > + "Preemption stalls: %ld\n", rcu_torture_preempt_errors);
> > + return (cnt);
> > +}
>
> How about just:
> return sprintf(page, ...);
> ?

Good point.

> Also, if you decide to make rcu_torture_preempt_errors an unsigned long as
> suggested above, this should use %lu.

Done.

> > +#endif /* #else #ifndef CONFIG_PREEMPT_RCU_BOOST */
> > +
> > +static void rcu_preemptstart(void)
> > +{
> > +
> > +}
> > +
>
> This looks like a bit of stray code.

Yep, good eyes! Deleted.

> > static struct rcu_torture_ops rcu_ops = {
> > .init = NULL,
> > .cleanup = NULL,
> > @@ -267,7 +334,9 @@ static struct rcu_torture_ops rcu_ops =
> > .completed = rcu_torture_completed,
> > .deferredfree = rcu_torture_deferred_free,
> > .sync = synchronize_rcu,
> > - .stats = NULL,
> > + .preemptstart = rcu_preempt_start,
> > + .preemptend = rcu_preempt_end,
> > + .stats = rcu_preempt_stats,
> > .name = "rcu"
> > };
> >
> > @@ -306,6 +375,8 @@ static struct rcu_torture_ops rcu_sync_o
> > .completed = rcu_torture_completed,
> > .deferredfree = rcu_sync_torture_deferred_free,
> > .sync = synchronize_rcu,
> > + .preemptstart = NULL,
> > + .preemptend = NULL,
> > .stats = NULL,
> > .name = "rcu_sync"
> > };
>
> Much like other common structures such as struct file_operations, no need to
> explicitly specify members as NULL here; any member you don't specify will get
> a NULL value. That avoids the need to update every use of this structure
> whenever you add a new member used by only some of them.

Untrusting, aren't I? ;-)

I removed all the "= NULL" entries.

> > @@ -370,6 +441,8 @@ static struct rcu_torture_ops rcu_bh_ops
> > .completed = rcu_bh_torture_completed,
> > .deferredfree = rcu_bh_torture_deferred_free,
> > .sync = rcu_bh_torture_synchronize,
> > + .preemptstart = NULL,
> > + .preemptend = NULL,
> > .stats = NULL,
> > .name = "rcu_bh"
> > };
>
> Likewise.
>
> > @@ -383,6 +456,8 @@ static struct rcu_torture_ops rcu_bh_syn
> > .completed = rcu_bh_torture_completed,
> > .deferredfree = rcu_sync_torture_deferred_free,
> > .sync = rcu_bh_torture_synchronize,
> > + .preemptstart = NULL,
> > + .preemptend = NULL,
> > .stats = NULL,
> > .name = "rcu_bh_sync"
> > };
>
> Likewise.
>
> > @@ -464,6 +539,8 @@ static struct rcu_torture_ops srcu_ops =
> > .completed = srcu_torture_completed,
> > .deferredfree = rcu_sync_torture_deferred_free,
> > .sync = srcu_torture_synchronize,
> > + .preemptstart = NULL,
> > + .preemptend = NULL,
> > .stats = srcu_torture_stats,
> > .name = "srcu"
> > };
>
> Likewise.
>
> > @@ -502,6 +579,8 @@ static struct rcu_torture_ops sched_ops
> > .completed = sched_torture_completed,
> > .deferredfree = rcu_sync_torture_deferred_free,
> > .sync = sched_torture_synchronize,
> > + .preemptstart = NULL,
> > + .preemptend = NULL,
> > .stats = NULL,
> > .name = "sched"
>
> Likewise.
>
> > @@ -856,6 +935,8 @@ rcu_torture_cleanup(void)
> > kthread_stop(stats_task);
> > }
> > stats_task = NULL;
> > + if (cur_ops->preemptend != NULL)
>
> if (cur_ops->preemptend) would work as well.

True, though again there is a lot of existing "!= NULL" in this file
and elsewhere. Many thousands of them through the kernel. ;-)

> > @@ -997,6 +1078,8 @@ rcu_torture_init(void)
> > goto unwind;
> > }
> > }
> > + if (cur_ops->preemptstart != NULL)
>
> Likewise.

I will run this through the mill and repost.

Thanx, Paul

2007-01-25 19:06:38

by Josh Triplett

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture

Paul E. McKenney wrote:
> On Thu, Jan 25, 2007 at 12:47:04AM -0800, Josh Triplett wrote:
>> One major item: this new test feature really needs a new module parameter to
>> enable or disable it.
>
> CONFIG_PREEMPT_RCU_BOOST is the parameter -- if not set, then no test.
> This parameter is provided by the accompanying RCU-boost patch.

It seems useful for rcutorture to use or not use the preempting thread
independently of CONFIG_PREEMPT_RCU_BOOST. That would bring you from two
cases to four, and the two new cases both make sense:

* CONFIG_PREEMPT_RCU_BOOST=n, but run rcutorture with the preempting thread.
This configuration allows you to demonstrate the need for
CONFIG_PREEMPT_RCU_BOOST, by showing what happens when you need it and don't
have it.

* CONFIG_PREEMPT_RCU_BOOST=y, but run rcutorture without the preempting
thread. This configuration allows you to test with rcutorture while running
a *real* real-time workload rather than the simple preempting thread, or
just test basic RCU functionality.

A simple boolean module_param would work here.

At some point, we may want to add the ability to run multiple preempting
threads, but that doesn't need to happen for this patch.

>> Paul E. McKenney wrote:
>>> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcutorture.c linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c
>>> --- linux-2.6.20-rc4-rt1/kernel/rcutorture.c 2007-01-09 10:59:54.000000000 -0800
>>> +++ linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c 2007-01-23 11:27:49.000000000 -0800

>>> +static int rcu_torture_preempt(void *arg)
>>> +{
>>> + int completedstart;
>>> + time_t gcstart;
>>> + struct sched_param sp;
>>> +
>>> + sp.sched_priority = MAX_RT_PRIO - 1;
>>> + sched_setscheduler(current, SCHED_RR, &sp);
>>> + current->flags |= PF_NOFREEZE;
>>> +
>>> + do {
>>> + completedstart = rcu_torture_completed();
>>> + gcstart = xtime.tv_sec;
>>> + while ((xtime.tv_sec - gcstart < 10) &&
>>> + (rcu_torture_completed() == completedstart))
>>> + cond_resched();
>>> + if (rcu_torture_completed() == completedstart)
>>> + rcu_torture_preempt_errors++;
>>> + schedule_timeout_interruptible(shuffle_interval * HZ);
>> Why call schedule_timeout_interruptible here without actually handling
>> interruptions? So that you can send it a signal to cause the shuffle early?
>
> It allows you to kill the process in order to get the module unload to
> happen more quickly in case someone specified an overly long interval.

I didn't actually know that you could kill a kthread from userspace. :)

That rationale makes sense.

> But now that you mention this, a simple one-second sleep is probably
> appropriate here.

OK.

>>> + } while (!kthread_should_stop());
>>> + return NULL;
>>> +}
>>> +
>>> +static void rcu_preempt_start(void)
>>> +{
>>> + rcu_preeempt_task = kthread_run(rcu_torture_preempt, NULL,
>>> + "rcu_torture_preempt");
>>> + if (IS_ERR(rcu_preeempt_task)) {
>>> + VERBOSE_PRINTK_ERRSTRING("Failed to create preempter");
>> This ought to include the errno value, PTR_ERR(rcu_preempt_task).
>
> Good point -- what I should do is return this value so that
> rcu_torture_init() can return it, failing the module-load process
> and unwinding.

Even better, yes.

>>> + rcu_preeempt_task = NULL;
>>> + }
>>> +}
>>> +
>>> +static void rcu_preempt_end(void)
>>> +{
>>> + if (rcu_preeempt_task != NULL) {
>> if (rcu_preempt_task) would work just as well here.
>
> True, but was being consistent with usage elsewhere in this file.

Fair enough; don't worry about it for this patch, then. I'll deal with that
particular style cleanup later, throughout rcutorture.

>>> static struct rcu_torture_ops rcu_ops = {
>>> .init = NULL,
>>> .cleanup = NULL,
>>> @@ -267,7 +334,9 @@ static struct rcu_torture_ops rcu_ops =
>>> .completed = rcu_torture_completed,
>>> .deferredfree = rcu_torture_deferred_free,
>>> .sync = synchronize_rcu,
>>> - .stats = NULL,
>>> + .preemptstart = rcu_preempt_start,
>>> + .preemptend = rcu_preempt_end,
>>> + .stats = rcu_preempt_stats,
>>> .name = "rcu"
>>> };
>>>
>>> @@ -306,6 +375,8 @@ static struct rcu_torture_ops rcu_sync_o
>>> .completed = rcu_torture_completed,
>>> .deferredfree = rcu_sync_torture_deferred_free,
>>> .sync = synchronize_rcu,
>>> + .preemptstart = NULL,
>>> + .preemptend = NULL,
>>> .stats = NULL,
>>> .name = "rcu_sync"
>>> };
>> Much like other common structures such as struct file_operations, no need to
>> explicitly specify members as NULL here; any member you don't specify will get
>> a NULL value. That avoids the need to update every use of this structure
>> whenever you add a new member used by only some of them.
>
> Untrusting, aren't I? ;-)

Heh. I have that problem as well; I always hestitate to trust the compiler to
initialize values.

> I removed all the "= NULL" entries.

Thanks.

>>> @@ -856,6 +935,8 @@ rcu_torture_cleanup(void)
>>> kthread_stop(stats_task);
>>> }
>>> stats_task = NULL;
>>> + if (cur_ops->preemptend != NULL)
>> if (cur_ops->preemptend) would work as well.
>
> True, though again there is a lot of existing "!= NULL" in this file
> and elsewhere. Many thousands of them through the kernel. ;-)

As before, don't worry about it for this patch then.

> I will run this through the mill and repost.

Thanks!

- Josh Triplett

2007-01-25 19:58:32

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing

On Thu, Jan 25, 2007 at 01:29:23AM -0800, Josh Triplett wrote:
> Overall, this code looks sensible to me. Some comments on the patch below.
>
> Paul E. McKenney wrote:
> > --- linux-2.6.20-rc4-rt1/include/linux/rcupreempt.h 2007-01-09 10:59:54.000000000 -0800
> > +++ linux-2.6.20-rc4-rt1-rcub/include/linux/rcupreempt.h 2007-01-09 11:01:12.000000000 -0800
> [...]
> > +enum rcu_boost_state {
> > + RCU_BOOST_IDLE = 0, /* Not yet blocked if in RCU read-side. */
> > + RCU_BOOST_BLOCKED = 1, /* Blocked from RCU read-side. */
> > + RCU_BOOSTED = 2, /* Boosting complete. */
> > +};
> > +
> > +#define N_RCU_BOOST_STATE (RCU_BOOSTED + 1)
>
> Here, you define the three possible states, the number of states. But
> later...
>
> > diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcupreempt.c linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c
> > --- linux-2.6.20-rc4-rt1/kernel/rcupreempt.c 2007-01-09 10:59:54.000000000 -0800
> > +++ linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c 2007-01-23 15:41:33.000000000 -0800
> > @@ -49,6 +49,7 @@
> [...]
> > +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> > + long rbs_stats[N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE + 1];
> > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
>
> ...you declare this array to have space for one more than that, and then...
>
> > +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> > +
> > +/*
> > + * Function to increment indicated ->rbs_stats[] element.
> > + */
> > +static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp,
> > + int event,
> > + enum rcu_boost_state oldstate)
> > +{
> > + if (oldstate >= RCU_BOOST_IDLE &&
> > + oldstate <= RCU_BOOSTED) {
> > + rbdp->rbs_stats[event][oldstate]++;
> > + } else {
> > + rbdp->rbs_stats[event][N_RCU_BOOST_STATE]++;
>
> ...you use the last element as what looks like an "unknown" counter. How
> about instead defining the enum to have an "unknown" state, defining
> N_RCU_BOOST_STATE accordingly, and using N_RCU_BOOST_STATE without the +1 in
> rbs_stats? I think that would make the code much more self-documenting, and
> less error-prone.

Excellent point. I have used the N+1 trick for some decades, so perhaps
it is time to retire it anyway. ;-)

> > +#define rcu_boost_dat_stat_block(rbdp, oldstate) \
> > + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BLOCK, oldstate)
> > +#define rcu_boost_dat_stat_boost(rbdp, oldstate) \
> > + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BOOST, oldstate)
> > +#define rcu_boost_dat_stat_unlock(rbdp, oldstate) \
> > + rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_UNLOCK, oldstate)
> > +
> > +/*
> > + * Prefix for kprint() strings for periodic statistics messages.
> > + */
> > +static char *rcu_boost_state_event[] = {
> > + "block: ",
> > + "boost: ",
> > + "unlock: ",
> > +};
>
> I don't know for sure, but I *think* that GCC may sometimes generate more
> efficient code if you use a char [][], rather than a char *[].

Hmmm... "char *[]" says "array of strings" to me, while char[][] seems
more opaque. Might be just me, though.

> > +
> > +/*
> > + * Indicators for numbers in kprint() strings. "!" indicates a state-event
> > + * pair that should not happen, while "?" indicates a state that should
> > + * not happen.
> > + */
> > +static char *rcu_boost_state_error[] = {
> > + /*ibBe*/
> > + " ?", /* block */
> > + "! ?", /* boost */
> > + "? ?", /* unlock */
> > +};
>
> Likewise. Also, the columns here don't seem to line up, probably because the
> comment uses spaces and the other lines use a tab.

Yeah, the "*" of the comment has to line up with the leading quote of
the following strings. I bumped everything over a space's worth, so
that each line now has a leading tab.

> > +static void rcu_boost_dat_stat_print(void)
> > +{
> > + char buf[N_RCU_BOOST_STATE * (sizeof(long) * 3 + 2) + 2];
>
> This expression really begs for some constants in place of the magic numbers,
> or failing that, a comment.

How about the following?

/* Three decimal digits per byte plus spacing per number and line. */

> > + int cpu;
> > + int event;
> > + int i;
> > + static time_t lastprint = 0;
> > + struct rcu_boost_dat *rbdp;
> > + int state;
> > + struct rcu_boost_dat sum;
> > +
> > + /* Wait a graceful interval between printk spamming. */
> > +
> > + if (xtime.tv_sec - lastprint <
> > + CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL)
> > + return;
>
> How about printk_ratelimit? It takes a parameter for the rate. (On the other
> hand, it also takes a spinlock, which you might not want here, even in the
> stats code.)

It also allows bursts. printk_ratelimit() seems to be designed to
prevent log overflow due to excessive error rates. In contrast I want
to print stats at a more-or-less regular interval, and I don't want
to incur the overhead of gathering and formatting stats if they aren't
going to print.

> > + /* Print them out! */
> > +
> > + printk(KERN_ALERT
> > + "rcu_boost_dat: idx=%d "
> > + "b=%ld ul=%ld ub=%ld boost: a=%ld b=%ld\n",
> > + rcu_boost_idx,
> > + sum.rbs_blocked, sum.rbs_unlock, sum.rbs_unboosted,
> > + sum.rbs_boost_attempt, sum.rbs_boost);
> > + for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) {
> > + i = 0;
> > + for (state = 0; state <= N_RCU_BOOST_STATE; state++) {
> > + i += sprintf(&buf[i], " %ld%c",
> > + sum.rbs_stats[event][state],
> > + rcu_boost_state_error[event][state]);
> > + }
> > + printk(KERN_ALERT "rcu_boost_dat %s %s\n",
> > + rcu_boost_state_event[event], buf);
> > + }
>
> Most or all of these counters could likely become unsigned, since negative
> values don't make sense for them.

Good point, fixed.

> > + /* Go away and don't come back for awhile. */
> > +
> > + lastprint = xtime.tv_sec;
> > +}
> > +
> > +/*
> > + * Return the current boost index for boosting target tasks.
> > + * May only be invoked by the booster task, so guaranteed to
> > + * already be initialized.
> > + */
> > +static inline int rcu_boost_idx_boosting(void)
> > +{
> > + return (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
> > +}
>
> Quoted for later reference.

Right. Removed this unused function, good eyes.

> > +/*
> > + * Initialize RCU-boost state. This happens early in the boot process,
> > + * when the scheduler does not yet exist. So don't try to use it.
> > + */
> > +static void init_rcu_boost_early(void)
> > +{
> > + struct rcu_boost_dat *rbdp;
> > + int cpu;
> > + int i;
> > +
> > + for_each_possible_cpu(cpu) {
> > + rbdp = per_cpu(rcu_boost_dat, cpu);
> > + for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
> > + rbdp[i].rbs_mutex =
> > + RAW_SPIN_LOCK_UNLOCKED(rbdp[i].rbs_mutex);
> > + INIT_LIST_HEAD(&rbdp[i].rbs_toboost);
> > + INIT_LIST_HEAD(&rbdp[i].rbs_boosted);
> > + rbdp[i].rbs_blocked = 0;
> > + rbdp[i].rbs_boost_attempt = 0;
> > + rbdp[i].rbs_boost = 0;
> > + rbdp[i].rbs_unlock = 0;
> > + rbdp[i].rbs_unboosted = 0;
> > +#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
> > + {
> > + int j, k;
> > +
> > + for (j = 0; j < N_RCU_BOOST_DAT_EVENTS; j++)
> > + for (k = 0; k <= N_RCU_BOOST_STATE; k++)
> > + rbdp[i].rbs_stats[j][k] = 0;
> > + }
> > +#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
> > + }
> > + smp_wmb();
>
> Barrier without an accompanying comment. You want to make sure the structure
> gets initialized before setting rcu_boost_idx and thus allowing boosting,
> right?

Excellent point! How about the following:

smp_wmb(); /* Make sure readers see above initialization. */
rcu_boost_idx = 0; /* Allow readers to access data. */

> > + rcu_boost_idx = 0;
> > + }
> > +}
>
> > +/*
> > + * Return the list on which the calling task should add itself, or
> > + * NULL if too early during initialization.
> > + */
> > +static inline struct rcu_boost_dat *rcu_rbd_new(void)
> > +{
> > + int cpu = raw_smp_processor_id(); /* locks used, so preemption OK. */
> > + int idx = rcu_boost_idx;
> > +
> > + smp_read_barrier_depends(); barrier();
>
> Barriers without accompanying comments, though in this case they seem fairly
> obvious.

It probably won't be obvious to me six months from now. :-/

How about the following?

smp_read_barrier_depends(); barrier(); /* rmb() on Alpha for idx. */

I was tempted to use rcu_dereference() here and rcu_assign_pointer() above,
but I was afraid that confusion would result.

> > + if (unlikely(idx < 0))
> > + return (NULL);
> > + return &per_cpu(rcu_boost_dat, cpu)[idx];
> > +}
> > +
> > +/*
> > + * Return the list from which to boost target tasks.
> > + * May only be invoked by the booster task, so guaranteed to
> > + * already be initialized.
> > + */
> > +static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu)
> > +{
> > + int idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
>
> You defined rcu_boost_idx_boosting for exactly this expression.
>
> > + return &per_cpu(rcu_boost_dat, cpu)[idx];
>
> For that matter, since you wrapped it in the function, you could just call it
> directly in this array index operation.

Since this is the only use of that function, I left it inline and added
a "Use rcu_boost_dat element least recently the destination for task
blocking in RCU read-side critical sections" to rcu_rbd_boosting()'s
comment header.

Seem reasonable?

> > +}
>
> > +/*
> > + * Priority-boost tasks stuck in RCU read-side critical sections as
> > + * needed (presumably rarely).
> > + */
> > +static int rcu_booster(void *arg)
> > +{
> > + int cpu;
> > + struct sched_param sp;
> > +
> > + sp.sched_priority = PREEMPT_RCU_BOOSTER_PRIO;
> > + sched_setscheduler(current, SCHED_RR, &sp);
> > + current->flags |= PF_NOFREEZE;
> > +
> > + do {
> > +
> > + /* Advance the lists of tasks. */
> > +
> > + rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS;
> > + for_each_possible_cpu(cpu) {
> > +
> > + /*
> > + * Boost all sufficiently aged readers.
> > + * Readers must first be preempted or block
> > + * on a mutex in an RCU read-side critical section,
> > + * then remain in that critical section for
> > + * RCU_BOOST_ELEMENTS-1 time intervals.
> > + * So most of the time we should end up doing
> > + * nothing.
> > + */
> > +
> > + rcu_boost_one_reader_list(rcu_rbd_boosting(cpu));
> > +
> > + /*
> > + * Large SMP systems may need to sleep sometimes
> > + * in this loop. Or have multiple RCU-boost tasks.
> > + */
>
> The idea of multiple RCU-boost tasks gives me an idea: what about having
> per-CPU boost queues, and using that to eliminate the need to lock the queues?
> It might not do quite as good a job of scheduling, but it would eliminate
> locking overhead *and* solve the problem you describe here.

Nice try! ;-)

The problem is that the tasks on a given queue might migrate to some other
CPU before they do their rcu_read_unlock(). We already have per-CPU boost
queues, but their purpose is to promote cache locality in the (hopefully)
common case where the task's block time is short enough that it runs on
the same CPU. Should also reduce memory contention should there be a
flurry of blocks on many CPUs from within RCU read-side critical sections.

But locking is still required, sad to say!

Unless I am misunderstanding your suggestion, in which case please
enlighten me.

> > + }
> > +
> > + /*
> > + * Sleep to allow any unstalled RCU read-side critical
> > + * sections to age out of the list. @@@ FIXME: reduce,
> > + * adjust, or eliminate in case of OOM.
> > + */
> > +
> > + schedule_timeout_uninterruptible(HZ / 100);
> > +
> > + /* Print stats if enough time has passed. */
> > +
> > + rcu_boost_dat_stat_print();
> > +
> > + } while (!kthread_should_stop());
> > +
> > + return 0;
> > +}
> > +
> > +/*
> > + * Perform the portions of RCU-boost initialization that require the
> > + * scheduler to be up and running.
> > + */
> > +void init_rcu_boost_late(void)
> > +{
> > + int i;
> > +
> > + /* Spawn RCU-boost task. */
> > +
> > + printk(KERN_ALERT "Starting RCU priority booster\n");
>
> Judging by other initialization code, I think this belongs at KERN_INFO or
> similar.

Works for me!

> > + rcu_boost_task = kthread_run(rcu_booster, NULL, "RCU Prio Booster");
> > + if (IS_ERR(rcu_boost_task)) {
> > + i = PTR_ERR(rcu_boost_task);
> > + printk(KERN_ALERT
> > + "Unable to create RCU Priority Booster, errno %d\n", -i);
>
> No need for a temporary here.

Good point, fixed!

> > +
> > + /*
> > + * Continue running, but tasks permanently blocked
> > + * in RCU read-side critical sections will be able
> > + * to stall grace-period processing, potentially
> > + * OOMing the machine.
> > + */
> > +
> > + rcu_boost_task = NULL;
> > + }
> > +}
> > +
> > +/*
> > + * Update task's RCU-boost state to reflect blocking in RCU read-side
> > + * critical section, so that the RCU-boost task can find it in case it
> > + * later needs its priority boosted.
> > + */
> > +void __rcu_preempt_boost(void)
> > +{
> > + struct rcu_boost_dat *rbdp;
> > + unsigned long oldirq;
> > +
> > + /* Identify list to place task on for possible later boosting. */
> > +
> > + local_irq_save(oldirq);
> > + rbdp = rcu_rbd_new();
> > + if (rbdp == NULL) {
> > + local_irq_restore(oldirq);
> > + printk("Preempted RCU read-side critical section too early.\n");
>
> This printk lacks a priority.

Good catch! I chose KERN_ALERT, but please let me know if one of the
others would be more appropriate.

> > + return;
> > + }
> > + spin_lock(&rbdp->rbs_mutex);
> > + rbdp->rbs_blocked++;
> > +
> > + /*
> > + * Update state. We hold the lock and aren't yet on the list,
> > + * so the booster cannot mess with us yet.
> > + */
> > +
> > + rcu_boost_dat_stat_block(rbdp, current->rcub_state);
> > + if (current->rcub_state != RCU_BOOST_IDLE) {
> > +
> > + /*
> > + * We have been here before, so just update stats.
> > + * It may seem strange to do all this work just to
> > + * accumulate statistics, but this is such a
> > + * low-probability code path that we shouldn't care.
> > + * If it becomes a problem, it can be fixed.
> > + */
> > +
> > + spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
> > + return;
> > + }
> > + current->rcub_state = RCU_BOOST_BLOCKED;
> > +
> > + /* Now add ourselves to the list so that the booster can find use. */
>
> Typo: s/use/us/

Goode catche, fixede!

> > +
> > + list_add_tail(&current->rcub_entry, &rbdp->rbs_toboost);
> > + current->rcub_rbdp = rbdp;
> > + spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
> > +}
>
> > diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rtmutex.c linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c
> > --- linux-2.6.20-rc4-rt1/kernel/rtmutex.c 2007-01-09 10:59:54.000000000 -0800
> > +++ linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c 2007-01-09 11:01:12.000000000 -0800
> > @@ -105,11 +105,14 @@ static inline void init_lists(struct rt_
> > */
> > int rt_mutex_getprio(struct task_struct *task)
> > {
> > + int prio = task->normal_prio;
> > +
> > + if (get_rcu_prio(task) < prio)
> > + prio = get_rcu_prio(task);
>
> int prio = min(task->normal_prio, get_rcu_prio(task)); ?

Good point -- and it even fits nicely on the declaration line for prio! ;-)

> > if (likely(!task_has_pi_waiters(task)))
> > - return task->normal_prio;
> > + return prio;
> >
> > - return min(task_top_pi_waiter(task)->pi_list_entry.prio,
> > - task->normal_prio);
> > + return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio);
> > }

Thank you again for the careful and thorough review!!!

I will test these changes and send out an update.

Thanx, Paul

2007-01-26 01:47:29

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing

On Thu, Jan 25, 2007 at 11:58:16AM -0800, Paul E. McKenney wrote:
> On Thu, Jan 25, 2007 at 01:29:23AM -0800, Josh Triplett wrote:
> > Overall, this code looks sensible to me. Some comments on the patch below.

[ . . . ]

> Thank you again for the careful and thorough review!!!
>
> I will test these changes and send out an update.

And here is the updated RCU priority-boost patch.

Signed-off-by: Paul E. McKenney <[email protected]>
---

include/linux/init_task.h | 12 +
include/linux/rcupdate.h | 12 +
include/linux/rcupreempt.h | 20 +
include/linux/sched.h | 16 +
init/main.c | 1
kernel/Kconfig.preempt | 32 ++
kernel/fork.c | 6
kernel/rcupreempt.c | 528 +++++++++++++++++++++++++++++++++++++++++++++
kernel/rtmutex.c | 7
kernel/sched.c | 5
10 files changed, 636 insertions(+), 3 deletions(-)

diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/include/linux/init_task.h linux-2.6.20-rc4-rt1-rcub/include/linux/init_task.h
--- linux-2.6.20-rc4-rt1/include/linux/init_task.h 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/include/linux/init_task.h 2007-01-09 11:01:12.000000000 -0800
@@ -87,6 +87,17 @@ extern struct nsproxy init_nsproxy;
.siglock = __SPIN_LOCK_UNLOCKED(sighand.siglock), \
}

+#ifdef CONFIG_PREEMPT_RCU_BOOST
+#define INIT_RCU_BOOST_PRIO .rcu_prio = MAX_PRIO,
+#define INIT_PREEMPT_RCU_BOOST(tsk) \
+ .rcub_rbdp = NULL, \
+ .rcub_state = RCU_BOOST_IDLE, \
+ .rcub_entry = LIST_HEAD_INIT(tsk.rcub_entry),
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+#define INIT_RCU_BOOST_PRIO
+#define INIT_PREEMPT_RCU_BOOST(tsk)
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
extern struct group_info init_groups;

/*
@@ -143,6 +154,7 @@ extern struct group_info init_groups;
.pi_lock = RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
INIT_TRACE_IRQFLAGS \
INIT_LOCKDEP \
+ INIT_PREEMPT_RCU_BOOST(tsk) \
}


diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/include/linux/rcupdate.h linux-2.6.20-rc4-rt1-rcub/include/linux/rcupdate.h
--- linux-2.6.20-rc4-rt1/include/linux/rcupdate.h 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/include/linux/rcupdate.h 2007-01-09 11:01:12.000000000 -0800
@@ -227,6 +227,18 @@ extern void rcu_barrier(void);
extern void rcu_init(void);
extern void rcu_advance_callbacks(int cpu, int user);
extern void rcu_check_callbacks(int cpu, int user);
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+extern void init_rcu_boost_late(void);
+extern void __rcu_preempt_boost(void);
+#define rcu_preempt_boost() \
+ do { \
+ if (unlikely(current->rcu_read_lock_nesting > 0)) \
+ __rcu_preempt_boost(); \
+ } while (0)
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+#define init_rcu_boost_late()
+#define rcu_preempt_boost()
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */

#endif /* __KERNEL__ */
#endif /* __LINUX_RCUPDATE_H */
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/include/linux/rcupreempt.h linux-2.6.20-rc4-rt1-rcub/include/linux/rcupreempt.h
--- linux-2.6.20-rc4-rt1/include/linux/rcupreempt.h 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/include/linux/rcupreempt.h 2007-01-25 16:30:46.000000000 -0800
@@ -42,6 +42,26 @@
#include <linux/cpumask.h>
#include <linux/seqlock.h>

+#ifdef CONFIG_PREEMPT_RCU_BOOST
+/*
+ * Task state with respect to being RCU-boosted. This state is changed
+ * by the task itself in response to the following three events:
+ * 1. Preemption (or block on lock) while in RCU read-side critical section.
+ * 2. Outermost rcu_read_unlock() for blocked RCU read-side critical section.
+ *
+ * The RCU-boost task also updates the state when boosting priority.
+ */
+enum rcu_boost_state {
+ RCU_BOOST_IDLE = 0, /* Not yet blocked if in RCU read-side. */
+ RCU_BOOST_BLOCKED = 1, /* Blocked from RCU read-side. */
+ RCU_BOOSTED = 2, /* Boosting complete. */
+ RCU_BOOST_INVALID = 3, /* For bogus state sightings. */
+};
+
+#define N_RCU_BOOST_STATE (RCU_BOOST_INVALID + 1)
+
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
#define rcu_qsctr_inc(cpu)
#define rcu_bh_qsctr_inc(cpu)
#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/include/linux/sched.h linux-2.6.20-rc4-rt1-rcub/include/linux/sched.h
--- linux-2.6.20-rc4-rt1/include/linux/sched.h 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/include/linux/sched.h 2007-01-09 11:01:12.000000000 -0800
@@ -699,6 +699,14 @@ struct signal_struct {
#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))

+#ifdef CONFIG_PREEMPT_RCU_BOOST
+#define set_rcu_prio(p, prio) ((p)->rcu_prio = (prio))
+#define get_rcu_prio(p) ((p)->rcu_prio)
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
+#define set_rcu_prio(p, prio) do { } while (0)
+#define get_rcu_prio(p) MAX_PRIO
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST */
+
/*
* Some day this will be a full-fledged user tracking system..
*/
@@ -982,6 +990,9 @@ struct task_struct {
#endif
int load_weight; /* for niceness load balancing purposes */
int prio, static_prio, normal_prio;
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+ int rcu_prio;
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */
struct list_head run_list;
struct prio_array *array;

@@ -1003,6 +1014,11 @@ struct task_struct {
atomic_t *rcu_flipctr1;
atomic_t *rcu_flipctr2;
#endif
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+ struct rcu_boost_dat *rcub_rbdp;
+ enum rcu_boost_state rcub_state;
+ struct list_head rcub_entry;
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST */

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/init/main.c linux-2.6.20-rc4-rt1-rcub/init/main.c
--- linux-2.6.20-rc4-rt1/init/main.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/init/main.c 2007-01-09 11:01:12.000000000 -0800
@@ -712,6 +712,7 @@ static void __init do_basic_setup(void)
init_workqueues();
usermodehelper_init();
driver_init();
+ init_rcu_boost_late();

#ifdef CONFIG_SYSCTL
sysctl_init();
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/Kconfig.preempt linux-2.6.20-rc4-rt1-rcub/kernel/Kconfig.preempt
--- linux-2.6.20-rc4-rt1/kernel/Kconfig.preempt 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/Kconfig.preempt 2007-01-09 11:01:12.000000000 -0800
@@ -176,3 +176,35 @@ config RCU_TRACE

Say Y here if you want to enable RCU tracing
Say N if you are unsure.
+
+config PREEMPT_RCU_BOOST
+ bool "Enable priority boosting of RCU read-side critical sections"
+ depends on PREEMPT_RCU
+ default n
+ help
+ This option permits priority boosting of RCU read-side critical
+ sections that have been preempted in order to prevent indefinite
+ delay of grace periods in face of runaway non-realtime processes.
+
+ Say N if you are unsure.
+
+config PREEMPT_RCU_BOOST_STATS
+ bool "Enable RCU priority-boosting statistic printing"
+ depends on PREEMPT_RCU_BOOST
+ default n
+ help
+ This option enables debug printk()s of RCU boost statistics,
+ which are normally only used to debug RCU priority boost
+ implementations.
+
+ Say N if you are unsure.
+
+config PREEMPT_RCU_BOOST_STATS_INTERVAL
+ int "RCU priority-boosting statistic printing interval (seconds)"
+ depends on PREEMPT_RCU_BOOST_STATS
+ default 100
+ range 10 86400
+ help
+ This option controls the timing of debug printk()s of RCU boost
+ statistics, which are normally only used to debug RCU priority
+ boost implementations.
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/fork.c linux-2.6.20-rc4-rt1-rcub/kernel/fork.c
--- linux-2.6.20-rc4-rt1/kernel/fork.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/fork.c 2007-01-09 11:01:12.000000000 -0800
@@ -1129,6 +1129,12 @@ static struct task_struct *copy_process(
p->softirq_context = 0;
#endif
p->pagefault_disabled = 0;
+#ifdef CONFIG_PREEMPT_RCU_BOOST
+ p->rcu_prio = MAX_PRIO;
+ p->rcub_rbdp = NULL;
+ p->rcub_state = RCU_BOOST_IDLE;
+ INIT_LIST_HEAD(&p->rcub_entry);
+#endif
#ifdef CONFIG_LOCKDEP
p->lockdep_depth = 0; /* no locks held yet */
p->curr_chain_key = 0;
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcupreempt.c linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c
--- linux-2.6.20-rc4-rt1/kernel/rcupreempt.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/rcupreempt.c 2007-01-25 11:53:47.000000000 -0800
@@ -49,6 +49,7 @@
#include <linux/byteorder/swabb.h>
#include <linux/cpumask.h>
#include <linux/rcupreempt_trace.h>
+#include <linux/kthread.h>

/*
* PREEMPT_RCU data structures.
@@ -80,6 +81,526 @@ static struct rcu_ctrlblk rcu_ctrlblk =
static DEFINE_PER_CPU(atomic_t [2], rcu_flipctr) =
{ ATOMIC_INIT(0), ATOMIC_INIT(0) };

+#ifndef CONFIG_PREEMPT_RCU_BOOST
+static inline void init_rcu_boost_early(void) { }
+static inline void rcu_read_unlock_unboost(void) { }
+#else /* #ifndef CONFIG_PREEMPT_RCU_BOOST */
+
+/* Defines possible event indices for ->rbs_stats[] (first index). */
+
+#define RCU_BOOST_DAT_BLOCK 0
+#define RCU_BOOST_DAT_BOOST 1
+#define RCU_BOOST_DAT_UNLOCK 2
+#define N_RCU_BOOST_DAT_EVENTS 3
+
+/* RCU-boost per-CPU array element. */
+
+struct rcu_boost_dat {
+ raw_spinlock_t rbs_mutex;
+ struct list_head rbs_toboost;
+ struct list_head rbs_boosted;
+ unsigned long rbs_blocked;
+ unsigned long rbs_boost_attempt;
+ unsigned long rbs_boost;
+ unsigned long rbs_unlock;
+ unsigned long rbs_unboosted;
+#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
+ unsigned long rbs_stats[N_RCU_BOOST_DAT_EVENTS][N_RCU_BOOST_STATE];
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+};
+#define RCU_BOOST_ELEMENTS 4
+
+int rcu_boost_idx = -1; /* invalid value in case someone uses RCU early. */
+DEFINE_PER_CPU(struct rcu_boost_dat, rcu_boost_dat[RCU_BOOST_ELEMENTS]);
+static struct task_struct *rcu_boost_task = NULL;
+
+#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
+
+/*
+ * Function to increment indicated ->rbs_stats[] element.
+ */
+static inline void rcu_boost_dat_stat(struct rcu_boost_dat *rbdp,
+ int event,
+ enum rcu_boost_state oldstate)
+{
+ if (oldstate >= RCU_BOOST_IDLE &&
+ oldstate <= RCU_BOOSTED) {
+ rbdp->rbs_stats[event][oldstate]++;
+ } else {
+ rbdp->rbs_stats[event][RCU_BOOST_INVALID]++;
+ }
+}
+
+#define rcu_boost_dat_stat_block(rbdp, oldstate) \
+ rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BLOCK, oldstate)
+#define rcu_boost_dat_stat_boost(rbdp, oldstate) \
+ rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_BOOST, oldstate)
+#define rcu_boost_dat_stat_unlock(rbdp, oldstate) \
+ rcu_boost_dat_stat(rbdp, RCU_BOOST_DAT_UNLOCK, oldstate)
+
+/*
+ * Prefix for kprint() strings for periodic statistics messages.
+ */
+static char *rcu_boost_state_event[] = {
+ "block: ",
+ "boost: ",
+ "unlock: ",
+};
+
+/*
+ * Indicators for numbers in kprint() strings. "!" indicates a state-event
+ * pair that should not happen, while "?" indicates a state that should
+ * not happen.
+ */
+static char *rcu_boost_state_error[] = {
+ /*ibBe*/
+ " ?", /* block */
+ "! ?", /* boost */
+ "? ?", /* unlock */
+};
+
+/*
+ * Print out RCU booster task statistics at the specified interval.
+ */
+static void rcu_boost_dat_stat_print(void)
+{
+ /* Three decimal digits per byte plus spacing per number and line. */
+ char buf[N_RCU_BOOST_STATE * (sizeof(long) * 3 + 2) + 2];
+ int cpu;
+ int event;
+ int i;
+ static time_t lastprint = 0;
+ struct rcu_boost_dat *rbdp;
+ int state;
+ struct rcu_boost_dat sum;
+
+ /* Wait a graceful interval between printk spamming. */
+
+ if (xtime.tv_sec - lastprint <
+ CONFIG_PREEMPT_RCU_BOOST_STATS_INTERVAL)
+ return;
+
+ /* Sum up the state/event-independent counters. */
+
+ sum.rbs_blocked = 0;
+ sum.rbs_boost_attempt = 0;
+ sum.rbs_boost = 0;
+ sum.rbs_unlock = 0;
+ sum.rbs_unboosted = 0;
+ for_each_possible_cpu(cpu)
+ for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
+ rbdp = per_cpu(rcu_boost_dat, cpu);
+ sum.rbs_blocked += rbdp[i].rbs_blocked;
+ sum.rbs_boost_attempt += rbdp[i].rbs_boost_attempt;
+ sum.rbs_boost += rbdp[i].rbs_boost;
+ sum.rbs_unlock += rbdp[i].rbs_unlock;
+ sum.rbs_unboosted += rbdp[i].rbs_unboosted;
+ }
+
+ /* Sum up the state/event-dependent counters. */
+
+ for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++)
+ for (state = 0; state < N_RCU_BOOST_STATE; state++) {
+ sum.rbs_stats[event][state] = 0;
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
+ sum.rbs_stats[event][state]
+ += per_cpu(rcu_boost_dat,
+ cpu)[i].rbs_stats[event][state];
+ }
+ }
+ }
+
+ /* Print them out! */
+
+ printk(KERN_ALERT
+ "rcu_boost_dat: idx=%d "
+ "b=%lu ul=%lu ub=%lu boost: a=%lu b=%lu\n",
+ rcu_boost_idx,
+ sum.rbs_blocked, sum.rbs_unlock, sum.rbs_unboosted,
+ sum.rbs_boost_attempt, sum.rbs_boost);
+ for (event = 0; event < N_RCU_BOOST_DAT_EVENTS; event++) {
+ i = 0;
+ for (state = 0; state < N_RCU_BOOST_STATE; state++) {
+ i += sprintf(&buf[i], " %ld%c",
+ sum.rbs_stats[event][state],
+ rcu_boost_state_error[event][state]);
+ }
+ printk(KERN_ALERT "rcu_boost_dat %s %s\n",
+ rcu_boost_state_event[event], buf);
+ }
+
+ /* Go away and don't come back for awhile. */
+
+ lastprint = xtime.tv_sec;
+}
+
+#else /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+
+#define rcu_boost_dat_stat_block(rbdp, oldstate)
+#define rcu_boost_dat_stat_boost(rbdp, oldstate)
+#define rcu_boost_dat_stat_unlock(rbdp, oldstate)
+#define rcu_boost_dat_stat_print()
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+
+/*
+ * Initialize RCU-boost state. This happens early in the boot process,
+ * when the scheduler does not yet exist. So don't try to use it.
+ */
+static void init_rcu_boost_early(void)
+{
+ struct rcu_boost_dat *rbdp;
+ int cpu;
+ int i;
+
+ for_each_possible_cpu(cpu) {
+ rbdp = per_cpu(rcu_boost_dat, cpu);
+ for (i = 0; i < RCU_BOOST_ELEMENTS; i++) {
+ rbdp[i].rbs_mutex =
+ RAW_SPIN_LOCK_UNLOCKED(rbdp[i].rbs_mutex);
+ INIT_LIST_HEAD(&rbdp[i].rbs_toboost);
+ INIT_LIST_HEAD(&rbdp[i].rbs_boosted);
+ rbdp[i].rbs_blocked = 0;
+ rbdp[i].rbs_boost_attempt = 0;
+ rbdp[i].rbs_boost = 0;
+ rbdp[i].rbs_unlock = 0;
+ rbdp[i].rbs_unboosted = 0;
+#ifdef CONFIG_PREEMPT_RCU_BOOST_STATS
+ {
+ int j, k;
+
+ for (j = 0; j < N_RCU_BOOST_DAT_EVENTS; j++)
+ for (k = 0; k < N_RCU_BOOST_STATE; k++)
+ rbdp[i].rbs_stats[j][k] = 0;
+ }
+#endif /* #ifdef CONFIG_PREEMPT_RCU_BOOST_STATS */
+ }
+ smp_wmb(); /* Make sure readers see above initialization. */
+ rcu_boost_idx = 0; /* Allow readers to access data. */
+ }
+}
+
+/*
+ * Return the list on which the calling task should add itself, or
+ * NULL if too early during initialization.
+ */
+static inline struct rcu_boost_dat *rcu_rbd_new(void)
+{
+ int cpu = raw_smp_processor_id(); /* locks used, so preemption OK. */
+ int idx = rcu_boost_idx;
+
+ smp_read_barrier_depends(); barrier(); /* rmb() on Alpha for idx. */
+ if (unlikely(idx < 0))
+ return (NULL);
+ return &per_cpu(rcu_boost_dat, cpu)[idx];
+}
+
+/*
+ * Return the list from which to boost target tasks.
+ * May only be invoked by the booster task, so guaranteed to
+ * already be initialized. Use rcu_boost_dat element least recently
+ * the destination for task blocking in RCU read-side critical sections.
+ */
+static inline struct rcu_boost_dat *rcu_rbd_boosting(int cpu)
+{
+ int idx = (rcu_boost_idx + 1) & (RCU_BOOST_ELEMENTS - 1);
+
+ return &per_cpu(rcu_boost_dat, cpu)[idx];
+}
+
+#define PREEMPT_RCU_BOOSTER_PRIO 49 /* Match curr_irq_prio manually. */
+ /* Administrators can always adjust */
+ /* via the /proc interface. */
+
+/*
+ * Boost the specified task from an RCU viewpoint.
+ * Boost the target task to a priority just a bit less-favored than
+ * that of the RCU-boost task, but boost to a realtime priority even
+ * if the RCU-boost task is running at a non-realtime priority.
+ * We check the priority of the RCU-boost task each time we boost
+ * in case the sysadm manually changes the priority.
+ */
+static void rcu_boost_prio(struct task_struct *taskp)
+{
+ unsigned long oldirq;
+ int rcuprio;
+
+ spin_lock_irqsave(&current->pi_lock, oldirq);
+ rcuprio = rt_mutex_getprio(current) + 1;
+ if (rcuprio >= MAX_USER_RT_PRIO)
+ rcuprio = MAX_USER_RT_PRIO - 1;
+ spin_unlock_irqrestore(&current->pi_lock, oldirq);
+ spin_lock_irqsave(&taskp->pi_lock, oldirq);
+ if (taskp->rcu_prio != rcuprio) {
+ taskp->rcu_prio = rcuprio;
+ if (taskp->rcu_prio != taskp->prio)
+ rt_mutex_setprio(taskp, taskp->rcu_prio);
+ }
+ spin_unlock_irqrestore(&taskp->pi_lock, oldirq);
+}
+
+/*
+ * Unboost the specified task from an RCU viewpoint.
+ */
+static void rcu_unboost_prio(struct task_struct *taskp)
+{
+ int nprio;
+ unsigned long oldirq;
+
+ spin_lock_irqsave(&taskp->pi_lock, oldirq);
+ taskp->rcu_prio = MAX_PRIO;
+ nprio = rt_mutex_getprio(taskp);
+ if (nprio > taskp->prio)
+ rt_mutex_setprio(taskp, nprio);
+ spin_unlock_irqrestore(&taskp->pi_lock, oldirq);
+}
+
+/*
+ * Boost all of the RCU-reader tasks on the specified list.
+ */
+static void rcu_boost_one_reader_list(struct rcu_boost_dat *rbdp)
+{
+ LIST_HEAD(list);
+ unsigned long oldirq;
+ struct task_struct *taskp;
+
+ /*
+ * Splice both lists onto a local list. We will still
+ * need to hold the lock when manipulating the local list
+ * because tasks can remove themselves at any time.
+ * The reason for splicing the rbs_boosted list is that
+ * our priority may have changed, so reboosting may be
+ * required.
+ */
+
+ spin_lock_irqsave(&rbdp->rbs_mutex, oldirq);
+ list_splice_init(&rbdp->rbs_toboost, &list);
+ list_splice_init(&rbdp->rbs_boosted, &list);
+ while (!list_empty(&list)) {
+
+ /*
+ * Pause for a bit before boosting each task.
+ * @@@FIXME: reduce/eliminate pausing in case of OOM.
+ */
+
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+ schedule_timeout_uninterruptible(1);
+ spin_lock_irqsave(&rbdp->rbs_mutex, oldirq);
+
+ /*
+ * All tasks might have removed themselves while
+ * we were waiting. Recheck list emptiness.
+ */
+
+ if (list_empty(&list))
+ break;
+
+ /* Remove first task in local list, count the attempt. */
+
+ taskp = list_entry(list.next, typeof(*taskp), rcub_entry);
+ list_del_init(&taskp->rcub_entry);
+ rbdp->rbs_boost_attempt++;
+
+ /* Ignore tasks in unexpected states. */
+
+ if (taskp->rcub_state == RCU_BOOST_IDLE) {
+ list_add_tail(&taskp->rcub_entry, &rbdp->rbs_toboost);
+ rcu_boost_dat_stat_boost(rbdp, taskp->rcub_state);
+ continue;
+ }
+
+ /* Boost the task's priority. */
+
+ rcu_boost_prio(taskp);
+ rbdp->rbs_boost++;
+ rcu_boost_dat_stat_boost(rbdp, taskp->rcub_state);
+ taskp->rcub_state = RCU_BOOSTED;
+ list_add_tail(&taskp->rcub_entry, &rbdp->rbs_boosted);
+ }
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+}
+
+/*
+ * Priority-boost tasks stuck in RCU read-side critical sections as
+ * needed (presumably rarely).
+ */
+static int rcu_booster(void *arg)
+{
+ int cpu;
+ struct sched_param sp;
+
+ sp.sched_priority = PREEMPT_RCU_BOOSTER_PRIO;
+ sched_setscheduler(current, SCHED_RR, &sp);
+ current->flags |= PF_NOFREEZE;
+
+ do {
+
+ /* Advance the lists of tasks. */
+
+ rcu_boost_idx = (rcu_boost_idx + 1) % RCU_BOOST_ELEMENTS;
+ for_each_possible_cpu(cpu) {
+
+ /*
+ * Boost all sufficiently aged readers.
+ * Readers must first be preempted or block
+ * on a mutex in an RCU read-side critical section,
+ * then remain in that critical section for
+ * RCU_BOOST_ELEMENTS-1 time intervals.
+ * So most of the time we should end up doing
+ * nothing.
+ */
+
+ rcu_boost_one_reader_list(rcu_rbd_boosting(cpu));
+
+ /*
+ * Large SMP systems may need to sleep sometimes
+ * in this loop. Or have multiple RCU-boost tasks.
+ */
+ }
+
+ /*
+ * Sleep to allow any unstalled RCU read-side critical
+ * sections to age out of the list. @@@ FIXME: reduce,
+ * adjust, or eliminate in case of OOM.
+ */
+
+ schedule_timeout_uninterruptible(HZ / 100);
+
+ /* Print stats if enough time has passed. */
+
+ rcu_boost_dat_stat_print();
+
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+/*
+ * Perform the portions of RCU-boost initialization that require the
+ * scheduler to be up and running.
+ */
+void init_rcu_boost_late(void)
+{
+
+ /* Spawn RCU-boost task. */
+
+ printk(KERN_INFO "Starting RCU priority booster\n");
+ rcu_boost_task = kthread_run(rcu_booster, NULL, "RCU Prio Booster");
+ if (IS_ERR(rcu_boost_task)) {
+ printk(KERN_ALERT
+ "Unable to create RCU Priority Booster, errno %d\n",
+ -PTR_ERR(rcu_boost_task));
+
+ /*
+ * Continue running, but tasks permanently blocked
+ * in RCU read-side critical sections will be able
+ * to stall grace-period processing, potentially
+ * OOMing the machine.
+ */
+
+ rcu_boost_task = NULL;
+ }
+}
+
+/*
+ * Update task's RCU-boost state to reflect blocking in RCU read-side
+ * critical section, so that the RCU-boost task can find it in case it
+ * later needs its priority boosted.
+ */
+void __rcu_preempt_boost(void)
+{
+ struct rcu_boost_dat *rbdp;
+ unsigned long oldirq;
+
+ /* Identify list to place task on for possible later boosting. */
+
+ local_irq_save(oldirq);
+ rbdp = rcu_rbd_new();
+ if (rbdp == NULL) {
+ local_irq_restore(oldirq);
+ printk(KERN_ALERT
+ "Preempted RCU read-side critical section too early.\n");
+ return;
+ }
+ spin_lock(&rbdp->rbs_mutex);
+ rbdp->rbs_blocked++;
+
+ /*
+ * Update state. We hold the lock and aren't yet on the list,
+ * so the booster cannot mess with us yet.
+ */
+
+ rcu_boost_dat_stat_block(rbdp, current->rcub_state);
+ if (current->rcub_state != RCU_BOOST_IDLE) {
+
+ /*
+ * We have been here before, so just update stats.
+ * It may seem strange to do all this work just to
+ * accumulate statistics, but this is such a
+ * low-probability code path that we shouldn't care.
+ * If it becomes a problem, it can be fixed.
+ */
+
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+ return;
+ }
+ current->rcub_state = RCU_BOOST_BLOCKED;
+
+ /* Now add ourselves to the list so that the booster can find us. */
+
+ list_add_tail(&current->rcub_entry, &rbdp->rbs_toboost);
+ current->rcub_rbdp = rbdp;
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+}
+
+/*
+ * Do the list-removal and priority-unboosting "heavy lifting" when
+ * required.
+ */
+static void __rcu_read_unlock_unboost(void)
+{
+ unsigned long oldirq;
+ struct rcu_boost_dat *rbdp;
+
+ /* Identify the list structure and acquire the corresponding lock. */
+
+ rbdp = current->rcub_rbdp;
+ spin_lock_irqsave(&rbdp->rbs_mutex, oldirq);
+
+ /* Remove task from the list it was on. */
+
+ list_del_init(&current->rcub_entry);
+ rbdp->rbs_unlock++;
+ current->rcub_rbdp = NULL;
+
+ /* Record stats, unboost if needed, and update state. */
+
+ rcu_boost_dat_stat_unlock(rbdp, current->rcub_state);
+ if (current->rcub_state == RCU_BOOSTED) {
+ rcu_unboost_prio(current);
+ rbdp->rbs_unboosted++;
+ }
+ current->rcub_state = RCU_BOOST_IDLE;
+ spin_unlock_irqrestore(&rbdp->rbs_mutex, oldirq);
+}
+
+/*
+ * Do any state changes and unboosting needed for rcu_read_unlock().
+ * Pass any complex work on to __rcu_read_unlock_unboost().
+ * The vast majority of the time, no work will be needed, as preemption
+ * and blocking within RCU read-side critical sections is comparatively
+ * rare.
+ */
+static inline void rcu_read_unlock_unboost(void)
+{
+
+ if (unlikely(current->rcub_state != RCU_BOOST_IDLE))
+ __rcu_read_unlock_unboost();
+}
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU_BOOST */
+
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
@@ -155,6 +676,7 @@ void __rcu_read_unlock(void)
atomic_dec(current->rcu_flipctr2);
current->rcu_flipctr2 = NULL;
}
+ rcu_read_unlock_unboost();
}

local_irq_restore(oldirq);
@@ -345,6 +867,11 @@ int notrace rcu_pending(int cpu)
rcu_data.nextlist != NULL);
}

+/*
+ * Initialize RCU. This is called very early in boot, so is restricted
+ * to very simple operations. Don't even think about messing with anything
+ * that involves the scheduler, as it doesn't exist yet.
+ */
void __init __rcu_init(void)
{
/*&&&&*/printk("WARNING: experimental RCU implementation.\n");
@@ -356,6 +883,7 @@ void __init __rcu_init(void)
rcu_data.waittail = &rcu_data.waitlist;
rcu_data.donelist = NULL;
rcu_data.donetail = &rcu_data.donelist;
+ init_rcu_boost_early();
tasklet_init(&rcu_data.rcu_tasklet, rcu_process_callbacks, 0UL);
}

diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rtmutex.c linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c
--- linux-2.6.20-rc4-rt1/kernel/rtmutex.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/rtmutex.c 2007-01-25 12:35:44.000000000 -0800
@@ -105,11 +105,12 @@ static inline void init_lists(struct rt_
*/
int rt_mutex_getprio(struct task_struct *task)
{
+ int prio = min(task->normal_prio, get_rcu_prio(task));
+
if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
+ return prio;

- return min(task_top_pi_waiter(task)->pi_list_entry.prio,
- task->normal_prio);
+ return min(task_top_pi_waiter(task)->pi_list_entry.prio, prio);
}

/*
diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/sched.c linux-2.6.20-rc4-rt1-rcub/kernel/sched.c
--- linux-2.6.20-rc4-rt1/kernel/sched.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcub/kernel/sched.c 2007-01-09 11:01:12.000000000 -0800
@@ -1962,6 +1962,7 @@ void fastcall sched_fork(struct task_str
* Make sure we do not leak PI boosting priority to the child:
*/
p->prio = current->normal_prio;
+ set_rcu_prio(p, MAX_PRIO);

INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
@@ -2044,6 +2045,7 @@ void fastcall wake_up_new_task(struct ta
else {
p->prio = current->prio;
p->normal_prio = current->normal_prio;
+ set_rcu_prio(p, MAX_PRIO);
__activate_task_after(p, current, rq);
}
set_need_resched();
@@ -3986,6 +3988,8 @@ void __sched __schedule(void)
}
profile_hit(SCHED_PROFILING, __builtin_return_address(0));

+ rcu_preempt_boost();
+
preempt_disable(); // FIXME: disable irqs here
prev = current;
release_kernel_lock(prev);
@@ -5725,6 +5729,7 @@ void __cpuinit init_idle(struct task_str
idle->sleep_avg = 0;
idle->array = NULL;
idle->prio = idle->normal_prio = MAX_PRIO;
+ set_rcu_prio(idle, MAX_PRIO);
idle->state = TASK_RUNNING;
idle->cpus_allowed = cpumask_of_cpu(cpu);
set_task_cpu(idle, cpu);

2007-01-26 01:52:27

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture

On Thu, Jan 25, 2007 at 11:06:35AM -0800, Josh Triplett wrote:
> Paul E. McKenney wrote:
> > On Thu, Jan 25, 2007 at 12:47:04AM -0800, Josh Triplett wrote:
> >> One major item: this new test feature really needs a new module parameter to
> >> enable or disable it.
> >
> > CONFIG_PREEMPT_RCU_BOOST is the parameter -- if not set, then no test.
> > This parameter is provided by the accompanying RCU-boost patch.
>
> It seems useful for rcutorture to use or not use the preempting thread
> independently of CONFIG_PREEMPT_RCU_BOOST. That would bring you from two
> cases to four, and the two new cases both make sense:
>
> * CONFIG_PREEMPT_RCU_BOOST=n, but run rcutorture with the preempting thread.
> This configuration allows you to demonstrate the need for
> CONFIG_PREEMPT_RCU_BOOST, by showing what happens when you need it and don't
> have it.
>
> * CONFIG_PREEMPT_RCU_BOOST=y, but run rcutorture without the preempting
> thread. This configuration allows you to test with rcutorture while running
> a *real* real-time workload rather than the simple preempting thread, or
> just test basic RCU functionality.
>
> A simple boolean module_param would work here.

OK, sold! I will add this. Perhaps CONFIG_PREEMPT_RCU_TORTURE.

> At some point, we may want to add the ability to run multiple preempting
> threads, but that doesn't need to happen for this patch.

I considered that for this initial round, but you only need to preempt
a single RCU reader to force the RCU booster to do something. ;-)

> >> Paul E. McKenney wrote:
> >>> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcutorture.c linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c
> >>> --- linux-2.6.20-rc4-rt1/kernel/rcutorture.c 2007-01-09 10:59:54.000000000 -0800
> >>> +++ linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c 2007-01-23 11:27:49.000000000 -0800
>
> >>> +static int rcu_torture_preempt(void *arg)
> >>> +{
> >>> + int completedstart;
> >>> + time_t gcstart;
> >>> + struct sched_param sp;
> >>> +
> >>> + sp.sched_priority = MAX_RT_PRIO - 1;
> >>> + sched_setscheduler(current, SCHED_RR, &sp);
> >>> + current->flags |= PF_NOFREEZE;
> >>> +
> >>> + do {
> >>> + completedstart = rcu_torture_completed();
> >>> + gcstart = xtime.tv_sec;
> >>> + while ((xtime.tv_sec - gcstart < 10) &&
> >>> + (rcu_torture_completed() == completedstart))
> >>> + cond_resched();
> >>> + if (rcu_torture_completed() == completedstart)
> >>> + rcu_torture_preempt_errors++;
> >>> + schedule_timeout_interruptible(shuffle_interval * HZ);
> >> Why call schedule_timeout_interruptible here without actually handling
> >> interruptions? So that you can send it a signal to cause the shuffle early?
> >
> > It allows you to kill the process in order to get the module unload to
> > happen more quickly in case someone specified an overly long interval.
>
> I didn't actually know that you could kill a kthread from userspace. :)
>
> That rationale makes sense.

It won't actually die, but if I understand correctly (a big "if") the
signal would cause schedule_timeout_interruptible() to return, allowing
the kthread_should_stop() check to happen.

> > But now that you mention this, a simple one-second sleep is probably
> > appropriate here.
>
> OK.
>
> >>> + } while (!kthread_should_stop());
> >>> + return NULL;
> >>> +}
> >>> +
> >>> +static void rcu_preempt_start(void)
> >>> +{
> >>> + rcu_preeempt_task = kthread_run(rcu_torture_preempt, NULL,
> >>> + "rcu_torture_preempt");
> >>> + if (IS_ERR(rcu_preeempt_task)) {
> >>> + VERBOSE_PRINTK_ERRSTRING("Failed to create preempter");
> >> This ought to include the errno value, PTR_ERR(rcu_preempt_task).
> >
> > Good point -- what I should do is return this value so that
> > rcu_torture_init() can return it, failing the module-load process
> > and unwinding.
>
> Even better, yes.
>
> >>> + rcu_preeempt_task = NULL;
> >>> + }
> >>> +}
> >>> +
> >>> +static void rcu_preempt_end(void)
> >>> +{
> >>> + if (rcu_preeempt_task != NULL) {
> >> if (rcu_preempt_task) would work just as well here.
> >
> > True, but was being consistent with usage elsewhere in this file.
>
> Fair enough; don't worry about it for this patch, then. I'll deal with that
> particular style cleanup later, throughout rcutorture.

Sounds good to me! ;-)

> >>> static struct rcu_torture_ops rcu_ops = {
> >>> .init = NULL,
> >>> .cleanup = NULL,
> >>> @@ -267,7 +334,9 @@ static struct rcu_torture_ops rcu_ops =
> >>> .completed = rcu_torture_completed,
> >>> .deferredfree = rcu_torture_deferred_free,
> >>> .sync = synchronize_rcu,
> >>> - .stats = NULL,
> >>> + .preemptstart = rcu_preempt_start,
> >>> + .preemptend = rcu_preempt_end,
> >>> + .stats = rcu_preempt_stats,
> >>> .name = "rcu"
> >>> };
> >>>
> >>> @@ -306,6 +375,8 @@ static struct rcu_torture_ops rcu_sync_o
> >>> .completed = rcu_torture_completed,
> >>> .deferredfree = rcu_sync_torture_deferred_free,
> >>> .sync = synchronize_rcu,
> >>> + .preemptstart = NULL,
> >>> + .preemptend = NULL,
> >>> .stats = NULL,
> >>> .name = "rcu_sync"
> >>> };
> >> Much like other common structures such as struct file_operations, no need to
> >> explicitly specify members as NULL here; any member you don't specify will get
> >> a NULL value. That avoids the need to update every use of this structure
> >> whenever you add a new member used by only some of them.
> >
> > Untrusting, aren't I? ;-)
>
> Heh. I have that problem as well; I always hestitate to trust the compiler to
> initialize values.
>
> > I removed all the "= NULL" entries.
>
> Thanks.
>
> >>> @@ -856,6 +935,8 @@ rcu_torture_cleanup(void)
> >>> kthread_stop(stats_task);
> >>> }
> >>> stats_task = NULL;
> >>> + if (cur_ops->preemptend != NULL)
> >> if (cur_ops->preemptend) would work as well.
> >
> > True, though again there is a lot of existing "!= NULL" in this file
> > and elsewhere. Many thousands of them through the kernel. ;-)
>
> As before, don't worry about it for this patch then.
>
> > I will run this through the mill and repost.

But with the new kernel parameter this time. ;-)

Thanx, Paul

2007-01-26 06:13:48

by Josh Triplett

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 1/2] RCU priority boosting that survives semi-vicious testing

Paul E. McKenney wrote:
> On Thu, Jan 25, 2007 at 11:58:16AM -0800, Paul E. McKenney wrote:
>> On Thu, Jan 25, 2007 at 01:29:23AM -0800, Josh Triplett wrote:
>>> Overall, this code looks sensible to me. Some comments on the patch below.
>
> [ . . . ]
>
>> Thank you again for the careful and thorough review!!!
>>
>> I will test these changes and send out an update.
>
> And here is the updated RCU priority-boost patch.

Thanks for integrating my changes, and so quickly! :)

> Signed-off-by: Paul E. McKenney <[email protected]>

Acked-by: Josh Triplett <[email protected]>

- Josh Triplett

2007-01-26 06:30:04

by Josh Triplett

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture

Paul E. McKenney wrote:
> On Thu, Jan 25, 2007 at 11:06:35AM -0800, Josh Triplett wrote:
>> Paul E. McKenney wrote:
>>> On Thu, Jan 25, 2007 at 12:47:04AM -0800, Josh Triplett wrote:
>>>> One major item: this new test feature really needs a new module parameter to
>>>> enable or disable it.
>>> CONFIG_PREEMPT_RCU_BOOST is the parameter -- if not set, then no test.
>>> This parameter is provided by the accompanying RCU-boost patch.
>> It seems useful for rcutorture to use or not use the preempting thread
>> independently of CONFIG_PREEMPT_RCU_BOOST. That would bring you from two
>> cases to four, and the two new cases both make sense:
>>
>> * CONFIG_PREEMPT_RCU_BOOST=n, but run rcutorture with the preempting thread.
>> This configuration allows you to demonstrate the need for
>> CONFIG_PREEMPT_RCU_BOOST, by showing what happens when you need it and don't
>> have it.
>>
>> * CONFIG_PREEMPT_RCU_BOOST=y, but run rcutorture without the preempting
>> thread. This configuration allows you to test with rcutorture while running
>> a *real* real-time workload rather than the simple preempting thread, or
>> just test basic RCU functionality.
>>
>> A simple boolean module_param would work here.
>
> OK, sold! I will add this. Perhaps CONFIG_PREEMPT_RCU_TORTURE.

Why a config option? Why not a module parameter, settable at module load time?

static int enable_preempter;
...
module_param(enable_preempter, bool, 0);
MODULE_PARM_DESC(enable_preempter, "Enable preempting thread, to test RCU priority boosting");
...
rcu_torture_cleanup(void)
{
...
if (enable_preempter && cur_ops->preemptend)
cur_ops->preemptend();
...
if (enable_preempter && cur_ops->preemptstart)
cur_ops->preemptstart();

Then just remove the #ifdef CONFIG_PREEMPT_RCU_BOOST from rcutorture entirely,
and always supply the preempter functions. rcutorture then doesn't depend on
CONFIG_PREEMPT_RCU_BOOST at all, and the module parameter determines whether
to run the preempter thread.

>>>> Paul E. McKenney wrote:
>>>>> diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcutorture.c linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c
>>>>> --- linux-2.6.20-rc4-rt1/kernel/rcutorture.c 2007-01-09 10:59:54.000000000 -0800
>>>>> +++ linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c 2007-01-23 11:27:49.000000000 -0800
>>>>> +static int rcu_torture_preempt(void *arg)
>>>>> +{
>>>>> + int completedstart;
>>>>> + time_t gcstart;
>>>>> + struct sched_param sp;
>>>>> +
>>>>> + sp.sched_priority = MAX_RT_PRIO - 1;
>>>>> + sched_setscheduler(current, SCHED_RR, &sp);
>>>>> + current->flags |= PF_NOFREEZE;
>>>>> +
>>>>> + do {
>>>>> + completedstart = rcu_torture_completed();
>>>>> + gcstart = xtime.tv_sec;
>>>>> + while ((xtime.tv_sec - gcstart < 10) &&
>>>>> + (rcu_torture_completed() == completedstart))
>>>>> + cond_resched();
>>>>> + if (rcu_torture_completed() == completedstart)
>>>>> + rcu_torture_preempt_errors++;
>>>>> + schedule_timeout_interruptible(shuffle_interval * HZ);
>>>> Why call schedule_timeout_interruptible here without actually handling
>>>> interruptions? So that you can send it a signal to cause the shuffle early?
>>> It allows you to kill the process in order to get the module unload to
>>> happen more quickly in case someone specified an overly long interval.
>> I didn't actually know that you could kill a kthread from userspace. :)
>>
>> That rationale makes sense.
>
> It won't actually die, but if I understand correctly (a big "if") the
> signal would cause schedule_timeout_interruptible() to return, allowing
> the kthread_should_stop() check to happen.

Ah, that makes much more sense; thanks.

- Josh Triplett

2007-01-29 02:11:17

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture

On Thu, Jan 25, 2007 at 11:06:35AM -0800, Josh Triplett wrote:
> Paul E. McKenney wrote:
> > On Thu, Jan 25, 2007 at 12:47:04AM -0800, Josh Triplett wrote:
> >> One major item: this new test feature really needs a new module parameter to
> >> enable or disable it.
> >
> > CONFIG_PREEMPT_RCU_BOOST is the parameter -- if not set, then no test.
> > This parameter is provided by the accompanying RCU-boost patch.
>
> It seems useful for rcutorture to use or not use the preempting thread
> independently of CONFIG_PREEMPT_RCU_BOOST. That would bring you from two
> cases to four, and the two new cases both make sense:
>
> * CONFIG_PREEMPT_RCU_BOOST=n, but run rcutorture with the preempting thread.
> This configuration allows you to demonstrate the need for
> CONFIG_PREEMPT_RCU_BOOST, by showing what happens when you need it and don't
> have it.
>
> * CONFIG_PREEMPT_RCU_BOOST=y, but run rcutorture without the preempting
> thread. This configuration allows you to test with rcutorture while running
> a *real* real-time workload rather than the simple preempting thread, or
> just test basic RCU functionality.
>
> A simple boolean module_param would work here.

OK, am finally with you. See below for updated patch.

> At some point, we may want to add the ability to run multiple preempting
> threads, but that doesn't need to happen for this patch.

Or to make it bounce around onto multiple CPUs, I suppose. Leaving
these out for the moment. ;-)

Signed-off-by: Paul E. McKenney <[email protected]>
---

rcutorture.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 76 insertions(+), 14 deletions(-)

diff -urpNa -X dontdiff linux-2.6.20-rc4-rt1/kernel/rcutorture.c linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c
--- linux-2.6.20-rc4-rt1/kernel/rcutorture.c 2007-01-09 10:59:54.000000000 -0800
+++ linux-2.6.20-rc4-rt1-rcubtorture/kernel/rcutorture.c 2007-01-26 22:16:13.000000000 -0800
@@ -58,6 +58,7 @@ static int stat_interval; /* Interval be
static int verbose; /* Print more debug info. */
static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
+static int preempt_torture = 0; /* Realtime task preempts torture readers. */
static char *torture_type = "rcu"; /* What RCU implementation to torture. */

module_param(nreaders, int, 0);
@@ -72,6 +73,8 @@ module_param(test_no_idle_hz, bool, 0);
MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
module_param(shuffle_interval, int, 0);
MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
+module_param(preempt_torture, bool, 0);
+MODULE_PARM_DESC(preempt_torture, "Enable realtime preemption torture");
module_param(torture_type, charp, 0);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");

@@ -194,6 +197,8 @@ struct rcu_torture_ops {
int (*completed)(void);
void (*deferredfree)(struct rcu_torture *p);
void (*sync)(void);
+ long (*preemptstart)(void);
+ void (*preemptend)(void);
int (*stats)(char *page);
char *name;
};
@@ -258,16 +263,74 @@ static void rcu_torture_deferred_free(st
call_rcu(&p->rtort_rcu, rcu_torture_cb);
}

+static struct task_struct *rcu_preeempt_task;
+static unsigned long rcu_torture_preempt_errors = 0;
+
+static int rcu_torture_preempt(void *arg)
+{
+ int completedstart;
+ int err;
+ time_t gcstart;
+ struct sched_param sp;
+
+ sp.sched_priority = MAX_RT_PRIO - 1;
+ err = sched_setscheduler(current, SCHED_RR, &sp);
+ if (err != 0)
+ printk(KERN_ALERT "rcu_torture_preempt() priority err: %d\n",
+ err);
+ current->flags |= PF_NOFREEZE;
+
+ do {
+ completedstart = rcu_torture_completed();
+ gcstart = xtime.tv_sec;
+ while ((xtime.tv_sec - gcstart < 10) &&
+ (rcu_torture_completed() == completedstart))
+ cond_resched();
+ if (rcu_torture_completed() == completedstart)
+ rcu_torture_preempt_errors++;
+ schedule_timeout_interruptible(HZ);
+ } while (!kthread_should_stop());
+ return NULL;
+}
+
+static long rcu_preempt_start(void)
+{
+ long retval = 0;
+
+ rcu_preeempt_task = kthread_run(rcu_torture_preempt, NULL,
+ "rcu_torture_preempt");
+ if (IS_ERR(rcu_preeempt_task)) {
+ VERBOSE_PRINTK_ERRSTRING("Failed to create preempter");
+ retval = PTR_ERR(rcu_preeempt_task);
+ rcu_preeempt_task = NULL;
+ }
+ return retval;
+}
+
+static void rcu_preempt_end(void)
+{
+ if (rcu_preeempt_task != NULL) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_preempt task");
+ kthread_stop(rcu_preeempt_task);
+ }
+ rcu_preeempt_task = NULL;
+}
+
+static int rcu_preempt_stats(char *page) {
+ return sprintf(page,
+ "Preemption stalls: %lu\n", rcu_torture_preempt_errors);
+}
+
static struct rcu_torture_ops rcu_ops = {
- .init = NULL,
- .cleanup = NULL,
.readlock = rcu_torture_read_lock,
.readdelay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
.completed = rcu_torture_completed,
.deferredfree = rcu_torture_deferred_free,
.sync = synchronize_rcu,
- .stats = NULL,
+ .preemptstart = rcu_preempt_start,
+ .preemptend = rcu_preempt_end,
+ .stats = rcu_preempt_stats,
.name = "rcu"
};

@@ -299,14 +362,12 @@ static void rcu_sync_torture_init(void)

static struct rcu_torture_ops rcu_sync_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = rcu_torture_read_lock,
.readdelay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
.completed = rcu_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = synchronize_rcu,
- .stats = NULL,
.name = "rcu_sync"
};

@@ -362,28 +423,23 @@ static void rcu_bh_torture_synchronize(v
}

static struct rcu_torture_ops rcu_bh_ops = {
- .init = NULL,
- .cleanup = NULL,
.readlock = rcu_bh_torture_read_lock,
.readdelay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
.completed = rcu_bh_torture_completed,
.deferredfree = rcu_bh_torture_deferred_free,
.sync = rcu_bh_torture_synchronize,
- .stats = NULL,
.name = "rcu_bh"
};

static struct rcu_torture_ops rcu_bh_sync_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = rcu_bh_torture_read_lock,
.readdelay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
.completed = rcu_bh_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = rcu_bh_torture_synchronize,
- .stats = NULL,
.name = "rcu_bh_sync"
};

@@ -495,14 +551,12 @@ static void sched_torture_synchronize(vo

static struct rcu_torture_ops sched_ops = {
.init = rcu_sync_torture_init,
- .cleanup = NULL,
.readlock = sched_torture_read_lock,
.readdelay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = sched_torture_read_unlock,
.completed = sched_torture_completed,
.deferredfree = rcu_sync_torture_deferred_free,
.sync = sched_torture_synchronize,
- .stats = NULL,
.name = "sched"
};

@@ -801,9 +855,10 @@ rcu_torture_print_module_parms(char *tag
printk(KERN_ALERT "%s" TORTURE_FLAG
"--- %s: nreaders=%d nfakewriters=%d "
"stat_interval=%d verbose=%d test_no_idle_hz=%d "
- "shuffle_interval = %d\n",
+ "shuffle_interval=%d preempt_torture=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
- stat_interval, verbose, test_no_idle_hz, shuffle_interval);
+ stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+ preempt_torture);
}

static void
@@ -856,6 +911,8 @@ rcu_torture_cleanup(void)
kthread_stop(stats_task);
}
stats_task = NULL;
+ if (preempt_torture && (cur_ops->preemptend != NULL))
+ cur_ops->preemptend();

/* Wait for all RCU callbacks to fire. */
rcu_barrier();
@@ -997,6 +1054,11 @@ rcu_torture_init(void)
goto unwind;
}
}
+ if (preempt_torture && (cur_ops->preemptstart != NULL)) {
+ firsterr = cur_ops->preemptstart();
+ if (firsterr != 0)
+ goto unwind;
+ }
return 0;

unwind:

2007-01-29 06:05:34

by Josh Triplett

[permalink] [raw]
Subject: Re: [RFC PATCH -rt 2/2] RCU priority boosting additions to rcutorture

Paul E. McKenney wrote:
> On Thu, Jan 25, 2007 at 11:06:35AM -0800, Josh Triplett wrote:
>> Paul E. McKenney wrote:
>>> On Thu, Jan 25, 2007 at 12:47:04AM -0800, Josh Triplett wrote:
>>>> One major item: this new test feature really needs a new module parameter to
>>>> enable or disable it.
>>> CONFIG_PREEMPT_RCU_BOOST is the parameter -- if not set, then no test.
>>> This parameter is provided by the accompanying RCU-boost patch.
>> It seems useful for rcutorture to use or not use the preempting thread
>> independently of CONFIG_PREEMPT_RCU_BOOST. That would bring you from two
>> cases to four, and the two new cases both make sense:
>>
>> * CONFIG_PREEMPT_RCU_BOOST=n, but run rcutorture with the preempting thread.
>> This configuration allows you to demonstrate the need for
>> CONFIG_PREEMPT_RCU_BOOST, by showing what happens when you need it and don't
>> have it.
>>
>> * CONFIG_PREEMPT_RCU_BOOST=y, but run rcutorture without the preempting
>> thread. This configuration allows you to test with rcutorture while running
>> a *real* real-time workload rather than the simple preempting thread, or
>> just test basic RCU functionality.
>>
>> A simple boolean module_param would work here.
>
> OK, am finally with you. See below for updated patch.

Looks good to me.

> Signed-off-by: Paul E. McKenney <[email protected]>

Signed-off-by: Josh Triplett <[email protected]>

- Josh Triplett