2002-02-05 15:46:30

by Dipankar Sarma

Subject: [PATCH] New Read-Copy Update patch

Here is a "new" Read-Copy update patch that tries to address
many of the concerns with the previous RCU patches. The
details of RCU are documented as usual at
http://lse.sourceforge.net/locking/rcupdate.html.

Main features of rcu-2.5.3.patch -

1. Unlike my previous patch based on the old DYNIX/ptx algorithm,
this does not have any code in arch-dependent directories. This
should make Andrea happy ;-)
2. No overhead in fastpath other than a per-cpu counter increment
during context switch.
3. RCU callbacks maintained in per-cpu lists, so global locking
needs to be used only once in every quiescent cycle, not
for queueing RCU callbacks.
4. No changes to scheduler code.
5. No RCU, no overhead other than the context switch counter increment.
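
For reviewers unfamiliar with the interface, here is a minimal usage
sketch. It is not part of the patch - struct foo and its helpers are
made-up names - and it only illustrates how a writer would use the
call_rcu() and synchronize_kernel() primitives declared in rcupdate.h
below:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo {
	struct list_head list;	/* linkage in a reader-visible list */
	struct rcu_head rcu;	/* bookkeeping for the deferred free */
	int data;
};

/* Runs once all CPUs have passed through a quiescent state. */
static void foo_free_rcu(void *arg)
{
	kfree(arg);
}

/*
 * Writer side (update-side locking omitted): unlink the element,
 * then defer the kfree() until readers can no longer see it.
 */
static void foo_delete(struct foo *p)
{
	list_del(&p->list);
	call_rcu(&p->rcu, foo_free_rcu, p);
}

/* A writer that may sleep can instead block for the grace period. */
static void foo_delete_sync(struct foo *p)
{
	list_del(&p->list);
	synchronize_kernel();
	kfree(p);
}

Since the callbacks are invoked from the RCU tasklet (or krcud),
foo_free_rcu() must not sleep; synchronize_kernel(), on the other hand,
may only be used from process context.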

Future cleanups todo -

1. When Rusty's per-cpu data area patch is included in Linus' tree,
this patch can be cleaned up significantly - delete rcudata.h
and make all the per-cpu data static except qsctr. Can we have
Rusty's patch in soon, please?
2. We probably shouldn't care about feature #5 above. If we ever
use RCU, why bother? This simplifies the patch significantly
by getting rid of the rcu_active_cpumask logic.
3. Per-cpu timer support? This would allow us to get rid of the krcud
stuff and make RCU even simpler.
4. Ingo will provide task_idle(task_t *p) in the next version
of the O(1) scheduler, so my hack goes away.

Thanks
Dipankar
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

rcu-2.5.3.patch
---------------

diff -urN linux-2.5.3/include/linux/rcudata.h linux-2.5.3-rcu/include/linux/rcudata.h
--- linux-2.5.3/include/linux/rcudata.h Thu Jan 1 05:30:00 1970
+++ linux-2.5.3-rcu/include/linux/rcudata.h Mon Feb 4 11:59:28 2002
@@ -0,0 +1,109 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) International Business Machines Corp., 2001
+ *
+ * Author: Dipankar Sarma <[email protected]>
+ *
+ * Based on the original work by Paul McKenney <[email protected]>
+ * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
+ *
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+
+#ifndef __LINUX_RCUDATA_H
+#define __LINUX_RCUDATA_H
+
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+
+/* Batch counter type. */
+typedef long rcu_batch_t;
+
+/*
+ * RCU_BATCH_XX(rcu_batch_t a, rcu_batch_t b)
+ *
+ * Returns true if batch number ``a'' compares as specified to ``b''.
+ * This comparison allows for the fact that the batch numbers wrap.
+ */
+#define RCU_BATCH_EQ(a, b) ((a) - (b) == 0)
+#define RCU_BATCH_GE(a, b) ((a) - (b) >= 0)
+#define RCU_BATCH_GT(a, b) ((a) - (b) > 0)
+#define RCU_BATCH_LE(a, b) ((a) - (b) <= 0)
+#define RCU_BATCH_LT(a, b) ((a) - (b) < 0)
+#define RCU_BATCH_NE(a, b) ((a) - (b) != 0)
+
+/*
+ * Per-CPU data for Read-Copy Update.
+ * We maintain callbacks in 3 per-cpu lists. Each list represents
+ * a batch and every batch is given a number that is global across
+ * all CPUs. The global current batch number (curbatch) represents
+ * the current batch of callbacks for which the quiescent cycle
+ * has started.
+ * nxtlist - new callbacks are added here
+ * curlist - current batch for which quiescent cycle started if any
+ * donelist - one or more batches that have finished waiting to be invoked
+ */
+struct rcu_data {
+ /*
+ * Per-cpu counters maintained to indicate quiescent state
+ * transitions. A quiescent state can be a context switch,
+ * the idle loop or user-mode code. A quiescent state transition
+ * indicates that the CPU no longer has kernel data references.
+ */
+ long qsctr; /* cswitch/idle loop etc. */
+
+ /*
+ * Per-CPU variables used by the write side of the read-copy update
+ * mechanism. See kernel/rcupdate.c.
+ */
+ long last_qsctr; /* value of qsctr at beginning */
+ /* of last rcu interval. */
+ rcu_batch_t batch; /* Batch # for current RCU batch */
+
+ struct list_head nxtlist;
+ struct list_head curlist;
+ struct list_head donelist;
+ struct tasklet_struct tasklet;
+ struct task_struct *task;
+ struct semaphore sema;
+} ____cacheline_aligned_in_smp;
+
+extern struct rcu_data rcu_data[NR_CPUS];
+
+#define RCU_qsctr(cpu) (rcu_data[(cpu)].qsctr)
+#define RCU_last_qsctr(cpu) (rcu_data[(cpu)].last_qsctr)
+#define RCU_batch(cpu) (rcu_data[(cpu)].batch)
+#define RCU_nxtlist(cpu) (rcu_data[(cpu)].nxtlist)
+#define RCU_curlist(cpu) (rcu_data[(cpu)].curlist)
+#define RCU_donelist(cpu) (rcu_data[(cpu)].donelist)
+#define RCU_tasklet(cpu) (rcu_data[(cpu)].tasklet)
+#define krcud_sema(cpu) (rcu_data[(cpu)].sema)
+#define krcud_task(cpu) (rcu_data[(cpu)].task)
+
+#define RCU_QSCTR_INVALID 0
+
+#endif
diff -urN linux-2.5.3/include/linux/rcupdate.h linux-2.5.3-rcu/include/linux/rcupdate.h
--- linux-2.5.3/include/linux/rcupdate.h Thu Jan 1 05:30:00 1970
+++ linux-2.5.3-rcu/include/linux/rcupdate.h Mon Feb 4 11:59:28 2002
@@ -0,0 +1,57 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) International Business Machines Corp., 2001
+ *
+ * Author: Dipankar Sarma <[email protected]>
+ *
+ * Based on the original work by Paul McKenney <[email protected]>
+ * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+
+#ifndef __LINUX_RCUPDATE_H
+#define __LINUX_RCUPDATE_H
+
+#include <linux/list.h>
+
+/*
+ * Callback structure for use with call_rcu().
+ */
+struct rcu_head {
+ struct list_head list;
+ void (*func)(void *obj);
+ void *arg;
+};
+
+#define RCU_HEAD_INIT(head) { LIST_HEAD_INIT(head.list), NULL, NULL }
+#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT(head)
+#define INIT_RCU_HEAD(ptr) do { \
+ INIT_LIST_HEAD(&(ptr)->list); (ptr)->func = NULL; (ptr)->arg = NULL; \
+} while (0)
+
+
+extern void call_rcu(struct rcu_head *head, void (*func)(void *arg), void *arg);
+extern void synchronize_kernel(void);
+
+#endif /* __LINUX_RCUPDATE_H */
diff -urN linux-2.5.3/include/linux/sched.h linux-2.5.3-rcu/include/linux/sched.h
--- linux-2.5.3/include/linux/sched.h Wed Jan 30 11:11:12 2002
+++ linux-2.5.3-rcu/include/linux/sched.h Mon Feb 4 18:49:08 2002
@@ -161,6 +161,7 @@
extern void flush_scheduled_tasks(void);
extern int start_context_thread(void);
extern int current_is_keventd(void);
+extern int task_idle(void);

struct namespace;

diff -urN linux-2.5.3/kernel/Makefile linux-2.5.3-rcu/kernel/Makefile
--- linux-2.5.3/kernel/Makefile Fri Jan 25 03:37:46 2002
+++ linux-2.5.3-rcu/kernel/Makefile Mon Feb 4 13:57:09 2002
@@ -10,12 +10,12 @@
O_TARGET := kernel.o

export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o \
- printk.o
+ printk.o rcupdate.o

obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o info.o time.o softirq.o resource.o \
sysctl.o acct.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o context.o
+ signal.o sys.o kmod.o context.o rcupdate.o

obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += ksyms.o
diff -urN linux-2.5.3/kernel/rcupdate.c linux-2.5.3-rcu/kernel/rcupdate.c
--- linux-2.5.3/kernel/rcupdate.c Thu Jan 1 05:30:00 1970
+++ linux-2.5.3-rcu/kernel/rcupdate.c Mon Feb 4 18:51:38 2002
@@ -0,0 +1,363 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) International Business Machines Corp., 2001
+ *
+ * Author: Dipankar Sarma <[email protected]>
+ *
+ * Based on the original work by Paul McKenney <[email protected]>
+ * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
+ *
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <asm/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/rcudata.h>
+
+static spinlock_t rcu_lock = SPIN_LOCK_UNLOCKED;
+static rcu_batch_t rcu_curbatch = 1; /* Current batch number */
+static rcu_batch_t rcu_maxbatch = 1; /* Max requested batch number*/
+
+/* CPUs that need to switch in order for current RCU batch to proceed */
+static unsigned long rcu_cpumask = 0;
+/* CPUs that have active RCUs */
+static unsigned long rcu_active_cpumask = 0;
+/* Global timer to nudge CPUs */
+static struct timer_list rcu_timer;
+struct rcu_data rcu_data[NR_CPUS] __cacheline_aligned;
+
+asmlinkage long sys_sched_get_priority_max(int policy);
+
+/*
+ * Register a new rcu callback. This will be invoked as soon
+ * as all CPUs have performed a context switch or been seen in the
+ * idle loop or in a user process.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(void *arg), void *arg)
+{
+ int cpu = cpu_number_map(smp_processor_id());
+ unsigned long flags;
+
+ head->func = func;
+ head->arg = arg;
+ local_irq_save(flags);
+ list_add_tail(&head->list, &RCU_nxtlist(cpu));
+ local_irq_restore(flags);
+ tasklet_schedule(&RCU_tasklet(cpu));
+}
+
+/*
+ * Invoke the completed RCU callbacks. They are expected to be in
+ * a per-cpu list.
+ */
+static inline void rcu_invoke_callbacks(struct list_head *list)
+{
+ struct list_head *entry;
+ struct rcu_head *head;
+
+ while (!list_empty(list)) {
+ entry = list->next;
+ list_del(entry);
+ head = list_entry(entry, struct rcu_head, list);
+ head->func(head->arg);
+ }
+}
+
+/*
+ * Register a new batch of callbacks with the global state maintainers.
+ * Caller must hold the rcu global lock.
+ */
+static inline void rcu_reg_batch(rcu_batch_t newbatch)
+{
+ if (RCU_BATCH_LT(rcu_maxbatch, newbatch)) {
+ rcu_maxbatch = newbatch;
+ }
+ if (RCU_BATCH_LT(rcu_maxbatch, rcu_curbatch) || (rcu_cpumask != 0)) {
+ return;
+ }
+ rcu_cpumask = cpu_online_map;
+}
+
+/*
+ * Check if the CPU has gone through a quiescent state.
+ * If so, and if it hasn't already done so in this RCU
+ * quiescent cycle, then indicate that it has done so.
+ */
+static void rcu_check_quiescent_state(void)
+{
+ int cpu = cpu_number_map(smp_processor_id());
+
+ if (!test_bit(cpu, &rcu_cpumask)) {
+ return;
+ }
+
+ /*
+ * May race with rcu per-cpu tick - in the worst case
+ * we may miss one quiescent state of that CPU. That is
+ * tolerable. So no need to disable interrupts.
+ */
+ if (RCU_last_qsctr(cpu) == RCU_QSCTR_INVALID) {
+ RCU_last_qsctr(cpu) = RCU_qsctr(cpu);
+ return;
+ }
+ if (RCU_qsctr(cpu) == RCU_last_qsctr(cpu)) {
+ return;
+ }
+
+ spin_lock(&rcu_lock);
+ if (!test_bit(cpu, &rcu_cpumask)) {
+ spin_unlock(&rcu_lock);
+ return;
+ }
+ clear_bit(cpu, &rcu_cpumask);
+ RCU_last_qsctr(cpu) = RCU_QSCTR_INVALID;
+ if (rcu_cpumask != 0) {
+ /* All CPUs haven't gone through a quiescent state */
+ spin_unlock(&rcu_lock);
+ return;
+ }
+ rcu_curbatch++;
+ rcu_reg_batch(rcu_maxbatch);
+ spin_unlock(&rcu_lock);
+}
+
+static void rcu_move_next_batch(void)
+{
+ int rcu_timer_active;
+ int cpu = cpu_number_map(smp_processor_id());
+
+ local_irq_disable();
+ if (!list_empty(&RCU_nxtlist(cpu)) && list_empty(&RCU_curlist(cpu))) {
+ list_splice(&RCU_nxtlist(cpu), &RCU_curlist(cpu));
+ INIT_LIST_HEAD(&RCU_nxtlist(cpu));
+ local_irq_enable();
+
+ /*
+ * start the next batch of callbacks
+ */
+ spin_lock(&rcu_lock);
+ rcu_timer_active = (rcu_active_cpumask != 0);
+ if (!test_bit(cpu, &rcu_active_cpumask))
+ set_bit(cpu, &rcu_active_cpumask);
+ if (!rcu_timer_active &&
+ !timer_pending(&rcu_timer)) {
+ rcu_timer.expires = jiffies + 5;
+ add_timer(&rcu_timer);
+ }
+ RCU_batch(cpu) = rcu_curbatch + 1;
+ rcu_reg_batch(RCU_batch(cpu));
+ spin_unlock(&rcu_lock);
+ } else {
+ local_irq_enable();
+ }
+}
+
+/*
+ * Look into the per-cpu callback information to see if there is
+ * any processing necessary - if so do it.
+ */
+static void rcu_process_callbacks(unsigned long data)
+{
+ int cpu = cpu_number_map(smp_processor_id());
+
+ if (!list_empty(&RCU_curlist(cpu)) &&
+ RCU_BATCH_GT(rcu_curbatch, RCU_batch(cpu))) {
+ list_splice(&RCU_curlist(cpu), &RCU_donelist(cpu));
+ INIT_LIST_HEAD(&RCU_curlist(cpu));
+ }
+
+ rcu_move_next_batch();
+
+ /*
+ * No race on nxtlist here with call_rcu() from irq context -
+ * call_rcu() reschedules the tasklet anyway, so if the
+ * bit in rcu_active_cpumask gets cleared, it will get set again.
+ */
+ if (list_empty(&RCU_nxtlist(cpu)) && list_empty(&RCU_curlist(cpu))) {
+ spin_lock(&rcu_lock);
+ clear_bit(cpu, &rcu_active_cpumask);
+ spin_unlock(&rcu_lock);
+ }
+ rcu_check_quiescent_state();
+
+ if (!list_empty(&RCU_donelist(cpu))) {
+ rcu_invoke_callbacks(&RCU_donelist(cpu));
+ }
+}
+
+/*
+ * Per-cpu tick that processes per-cpu queues
+ */
+static void rcu_percpu_tick_common(void)
+{
+ /* Take this opportunity to check for idle loop */
+ if (task_idle(current))
+ RCU_qsctr(cpu_logical_map(smp_processor_id()))++;
+ rcu_process_callbacks(0);
+}
+
+
+#ifdef CONFIG_SMP
+static void rcu_percpu_tick(void)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+ up(&krcud_sema(cpu));
+ }
+}
+#else
+static void rcu_percpu_tick(void)
+{
+ rcu_percpu_tick_common();
+}
+#endif
+
+/*
+ * RCU timer handler - called once every 50ms, but only if there is
+ * any RCU work pending in the system. It nudges every CPU.
+ */
+static void rcu_timer_handler(unsigned long data)
+{
+ rcu_percpu_tick();
+ spin_lock(&rcu_lock);
+ if (rcu_active_cpumask) {
+ rcu_timer.expires = jiffies + 5;
+ add_timer(&rcu_timer);
+ }
+ spin_unlock(&rcu_lock);
+
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Per-CPU RCU daemon. It runs at an absurdly high priority so
+ * that it is not starved out by the scheduler, thereby holding
+ * up RCU updates.
+ */
+static int krcud(void * __bind_cpu)
+{
+ int bind_cpu = *(int *) __bind_cpu;
+ int cpu = cpu_logical_map(bind_cpu);
+
+ daemonize();
+ current->policy = SCHED_FIFO;
+ current->rt_priority = 1001 + sys_sched_get_priority_max(SCHED_FIFO);
+
+ sigfillset(&current->blocked);
+
+ /* Migrate to the right CPU */
+ set_cpus_allowed(current, 1UL << cpu);
+
+ sprintf(current->comm, "krcud_CPU%d", bind_cpu);
+ sema_init(&krcud_sema(cpu), 0);
+
+ krcud_task(cpu) = current;
+
+ for (;;) {
+ while (down_interruptible(&krcud_sema(cpu)));
+ local_bh_disable();
+ rcu_percpu_tick_common();
+ local_bh_enable();
+ }
+}
+
+static void spawn_krcud(void)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+ if (kernel_thread(krcud, (void *) &cpu,
+ CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
+ printk(KERN_ERR "spawn_krcud() failed for cpu %d\n",
+ cpu);
+ else {
+ while (!krcud_task(cpu_logical_map(cpu))) {
+ schedule();
+ }
+ }
+ }
+}
+
+#endif
+
+/*
+ * Initializes the RCU mechanism. Assumed to be called early,
+ * that is, before the local timer (SMP) or jiffy timer (UP) is set up.
+ * Note that RCU_qsctr and friends are implicitly
+ * initialized due to the choice of ``0'' for RCU_QSCTR_INVALID.
+ */
+static __init int rcu_init(void)
+{
+ int i;
+
+ memset(&rcu_data[0], 0, sizeof(rcu_data));
+ for (i = 0; i < NR_CPUS; i++) {
+ tasklet_init(&RCU_tasklet(i), rcu_process_callbacks, 0UL);
+ INIT_LIST_HEAD(&RCU_nxtlist(i));
+ INIT_LIST_HEAD(&RCU_curlist(i));
+ INIT_LIST_HEAD(&RCU_donelist(i));
+ }
+ init_timer(&rcu_timer);
+ rcu_timer.function = rcu_timer_handler;
+#ifdef CONFIG_SMP
+ spawn_krcud();
+#endif
+ return 0;
+}
+
+/* Because complete() is declared FASTCALL, we use this wrapper */
+static void wakeme_after_rcu(void *completion)
+{
+ complete(completion);
+}
+
+/*
+ * Wait until all the CPUs have gone through a "quiescent" state.
+ */
+void synchronize_kernel(void)
+{
+ struct rcu_head rcu;
+ DECLARE_COMPLETION(completion);
+
+ /* Will wake me after RCU finished */
+ call_rcu(&rcu, wakeme_after_rcu, &completion);
+
+ /* Wait for it */
+ wait_for_completion(&completion);
+}
+
+__initcall(rcu_init);
+
+EXPORT_SYMBOL(call_rcu);
+EXPORT_SYMBOL(synchronize_kernel);
diff -urN linux-2.5.3/kernel/sched.c linux-2.5.3-rcu/kernel/sched.c
--- linux-2.5.3/kernel/sched.c Tue Jan 29 04:42:47 2002
+++ linux-2.5.3-rcu/kernel/sched.c Mon Feb 4 18:48:54 2002
@@ -18,6 +18,7 @@
#include <asm/uaccess.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
+#include <linux/rcudata.h>
#include <asm/mmu_context.h>

#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))
@@ -685,6 +686,7 @@
switch_tasks:
prefetch(next);
prev->work.need_resched = 0;
+ RCU_qsctr(prev->cpu)++;

if (likely(prev != next)) {
rq->nr_switches++;
@@ -1187,6 +1189,13 @@
return retval;
}

+int task_idle(task_t *p)
+{
+ if (task_rq(p)->idle == p)
+ return 1;
+ return 0;
+}
+
static void show_task(task_t * p)
{
unsigned long free = 0;


2002-02-05 16:55:15

by Christoph Hellwig

Subject: Re: [PATCH] New Read-Copy Update patch

In article <[email protected]> you wrote:
> 3. A per-cpu timer support ? - This will allow us to get rid of the krcud
> stuff and make RCU even simpler.

Something like http://people.redhat.com/mingo/scalable-timers-patches/smptimers-2.4.16-A0?

Ingo, Linus: Any chance to see that in 2.5 soon?

Christoph

--
Of course it doesn't work. We've performed a software upgrade.

2002-02-05 16:57:36

by Ingo Molnar

Subject: Re: [PATCH] New Read-Copy Update patch


On Tue, 5 Feb 2002, Christoph Hellwig wrote:

> > 3. A per-cpu timer support ? - This will allow us to get rid of the krcud
> > stuff and make RCU even simpler.
>
> Something like http://people.redhat.com/mingo/scalable-timers-patches/smptimers-2.4.16-A0?
>
> Ingo, Linus: Any chance to see that in 2.5 soon?

i'll post an updated smptimers patch soon after the scheduler has settled
down.

Ingo

2002-02-05 17:13:19

by Andrea Arcangeli

Subject: Re: [PATCH] New Read-Copy Update patch

On Tue, Feb 05, 2002 at 09:18:26PM +0530, Dipankar Sarma wrote:
> Here is a "new" Read-Copy update patch that tries to address
> many of the concerns with the previous RCU patches. The
> details of RCU are documented as usual at
> http://lse.sourceforge.net/locking/rcupdate.html.
>
> Main features of rcu-2.5.3.patch -
>
> 1. Unlike my previous patch based on the old DYNIX/ptx algorithm
> this does not have any code in arch-dependent directories. This
> should make Andrea happy ;-)
> 2. No overhead in fastpath other than a per-cpu counter increment
> during context switch.
> 3. RCU callbacks maintained in per-cpu lists, so global locking
> needs to be used only once in every quiescent cycle, not
> for queueing RCU callbacks.
> 4. No changes to scheduler code.
> 5. No RCU, no overhead other than the context switch counter increment.

I think you attached the wrong patch; the one below is based on the
kernel thread, which we don't need with the scheduler counter.

BTW, I'd like to point out that the counter in schedule() is likely to
come at zero cacheline cost.

Andrea

2002-02-05 18:16:18

by Dipankar Sarma

Subject: Re: [PATCH] New Read-Copy Update patch

On Tue, Feb 05, 2002 at 05:54:32PM +0100, Christoph Hellwig wrote:
> In article <[email protected]> you wrote:
> > 3. A per-cpu timer support ? - This will allow us to get rid of the krcud
> > stuff and make RCU even simpler.
>
> Something like http://people.redhat.com/mingo/scalable-timers-patches/smptimers-2.4.16-A0?

Almost. IIUC, there is still the possibility that a timer queued
on one CPU may get executed on another. While this by itself
doesn't cause a problem for this RCU implementation (we end up
checking some other CPU's queue), if one CPU is starved of timers,
it could be problematic.

Since we have many per-cpu data structures, it would be a useful thing
to have a per-cpu mechanism to manipulate these data structures
in a cache-sensitive way.

>
> Ingo, Linus: Any chance to see that in 2.5 soon?

IIRC, timerlist_lock used to show up in lockmetering in some workloads
we were using some time ago. It would be a good thing to get rid
of this global lock soon.

Thanks
Dipankar
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

2002-02-05 18:32:21

by Dipankar Sarma

Subject: Re: [PATCH] New Read-Copy Update patch

Hi Andrea,

On Tue, Feb 05, 2002 at 06:13:54PM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 05, 2002 at 09:18:26PM +0530, Dipankar Sarma wrote:
> > Main features of rcu-2.5.3.patch -
> >
> > 1. Unlike my previous patch based on the old DYNIX/ptx algorithm
> > this does not have any code in arch-dependent directories. This
> > should make Andrea happy ;-)
> > 2. No overhead in fastpath other than a per-cpu counter increment
> > during context switch.
> > 3. RCU callbacks maintained in per-cpu lists, so global locking
> > needs to be used only once in every quiescent cycle, not
> > for queueing RCU callbacks.
> > 4. No changes to scheduler code.
> > 5. No RCU, no overhead other than the context switch counter increment.
>
> I think you attached the wrong patch, the below one is based on the
> kernel thread that we don't need with the scheduler counter.

Actually that was a different patch from the earlier krcud-based
patches - in this one we don't have krcud for UP and use krcuds only to move
the per-cpu callback lists. My idea was to get it reviewed on lkml
and see if this would be acceptable.

>
> BTW, I'd like to point out that the schedule counter is likely to be at
> zero cacheline cost.

Yes. Otherwise you wouldn't have it in schedule().

I apologize for the confusion. To set the record straight, we are
experimenting with two patches - the one I mailed earlier
(rcu-2.5.3.patch) and rcu_poll, which is part of the -aa kernels.
I am including rcu_poll-2.5.2.patch along with this. BTW, the
sched.h changes in it can be avoided once we have Rusty's per-CPU
data patch.

The main differences between rcu and rcu_poll are these -

1. rcu uses per-cpu callback lists.
2. rcu_poll forces all CPUs to go through a quiescent state,
whereas rcu uses a periodic per-cpu check to see whether this
has happened.

In our measurements, we find that rcu is slightly faster than
rcu_poll. We have ongoing work to determine exactly
what adds to the overhead of different RCU implementations and
what helps performance. Our initial measurements a few months
ago showed that any delay in call_rcu() significantly affects
performance. There is more work to do here.
At the moment, however, I would go with whichever approach
is more acceptable.
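
To make the difference concrete: both patches detect a grace period by
snapshotting a per-cpu counter that is incremented in schedule() (and
by the idle/user-mode checks) and waiting for every CPU's counter to
move past its snapshot. Below is a simplified sketch of that idea, not
the actual patch code - rcu does the check in a distributed fashion
using RCU_last_qsctr(), while rcu_poll polls all CPUs from one tasklet
using rcu_quiescent_checkpoint[]:

/* Snapshots of each CPU's quiescent-state counter. */
static long snapshot[NR_CPUS];

static void start_grace_period(void)
{
	int i;

	for (i = 0; i < smp_num_cpus; i++)
		snapshot[i] = RCU_qsctr(cpu_logical_map(i));
}

static int grace_period_done(void)
{
	int i;

	for (i = 0; i < smp_num_cpus; i++)
		if (RCU_qsctr(cpu_logical_map(i)) == snapshot[i])
			return 0;	/* this CPU hasn't quiesced yet */
	return 1;
}

The two patches differ in how the counters are made to advance: rcu_poll
calls force_cpu_reschedule() on the other CPUs and re-checks from its
tasklet, while rcu simply waits for the counters to move and re-checks
from the per-cpu tick (krcud or the 50ms timer).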

Thanks
Dipankar
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.


diff -urN linux-2.5.2/include/linux/rcupdate.h linux-2.5.2-rcu_poll/include/linux/rcupdate.h
--- linux-2.5.2/include/linux/rcupdate.h Thu Jan 1 05:30:00 1970
+++ linux-2.5.2-rcu_poll/include/linux/rcupdate.h Thu Jan 31 22:00:38 2002
@@ -0,0 +1,59 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) International Business Machines Corp., 2001
+ *
+ * Author: Dipankar Sarma <[email protected]>
+ *
+ * Based on the original work by Paul McKenney <[email protected]>
+ * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+
+#ifndef __LINUX_RCUPDATE_H
+#define __LINUX_RCUPDATE_H
+
+#include <linux/list.h>
+
+/*
+ * Callback structure for use with call_rcu().
+ */
+struct rcu_head {
+ struct list_head list;
+ void (*func)(void *obj);
+ void *arg;
+};
+
+#define RCU_HEAD_INIT(head) { LIST_HEAD_INIT(head.list), NULL, NULL }
+#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT(head)
+#define INIT_RCU_HEAD(ptr) do { \
+ INIT_LIST_HEAD(&(ptr)->list); (ptr)->func = NULL; (ptr)->arg = NULL; \
+} while (0)
+
+
+extern void FASTCALL(call_rcu(struct rcu_head *head, void (*func)(void *arg), void *arg));
+extern void synchronize_kernel(void);
+
+extern void rcu_init(void);
+
+#endif /* __LINUX_RCUPDATE_H */
diff -urN linux-2.5.2/include/linux/sched.h linux-2.5.2-rcu_poll/include/linux/sched.h
--- linux-2.5.2/include/linux/sched.h Tue Jan 15 02:57:08 2002
+++ linux-2.5.2-rcu_poll/include/linux/sched.h Thu Jan 31 22:01:06 2002
@@ -155,6 +155,7 @@
extern void flush_scheduled_tasks(void);
extern int start_context_thread(void);
extern int current_is_keventd(void);
+extern void force_cpu_reschedule(int cpu);

/*
* The default fd array needs to be at least BITS_PER_LONG,
@@ -566,6 +567,39 @@
extern struct mm_struct init_mm;
extern struct task_struct *init_tasks[NR_CPUS];

+#define BITMAP_SIZE ((MAX_PRIO+7)/8)
+#define PRIO_INTERACTIVE (MAX_RT_PRIO + (MAX_PRIO - MAX_RT_PRIO) / 4)
+#define TASK_INTERACTIVE(p) ((p)->prio >= MAX_RT_PRIO && (p)->prio <= PRIO_INTERACTIVE)
+#define JSLEEP_TO_PRIO(t) (((t) * 20) / HZ)
+
+typedef struct runqueue runqueue_t;
+
+struct prio_array {
+ int nr_active;
+ spinlock_t *lock;
+ runqueue_t *rq;
+ char bitmap[BITMAP_SIZE];
+ list_t queue[MAX_PRIO];
+};
+
+struct runqueue {
+ int cpu;
+ spinlock_t lock;
+ unsigned long nr_running, nr_switches;
+ task_t *curr, *idle;
+ prio_array_t *active, *expired, arrays[2];
+ long quiescent;
+} ____cacheline_aligned_in_smp;
+
+extern struct runqueue runqueues[];
+
+#define this_rq() (runqueues + smp_processor_id())
+#define task_rq(p) (runqueues + (p)->cpu)
+#define cpu_rq(cpu) (runqueues + (cpu))
+#define cpu_curr(cpu) (runqueues[(cpu)].curr)
+#define rt_task(p) ((p)->policy != SCHED_OTHER)
+#define RCU_quiescent(cpu) ((cpu_rq((cpu)))->quiescent)
+
/* PID hashing. (shouldnt this be dynamic?) */
#define PIDHASH_SZ (4096 >> 2)
extern struct task_struct *pidhash[PIDHASH_SZ];
diff -urN linux-2.5.2/init/main.c linux-2.5.2-rcu_poll/init/main.c
--- linux-2.5.2/init/main.c Wed Jan 9 23:15:17 2002
+++ linux-2.5.2-rcu_poll/init/main.c Tue Jan 29 21:29:22 2002
@@ -27,6 +27,7 @@
#include <linux/iobuf.h>
#include <linux/bootmem.h>
#include <linux/tty.h>
+#include <linux/rcupdate.h>

#include <asm/io.h>
#include <asm/bugs.h>
@@ -352,6 +353,7 @@
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();
+ rcu_init();
init_IRQ();
sched_init();
softirq_init();
diff -urN linux-2.5.2/kernel/Makefile linux-2.5.2-rcu_poll/kernel/Makefile
--- linux-2.5.2/kernel/Makefile Fri Nov 30 06:23:45 2001
+++ linux-2.5.2-rcu_poll/kernel/Makefile Thu Jan 31 21:53:22 2002
@@ -10,12 +10,12 @@
O_TARGET := kernel.o

export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o \
- printk.o device.o
+ printk.o device.o rcupdate.o

obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o info.o time.o softirq.o resource.o \
sysctl.o acct.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o context.o device.o
+ signal.o sys.o kmod.o context.o device.o rcupdate.o

obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += ksyms.o
diff -urN linux-2.5.2/kernel/rcupdate.c linux-2.5.2-rcu_poll/kernel/rcupdate.c
--- linux-2.5.2/kernel/rcupdate.c Thu Jan 1 05:30:00 1970
+++ linux-2.5.2-rcu_poll/kernel/rcupdate.c Thu Jan 31 21:38:46 2002
@@ -0,0 +1,212 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) International Business Machines Corp., 2001
+ * Copyright (C) Andrea Arcangeli <[email protected]> SuSE, 2001
+ *
+ * Author: Dipankar Sarma <[email protected]>,
+ * Andrea Arcangeli <[email protected]>
+ *
+ * Based on the original work by Paul McKenney <[email protected]>
+ * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/rcupdate.h>
+
+/* Definitions for the rcupdate control block. */
+static spinlock_t rcu_lock;
+static struct list_head rcu_nxtlist;
+static struct list_head rcu_curlist;
+static struct tasklet_struct rcu_tasklet;
+static unsigned long rcu_qsmask;
+static int rcu_polling_in_progress;
+static long rcu_quiescent_checkpoint[NR_CPUS];
+
+/*
+ * Register a new rcu callback. This will be invoked as soon
+ * as all CPUs have performed a context switch or been seen in the
+ * idle loop or in a user process.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(void *arg), void *arg)
+{
+ head->func = func;
+ head->arg = arg;
+
+ spin_lock_bh(&rcu_lock);
+ list_add(&head->list, &rcu_nxtlist);
+ spin_unlock_bh(&rcu_lock);
+
+ tasklet_hi_schedule(&rcu_tasklet);
+}
+
+static int rcu_prepare_polling(void)
+{
+ int stop;
+ int i;
+
+#ifdef DEBUG
+ if (!list_empty(&rcu_curlist))
+ BUG();
+#endif
+
+ stop = 1;
+ if (!list_empty(&rcu_nxtlist)) {
+ list_splice(&rcu_nxtlist, &rcu_curlist);
+ INIT_LIST_HEAD(&rcu_nxtlist);
+
+ rcu_polling_in_progress = 1;
+
+ for (i = 0; i < smp_num_cpus; i++) {
+ int cpu = cpu_logical_map(i);
+
+ if (cpu != smp_processor_id()) {
+ rcu_qsmask |= 1UL << cpu;
+ rcu_quiescent_checkpoint[cpu] = RCU_quiescent(cpu);
+ force_cpu_reschedule(cpu);
+ }
+ }
+ stop = 0;
+ }
+
+ return stop;
+}
+
+/*
+ * Invoke the completed RCU callbacks.
+ */
+static void rcu_invoke_callbacks(void)
+{
+ struct list_head *entry;
+ struct rcu_head *head;
+
+#ifdef DEBUG
+ if (list_empty(&rcu_curlist))
+ BUG();
+#endif
+
+ entry = rcu_curlist.prev;
+ do {
+ head = list_entry(entry, struct rcu_head, list);
+ entry = entry->prev;
+
+ head->func(head->arg);
+ } while (entry != &rcu_curlist);
+
+ INIT_LIST_HEAD(&rcu_curlist);
+}
+
+static int rcu_completion(void)
+{
+ int stop;
+
+ rcu_polling_in_progress = 0;
+ rcu_invoke_callbacks();
+
+ stop = rcu_prepare_polling();
+
+ return stop;
+}
+
+static int rcu_polling(void)
+{
+ int i;
+ int stop;
+
+ for (i = 0; i < smp_num_cpus; i++) {
+ int cpu = cpu_logical_map(i);
+
+ if (rcu_qsmask & (1UL << cpu))
+ if (rcu_quiescent_checkpoint[cpu] != RCU_quiescent(cpu))
+ rcu_qsmask &= ~(1UL << cpu);
+ }
+
+ stop = 0;
+ if (!rcu_qsmask)
+ stop = rcu_completion();
+
+ return stop;
+}
+
+/*
+ * Look at the callback lists to see if there is
+ * any processing necessary - if so, do it.
+ */
+static void rcu_process_callbacks(unsigned long data)
+{
+ int stop;
+
+ spin_lock(&rcu_lock);
+ if (!rcu_polling_in_progress)
+ stop = rcu_prepare_polling();
+ else
+ stop = rcu_polling();
+ spin_unlock(&rcu_lock);
+
+ if (!stop)
+ tasklet_hi_schedule(&rcu_tasklet);
+}
+
+/* Because complete() is declared FASTCALL, we use this wrapper */
+static void wakeme_after_rcu(void *completion)
+{
+ complete(completion);
+}
+
+/*
+ * Initializes the RCU mechanism. Assumed to be called early,
+ * that is, before the local timer (SMP) or jiffy timer (UP) is set up.
+ */
+void __init rcu_init(void)
+{
+ tasklet_init(&rcu_tasklet, rcu_process_callbacks, 0UL);
+ INIT_LIST_HEAD(&rcu_nxtlist);
+ INIT_LIST_HEAD(&rcu_curlist);
+ spin_lock_init(&rcu_lock);
+}
+
+/*
+ * Wait until all the CPUs have gone through a "quiescent" state.
+ */
+void synchronize_kernel(void)
+{
+ struct rcu_head rcu;
+ DECLARE_COMPLETION(completion);
+
+ /* Will wake me after RCU finished */
+ call_rcu(&rcu, wakeme_after_rcu, &completion);
+
+ /* Wait for it */
+ wait_for_completion(&completion);
+}
+
+EXPORT_SYMBOL(call_rcu);
+EXPORT_SYMBOL(synchronize_kernel);
diff -urN linux-2.5.2/kernel/sched.c linux-2.5.2-rcu_poll/kernel/sched.c
--- linux-2.5.2/kernel/sched.c Tue Jan 15 01:09:31 2002
+++ linux-2.5.2-rcu_poll/kernel/sched.c Fri Feb 1 17:17:39 2002
@@ -20,21 +20,6 @@
#include <linux/interrupt.h>
#include <asm/mmu_context.h>

-#define BITMAP_SIZE ((MAX_PRIO+7)/8)
-#define PRIO_INTERACTIVE (MAX_RT_PRIO + (MAX_PRIO - MAX_RT_PRIO) / 4)
-#define TASK_INTERACTIVE(p) ((p)->prio >= MAX_RT_PRIO && (p)->prio <= PRIO_INTERACTIVE)
-#define JSLEEP_TO_PRIO(t) (((t) * 20) / HZ)
-
-typedef struct runqueue runqueue_t;
-
-struct prio_array {
- int nr_active;
- spinlock_t *lock;
- runqueue_t *rq;
- char bitmap[BITMAP_SIZE];
- list_t queue[MAX_PRIO];
-};
-
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -46,20 +31,7 @@
* if there is a RT task active in an SMP system but there is no
* RT scheduling activity otherwise.
*/
-static struct runqueue {
- int cpu;
- spinlock_t lock;
- unsigned long nr_running, nr_switches;
- task_t *curr, *idle;
- prio_array_t *active, *expired, arrays[2];
- char __pad [SMP_CACHE_BYTES];
-} runqueues [NR_CPUS] __cacheline_aligned;
-
-#define this_rq() (runqueues + smp_processor_id())
-#define task_rq(p) (runqueues + (p)->cpu)
-#define cpu_rq(cpu) (runqueues + (cpu))
-#define cpu_curr(cpu) (runqueues[(cpu)].curr)
-#define rt_task(p) ((p)->policy != SCHED_OTHER)
+struct runqueue runqueues [NR_CPUS] __cacheline_aligned;

#define lock_task_rq(rq,p,flags) \
do { \
@@ -515,6 +487,7 @@
switch_tasks:
prev->need_resched = 0;

+ RCU_quiescent(prev->cpu)++;
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
@@ -765,6 +738,20 @@
unlock_task_rq(rq, p, flags);
}

+void force_cpu_reschedule(int cpu)
+{
+ unsigned long flags;
+ struct runqueue *rq, *newrq;
+ struct task_struct *p;
+
+ rq = cpu_rq(cpu);
+ p = rq->curr;
+ newrq = lock_task_rq(p, &flags);
+ if (newrq == rq)
+ resched_task(p);
+ unlock_task_rq(newrq, &flags);
+}
+
#ifndef __alpha__

/*

2002-02-05 18:38:55

by Dipankar Sarma

Subject: Re: [PATCH] New Read-Copy Update patch

On Tue, Feb 05, 2002 at 11:57:07AM -0500, Ingo Molnar wrote:
>
> On Tue, 5 Feb 2002, Christoph Hellwig wrote:
>
> > > 3. A per-cpu timer support ? - This will allow us to get rid of the krcud
> > > stuff and make RCU even simpler.
> >
> > Something like http://people.redhat.com/mingo/scalable-timers-patches/smptimers-2.4.16-A0?
> >
> > Ingo, Linus: Any chance to see that in 2.5 soon?
>
> i'll post an updated smptimers patch soon after the scheduler has settled
> down.

Hi Ingo,

Unless you have new code that updates the patch, I can take a stab
at porting it to 2.5. Let me know if that is ok.

Thanks
Dipankar
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

2002-02-06 08:40:09

by Daniel Phillips

Subject: Re: [PATCH] New Read-Copy Update patch

On February 5, 2002 07:34 pm, Dipankar Sarma wrote:
>
> + * Based on the original work by Paul McKenney <[email protected]>
> + * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
> + * Papers:
> + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf

$ wget http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
--09:40:31-- http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
=> `rclockpdcsproof.pdf'
Connecting to www.rdrop.com:80... connected!
HTTP request sent, awaiting response... 403 Forbidden
09:40:32 ERROR 403: Forbidden.

> + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)

$ wget http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf
--09:41:59-- http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf
=> `rclock_OLS.2001.05.01c.sc.pdf'
Connecting to lse.sourceforge.net:80... connected!
HTTP request sent, awaiting response... 404 Not Found
09:41:59 ERROR 404: Not Found.

--
Daniel

2002-02-06 08:51:31

by Dipankar Sarma

Subject: Re: [PATCH] New Read-Copy Update patch

On Wed, Feb 06, 2002 at 09:44:33AM +0100, Daniel Phillips wrote:
> On February 5, 2002 07:34 pm, Dipankar Sarma wrote:
> >
> > + * Based on the original work by Paul McKenney <[email protected]>
> > + * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
> > + * Papers:
> > + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
>
> $ wget http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
> --09:40:31-- http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
> => `rclockpdcsproof.pdf'
> Connecting to http://www.rdrop.com:80... connected!
> HTTP request sent, awaiting response... 403 Forbidden
> 09:40:32 ERROR 403: Forbidden.
>
> > + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
>
> $ wget http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf
> --09:41:59-- http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf
> => `rclock_OLS.2001.05.01c.sc.pdf'
> Connecting to lse.sourceforge.net:80... connected!
> HTTP request sent, awaiting response... 404 Not Found
> 09:41:59 ERROR 404: Not Found.
>
> --
> Daniel

Could you try this for the original paper -

http://www.rdrop.com/users/paulmck/rclock/rclockpdcsproof.pdf

Looks like Paul has rearranged his paper site.

And I messed up while rearranging the stuff in LSE. I have now restored
the link to the OLS paper. Could you please retry?

Sorry about the inconvenience.

Thanks
Dipankar
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

2002-02-06 08:55:01

by Daniel Phillips

Subject: Re: [PATCH] New Read-Copy Update patch

On February 6, 2002 09:53 am, Dipankar Sarma wrote:
> On Wed, Feb 06, 2002 at 09:44:33AM +0100, Daniel Phillips wrote:
> > On February 5, 2002 07:34 pm, Dipankar Sarma wrote:
> > >
> > > + * Based on the original work by Paul McKenney <[email protected]>
> > > + * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
> > > + * Papers:
> > > + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
> >
> > $ wget http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
> > --09:40:31-- http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
> > => `rclockpdcsproof.pdf'
> > Connecting to http://www.rdrop.com:80... connected!
> > HTTP request sent, awaiting response... 403 Forbidden
> > 09:40:32 ERROR 403: Forbidden.
> >
> > > + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
> >
> > $ wget http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf
> > --09:41:59-- http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf
> > => `rclock_OLS.2001.05.01c.sc.pdf'
> > Connecting to lse.sourceforge.net:80... connected!
> > HTTP request sent, awaiting response... 404 Not Found
> > 09:41:59 ERROR 404: Not Found.
> >
> > --
> > Daniel
>
> Could you try this for the original paper -
>
> http://www.rdrop.com/users/paulmck/rclock/rclockpdcsproof.pdf
>
> Looks like Paul has rearranged his paper site.
>
> And I messed up while rearranging the stuff in LSE. I have now restored
> the link to the OLS paper. Could you please retry ?

Both work fine.

> Sorry about the inconvenience.

No problem, thanks for the fixes!

--
Daniel

2002-02-06 11:46:23

by Dipankar Sarma

Subject: Re: [PATCH] New Read-Copy Update patch [fixed]

On Tue, Feb 05, 2002 at 09:18:26PM +0530, Dipankar Sarma wrote:
> Here is a "new" Read-Copy update patch that tries to address
> many of the concerns with the previous RCU patches. The
> details of RCU are documented as usual at
> http://lse.sourceforge.net/locking/rcupdate.html.
>
> Main features of rcu-2.5.3.patch -
>
> 1. Unlike my previous patch based on the old DYNIX/ptx algorithm
> this does not have any code in arch-dependent directories. This
> should make Andrea happy ;-)
> 2. No overhead in fastpath other than a per-cpu counter increment
> during context switch.
> 3. RCU callbacks maintained in per-cpu lists, so global locking
> needs to be used only once in every quiescent cycle, not
> for queueing RCU callbacks.
> 4. No changes to scheduler code.
> 5. No RCU, no overhead other than the context switch counter increment.
>
> Future cleanups todo -
>
> 1. When Rusty's per-cpu data area patch is included in linus tree,
> this patch can be cleaned up signficantly - delete rcudata.h
> and make all the per-cpu data static except qsctr. Can we have
> Rusty's patch in soon please ?
> 2. We probably shouldn't care about feature #5 above. If we ever
> use RCU, why bother ? This simplifies the patch signficantly
> by getting rid of rcu_active_cpumask logic.
> 3. A per-cpu timer support ? - This will allow us to get rid of the krcud
> stuff and make RCU even simpler.
> 4. Ingo will provide task_idle(task_t *p) in the next version
> of O(1) schduler, so my hack goes away.
>

Last-minute changes left a broken prototype for task_idle() in
sched.h. It should be task_idle(task_t *). Here is the fixed patch.

Thanks
Dipankar
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

rcu-2.5.3.patch
---------------

diff -urN linux-2.5.3/include/linux/rcudata.h linux-2.5.3-rcu/include/linux/rcudata.h
--- linux-2.5.3/include/linux/rcudata.h Thu Jan 1 05:30:00 1970
+++ linux-2.5.3-rcu/include/linux/rcudata.h Mon Feb 4 11:59:28 2002
@@ -0,0 +1,109 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) International Business Machines Corp., 2001
+ *
+ * Author: Dipankar Sarma <[email protected]>
+ *
+ * Based on the original work by Paul McKenney <[email protected]>
+ * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
+ *
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+
+#ifndef __LINUX_RCUDATA_H
+#define __LINUX_RCUDATA_H
+
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+
+/* Batch counter type. */
+typedef long rcu_batch_t;
+
+/*
+ * RCU_BATCH_XX(rcu_batch_t a, rcu_batch_t b)
+ *
+ * Returns true if batch number ``a'' compares as specified to ``b''.
+ * This comparison allows for the fact that the batch numbers wrap.
+ */
+#define RCU_BATCH_EQ(a, b) ((a) - (b) == 0)
+#define RCU_BATCH_GE(a, b) ((a) - (b) >= 0)
+#define RCU_BATCH_GT(a, b) ((a) - (b) > 0)
+#define RCU_BATCH_LE(a, b) ((a) - (b) <= 0)
+#define RCU_BATCH_LT(a, b) ((a) - (b) < 0)
+#define RCU_BATCH_NE(a, b) ((a) - (b) != 0)
+
+/*
+ * Per-CPU data for Read-Copy Update.
+ * We maintain callbacks in 3 per-cpu lists. Each list represents
+ * a batch and every batch is given a number that is global across
+ * all CPUs. The global current batch number (curbatch) represents
+ * the current batch of callbacks for which the quiescent cycle
+ * has started.
+ * nxtlist - new callbacks are added here
+ * curlist - current batch for which quiescent cycle started if any
+ * donelist - one or more batches that have finished waiting to be invoked
+ */
+struct rcu_data {
+ /*
+ * Per-cpu counters maintained to indicate quiescent state
+ * transitions. A quiescent state can be a context switch,
+ * the idle loop or user-mode code. A quiescent state transition
+ * indicates that the CPU no longer has kernel data references.
+ */
+ long qsctr; /* cswitch/idle loop etc. */
+
+ /*
+ * Per-CPU variables used by the write side of the read-copy update
+ * mechanism. See kernel/rcupdate.c.
+ */
+ long last_qsctr; /* value of qsctr at beginning */
+ /* of last rcu interval. */
+ rcu_batch_t batch; /* Batch # for current RCU batch */
+
+ struct list_head nxtlist;
+ struct list_head curlist;
+ struct list_head donelist;
+ struct tasklet_struct tasklet;
+ struct task_struct *task;
+ struct semaphore sema;
+} ____cacheline_aligned_in_smp;
+
+extern struct rcu_data rcu_data[NR_CPUS];
+
+#define RCU_qsctr(cpu) (rcu_data[(cpu)].qsctr)
+#define RCU_last_qsctr(cpu) (rcu_data[(cpu)].last_qsctr)
+#define RCU_batch(cpu) (rcu_data[(cpu)].batch)
+#define RCU_nxtlist(cpu) (rcu_data[(cpu)].nxtlist)
+#define RCU_curlist(cpu) (rcu_data[(cpu)].curlist)
+#define RCU_donelist(cpu) (rcu_data[(cpu)].donelist)
+#define RCU_tasklet(cpu) (rcu_data[(cpu)].tasklet)
+#define krcud_sema(cpu) (rcu_data[(cpu)].sema)
+#define krcud_task(cpu) (rcu_data[(cpu)].task)
+
+#define RCU_QSCTR_INVALID 0
+
+#endif
diff -urN linux-2.5.3/include/linux/rcupdate.h linux-2.5.3-rcu/include/linux/rcupdate.h
--- linux-2.5.3/include/linux/rcupdate.h Thu Jan 1 05:30:00 1970
+++ linux-2.5.3-rcu/include/linux/rcupdate.h Mon Feb 4 11:59:28 2002
@@ -0,0 +1,57 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) International Business Machines Corp., 2001
+ *
+ * Author: Dipankar Sarma <[email protected]>
+ *
+ * Based on the original work by Paul McKenney <[email protected]>
+ * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+
+#ifndef __LINUX_RCUPDATE_H
+#define __LINUX_RCUPDATE_H
+
+#include <linux/list.h>
+
+/*
+ * Callback structure for use with call_rcu().
+ */
+struct rcu_head {
+ struct list_head list;
+ void (*func)(void *obj);
+ void *arg;
+};
+
+#define RCU_HEAD_INIT(head) { LIST_HEAD_INIT(head.list), NULL, NULL }
+#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT(head)
+#define INIT_RCU_HEAD(ptr) do { \
+ INIT_LIST_HEAD(&(ptr)->list); (ptr)->func = NULL; (ptr)->arg = NULL; \
+} while (0)
+
+
+extern void call_rcu(struct rcu_head *head, void (*func)(void *arg), void *arg);
+extern void synchronize_kernel(void);
+
+#endif /* __LINUX_RCUPDATE_H */
diff -urN linux-2.5.3/include/linux/sched.h linux-2.5.3-rcu/include/linux/sched.h
--- linux-2.5.3/include/linux/sched.h Wed Jan 30 11:11:12 2002
+++ linux-2.5.3-rcu/include/linux/sched.h Mon Feb 4 18:49:08 2002
@@ -161,6 +161,7 @@
extern void flush_scheduled_tasks(void);
extern int start_context_thread(void);
extern int current_is_keventd(void);
+extern int task_idle(task_t *p);

struct namespace;

diff -urN linux-2.5.3/kernel/Makefile linux-2.5.3-rcu/kernel/Makefile
--- linux-2.5.3/kernel/Makefile Fri Jan 25 03:37:46 2002
+++ linux-2.5.3-rcu/kernel/Makefile Mon Feb 4 13:57:09 2002
@@ -10,12 +10,12 @@
O_TARGET := kernel.o

export-objs = signal.o sys.o kmod.o context.o ksyms.o pm.o exec_domain.o \
- printk.o
+ printk.o rcupdate.o

obj-y = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o info.o time.o softirq.o resource.o \
sysctl.o acct.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o context.o
+ signal.o sys.o kmod.o context.o rcupdate.o

obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += ksyms.o
diff -urN linux-2.5.3/kernel/rcupdate.c linux-2.5.3-rcu/kernel/rcupdate.c
--- linux-2.5.3/kernel/rcupdate.c Thu Jan 1 05:30:00 1970
+++ linux-2.5.3-rcu/kernel/rcupdate.c Mon Feb 4 18:51:38 2002
@@ -0,0 +1,363 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) International Business Machines Corp., 2001
+ *
+ * Author: Dipankar Sarma <[email protected]>
+ *
+ * Based on the original work by Paul McKenney <[email protected]>
+ * and inputs from Andrea Arcangeli, Rusty Russell, Andi Kleen etc.
+ *
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * http://lse.sourceforge.net/locking/rcupdate.html
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <asm/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/rcudata.h>
+
+static spinlock_t rcu_lock = SPIN_LOCK_UNLOCKED;
+static rcu_batch_t rcu_curbatch = 1; /* Current batch number */
+static rcu_batch_t rcu_maxbatch = 1; /* Max requested batch number*/
+
+/* CPUs that need to switch in order for current RCU batch to proceed */
+static unsigned long rcu_cpumask = 0;
+/* CPUs that have active RCUs */
+static unsigned long rcu_active_cpumask = 0;
+/* Global timer to nudge CPUs */
+static struct timer_list rcu_timer;
+struct rcu_data rcu_data[NR_CPUS] __cacheline_aligned;
+
+asmlinkage long sys_sched_get_priority_max(int policy);
+
+/*
+ * Register a new rcu callback. This will be invoked as soon
+ * as all CPUs have performed a context switch or been seen in the
+ * idle loop or in a user process.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(void *arg), void *arg)
+{
+ int cpu = cpu_number_map(smp_processor_id());
+ unsigned long flags;
+
+ head->func = func;
+ head->arg = arg;
+ local_irq_save(flags);
+ list_add_tail(&head->list, &RCU_nxtlist(cpu));
+ local_irq_restore(flags);
+ tasklet_schedule(&RCU_tasklet(cpu));
+}
+
+/*
+ * Invoke the completed RCU callbacks. They are expected to be in
+ * a per-cpu list.
+ */
+static inline void rcu_invoke_callbacks(struct list_head *list)
+{
+ struct list_head *entry;
+ struct rcu_head *head;
+
+ while (!list_empty(list)) {
+ entry = list->next;
+ list_del(entry);
+ head = list_entry(entry, struct rcu_head, list);
+ head->func(head->arg);
+ }
+}
+
+/*
+ * Register a new batch of callbacks with the global state maintainers.
+ * Caller must hold the rcu global lock.
+ */
+static inline void rcu_reg_batch(rcu_batch_t newbatch)
+{
+ if (RCU_BATCH_LT(rcu_maxbatch, newbatch)) {
+ rcu_maxbatch = newbatch;
+ }
+ if (RCU_BATCH_LT(rcu_maxbatch, rcu_curbatch) || (rcu_cpumask != 0)) {
+ return;
+ }
+ rcu_cpumask = cpu_online_map;
+}
+
+/*
+ * Check if the CPU has gone through a quiescent state.
+ * If so, and if it hasn't already done so in this RCU
+ * quiescent cycle, then indicate that it has done so.
+ */
+static void rcu_check_quiescent_state(void)
+{
+ int cpu = cpu_number_map(smp_processor_id());
+
+ if (!test_bit(cpu, &rcu_cpumask)) {
+ return;
+ }
+
+ /*
+ * May race with rcu per-cpu tick - in the worst case
+ * we may miss one quiescent state of that CPU. That is
+ * tolerable. So no need to disable interrupts.
+ */
+ if (RCU_last_qsctr(cpu) == RCU_QSCTR_INVALID) {
+ RCU_last_qsctr(cpu) = RCU_qsctr(cpu);
+ return;
+ }
+ if (RCU_qsctr(cpu) == RCU_last_qsctr(cpu)) {
+ return;
+ }
+
+ spin_lock(&rcu_lock);
+ if (!test_bit(cpu, &rcu_cpumask)) {
+ spin_unlock(&rcu_lock);
+ return;
+ }
+ clear_bit(cpu, &rcu_cpumask);
+ RCU_last_qsctr(cpu) = RCU_QSCTR_INVALID;
+ if (rcu_cpumask != 0) {
+ /* All CPUs haven't gone through a quiescent state */
+ spin_unlock(&rcu_lock);
+ return;
+ }
+ rcu_curbatch++;
+ rcu_reg_batch(rcu_maxbatch);
+ spin_unlock(&rcu_lock);
+}
+
+static void rcu_move_next_batch(void)
+{
+ int rcu_timer_active;
+ int cpu = cpu_number_map(smp_processor_id());
+
+ local_irq_disable();
+ if (!list_empty(&RCU_nxtlist(cpu)) && list_empty(&RCU_curlist(cpu))) {
+ list_splice(&RCU_nxtlist(cpu), &RCU_curlist(cpu));
+ INIT_LIST_HEAD(&RCU_nxtlist(cpu));
+ local_irq_enable();
+
+ /*
+ * start the next batch of callbacks
+ */
+ spin_lock(&rcu_lock);
+ rcu_timer_active = (rcu_active_cpumask != 0);
+ if (!test_bit(cpu, &rcu_active_cpumask))
+ set_bit(cpu, &rcu_active_cpumask);
+ if (!rcu_timer_active &&
+ !timer_pending(&rcu_timer)) {
+ rcu_timer.expires = jiffies + 5;
+ add_timer(&rcu_timer);
+ }
+ RCU_batch(cpu) = rcu_curbatch + 1;
+ rcu_reg_batch(RCU_batch(cpu));
+ spin_unlock(&rcu_lock);
+ } else {
+ local_irq_enable();
+ }
+}
+
+/*
+ * Look into the per-cpu callback information to see if there is
+ * any processing necessary - if so do it.
+ */
+static void rcu_process_callbacks(unsigned long data)
+{
+ int cpu = cpu_number_map(smp_processor_id());
+
+ if (!list_empty(&RCU_curlist(cpu)) &&
+ RCU_BATCH_GT(rcu_curbatch, RCU_batch(cpu))) {
+ list_splice(&RCU_curlist(cpu), &RCU_donelist(cpu));
+ INIT_LIST_HEAD(&RCU_curlist(cpu));
+ }
+
+ rcu_move_next_batch();
+
+ /*
+ * No race on nxtlist here with call_rcu() from irq context -
+ * call_rcu() reschedules the tasklet anyway, so if the
+ * bit in rcu_active_cpumask gets cleared, it will get set again.
+ */
+ if (list_empty(&RCU_nxtlist(cpu)) && list_empty(&RCU_curlist(cpu))) {
+ spin_lock(&rcu_lock);
+ clear_bit(cpu, &rcu_active_cpumask);
+ spin_unlock(&rcu_lock);
+ }
+ rcu_check_quiescent_state();
+
+ if (!list_empty(&RCU_donelist(cpu))) {
+ rcu_invoke_callbacks(&RCU_donelist(cpu));
+ }
+}
+
+/*
+ * Per-cpu tick that processes per-cpu queues
+ */
+static void rcu_percpu_tick_common(void)
+{
+ /* Take this opportunity to check for idle loop */
+ if (task_idle(current))
+ RCU_qsctr(cpu_logical_map(smp_processor_id()))++;
+ rcu_process_callbacks(0);
+}
+
+
+#ifdef CONFIG_SMP
+static void rcu_percpu_tick(void)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+ up(&krcud_sema(cpu));
+ }
+}
+#else
+static void rcu_percpu_tick(void)
+{
+ rcu_percpu_tick_common();
+}
+#endif
+
+/*
+ * RCU timer handler - called once every 50ms, but only if there is
+ * any RCU work pending in the system. It nudges every CPU.
+ */
+static void rcu_timer_handler(unsigned long data)
+{
+ rcu_percpu_tick();
+ spin_lock(&rcu_lock);
+ if (rcu_active_cpumask) {
+ rcu_timer.expires = jiffies + 5;
+ add_timer(&rcu_timer);
+ }
+ spin_unlock(&rcu_lock);
+
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Per-CPU RCU daemon. It runs at an absurdly high priority so
+ * that it is not starved out by the scheduler, thereby holding
+ * up RCU updates.
+ */
+static int krcud(void * __bind_cpu)
+{
+ int bind_cpu = *(int *) __bind_cpu;
+ int cpu = cpu_logical_map(bind_cpu);
+
+ daemonize();
+ current->policy = SCHED_FIFO;
+ current->rt_priority = 1001 + sys_sched_get_priority_max(SCHED_FIFO);
+
+ sigfillset(&current->blocked);
+
+ /* Migrate to the right CPU */
+ set_cpus_allowed(current, 1UL << cpu);
+
+ sprintf(current->comm, "krcud_CPU%d", bind_cpu);
+ sema_init(&krcud_sema(cpu), 0);
+
+ krcud_task(cpu) = current;
+
+ for (;;) {
+ while (down_interruptible(&krcud_sema(cpu)));
+ local_bh_disable();
+ rcu_percpu_tick_common();
+ local_bh_enable();
+ }
+}
+
+static void spawn_krcud(void)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < smp_num_cpus; cpu++) {
+ if (kernel_thread(krcud, (void *) &cpu,
+ CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0)
+ printk(KERN_ERR "spawn_krcud() failed for cpu %d\n",
+ cpu);
+ else {
+ while (!krcud_task(cpu_logical_map(cpu))) {
+ schedule();
+ }
+ }
+ }
+}
+
+#endif
+
+/*
+ * Initializes the RCU mechanism. Assumed to be called early,
+ * that is, before the local timer (SMP) or jiffy timer (UP) is set up.
+ * Note that RCU_qsctr and friends are implicitly
+ * initialized due to the choice of ``0'' for RCU_QSCTR_INVALID.
+ */
+static __init int rcu_init(void)
+{
+ int i;
+
+ memset(&rcu_data[0], 0, sizeof(rcu_data));
+ for (i = 0; i < NR_CPUS; i++) {
+ tasklet_init(&RCU_tasklet(i), rcu_process_callbacks, 0UL);
+ INIT_LIST_HEAD(&RCU_nxtlist(i));
+ INIT_LIST_HEAD(&RCU_curlist(i));
+ INIT_LIST_HEAD(&RCU_donelist(i));
+ }
+ init_timer(&rcu_timer);
+ rcu_timer.function = rcu_timer_handler;
+#ifdef CONFIG_SMP
+ spawn_krcud();
+#endif
+ return 0;
+}
+
+/* Because complete() is declared FASTCALL, we use this wrapper */
+static void wakeme_after_rcu(void *completion)
+{
+ complete(completion);
+}
+
+/*
+ * Wait until all the CPUs have gone through a "quiescent" state.
+ */
+void synchronize_kernel(void)
+{
+ struct rcu_head rcu;
+ DECLARE_COMPLETION(completion);
+
+ /* Will wake me after RCU finished */
+ call_rcu(&rcu, wakeme_after_rcu, &completion);
+
+ /* Wait for it */
+ wait_for_completion(&completion);
+}
+
+__initcall(rcu_init);
+
+EXPORT_SYMBOL(call_rcu);
+EXPORT_SYMBOL(synchronize_kernel);
diff -urN linux-2.5.3/kernel/sched.c linux-2.5.3-rcu/kernel/sched.c
--- linux-2.5.3/kernel/sched.c Tue Jan 29 04:42:47 2002
+++ linux-2.5.3-rcu/kernel/sched.c Mon Feb 4 18:48:54 2002
@@ -18,6 +18,7 @@
#include <asm/uaccess.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
+#include <linux/rcudata.h>
#include <asm/mmu_context.h>

#define BITMAP_SIZE ((((MAX_PRIO+7)/8)+sizeof(long)-1)/sizeof(long))
@@ -685,6 +686,7 @@
switch_tasks:
prefetch(next);
prev->work.need_resched = 0;
+ RCU_qsctr(prev->cpu)++;

if (likely(prev != next)) {
rq->nr_switches++;
@@ -1187,6 +1189,13 @@
return retval;
}

+int task_idle(task_t *p)
+{
+ if (task_rq(p)->idle == p)
+ return 1;
+ return 0;
+}
+
static void show_task(task_t * p)
{
unsigned long free = 0;