2003-03-01 04:20:31

by Adam J. Richter

Subject: Patch/resubmit linux-2.5.63-bk4 try_module_get simplification

The following patch shrinks and changes the implementation of
try_module_get() and friends to eliminate the special stopping of all
CPUs when a module is unloaded. Instead, it uses a read/write
semaphore in a perhaps slightly non-intuitive way.

Note that because try_module_get uses the non-blocking
down_read_trylock, it is apparently safe for use in an interrupt
context. The only complaints about this patch in the past appear to
have been misunderstandings of this point, where people thought that
this version of try_module_get could block.
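
For illustration only, here is a minimal sketch of the shape such an
rwsem-based try_module_get() could take; the actual change is in the
attached module.diff, and the semaphore name used here
(module_unload_sem) is invented:

/*
 * Sketch only, not the attached patch.  Readers use the non-blocking
 * down_read_trylock(), so this never sleeps and stays usable from
 * interrupt context; the unload path would take module_unload_sem
 * for write before checking the reference counts.
 */
static DECLARE_RWSEM(module_unload_sem);	/* name invented for this sketch */

static inline int try_module_get(struct module *module)
{
	int ret = 1;

	if (module) {
		ret = 0;
		if (down_read_trylock(&module_unload_sem)) {
			if (likely(module_is_live(module))) {
				unsigned int cpu = get_cpu();
				local_inc(&module->ref[cpu].count);
				put_cpu();
				ret = 1;
			}
			up_read(&module_unload_sem);
		}
	}
	return ret;
}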

I have been using this change on numerous machines for the
past seven weeks without problems, including on a router using
iptables.

Can you please apply it? Thanks in advance.

--
Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."


Attachments:
module.diff (6.21 kB)

2003-03-02 14:03:10

by Roman Zippel

Subject: Re: Patch/resubmit linux-2.5.63-bk4 try_module_get simplification

Hi,

On Fri, 28 Feb 2003, Adam J. Richter wrote:

> The following patch shrinks and changes the implementation of
> try_module_get() and friends to eliminate the special stopping of all
> CPUs when a module is unloaded. Instead, it uses a read/write
> semaphore in a perhaps slightly non-intuitive way.

Hmm, I was waiting a bit for Rusty's comment, but there isn't any...
Anyway the patch below does the same, but it gets the module ref
speculatively and calls module_get_sync() if there is a problem.
The patch is against an older kernel, but it should still apply;
someone else will have to get it past Rusty.
BTW making the module ref functions not inline saves about 5KB with the
standard config.

bye, Roman

diff -pur -X /home/roman/nodiff linux-2.5.59.org/include/linux/module.h linux-2.5.59.mod/include/linux/module.h
--- linux-2.5.59.org/include/linux/module.h 2003-01-20 20:23:33.000000000 +0100
+++ linux-2.5.59.mod/include/linux/module.h 2003-02-08 14:16:54.000000000 +0100
@@ -259,16 +259,18 @@ void symbol_put_addr(void *addr);
#define local_dec(x) atomic_dec(x)
#endif

+extern int module_get_sync(struct module *module);
+
static inline int try_module_get(struct module *module)
{
int ret = 1;

if (module) {
unsigned int cpu = get_cpu();
- if (likely(module_is_live(module)))
- local_inc(&module->ref[cpu].count);
- else
- ret = 0;
+ local_inc(&module->ref[cpu].count);
+ smp_mb();
+ if (unlikely(!module_is_live(module)))
+ ret = module_get_sync(module);
put_cpu();
}
return ret;
diff -pur -X /home/roman/nodiff linux-2.5.59.org/kernel/module.c linux-2.5.59.mod/kernel/module.c
--- linux-2.5.59.org/kernel/module.c 2003-01-20 20:23:33.000000000 +0100
+++ linux-2.5.59.mod/kernel/module.c 2003-02-08 15:05:02.000000000 +0100
@@ -203,154 +203,6 @@ static void module_unload_free(struct mo
}
}

-#ifdef CONFIG_SMP
-/* Thread to stop each CPU in user context. */
-enum stopref_state {
- STOPREF_WAIT,
- STOPREF_PREPARE,
- STOPREF_DISABLE_IRQ,
- STOPREF_EXIT,
-};
-
-static enum stopref_state stopref_state;
-static unsigned int stopref_num_threads;
-static atomic_t stopref_thread_ack;
-
-static int stopref(void *cpu)
-{
- int irqs_disabled = 0;
- int prepared = 0;
-
- sprintf(current->comm, "kmodule%lu\n", (unsigned long)cpu);
-
- /* Highest priority we can manage, and move to right CPU. */
-#if 0 /* FIXME */
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- setscheduler(current->pid, SCHED_FIFO, &param);
-#endif
- set_cpus_allowed(current, 1UL << (unsigned long)cpu);
-
- /* Ack: we are alive */
- atomic_inc(&stopref_thread_ack);
-
- /* Simple state machine */
- while (stopref_state != STOPREF_EXIT) {
- if (stopref_state == STOPREF_DISABLE_IRQ && !irqs_disabled) {
- local_irq_disable();
- irqs_disabled = 1;
- /* Ack: irqs disabled. */
- atomic_inc(&stopref_thread_ack);
- } else if (stopref_state == STOPREF_PREPARE && !prepared) {
- /* Everyone is in place, hold CPU. */
- preempt_disable();
- prepared = 1;
- atomic_inc(&stopref_thread_ack);
- }
- if (irqs_disabled || prepared)
- cpu_relax();
- else
- yield();
- }
-
- /* Ack: we are exiting. */
- atomic_inc(&stopref_thread_ack);
-
- if (irqs_disabled)
- local_irq_enable();
- if (prepared)
- preempt_enable();
-
- return 0;
-}
-
-/* Change the thread state */
-static void stopref_set_state(enum stopref_state state, int sleep)
-{
- atomic_set(&stopref_thread_ack, 0);
- wmb();
- stopref_state = state;
- while (atomic_read(&stopref_thread_ack) != stopref_num_threads) {
- if (sleep)
- yield();
- else
- cpu_relax();
- }
-}
-
-/* Stop the machine. Disables irqs. */
-static int stop_refcounts(void)
-{
- unsigned int i, cpu;
- unsigned long old_allowed;
- int ret = 0;
-
- /* One thread per cpu. We'll do our own. */
- cpu = smp_processor_id();
-
- /* FIXME: racy with set_cpus_allowed. */
- old_allowed = current->cpus_allowed;
- set_cpus_allowed(current, 1UL << (unsigned long)cpu);
-
- atomic_set(&stopref_thread_ack, 0);
- stopref_num_threads = 0;
- stopref_state = STOPREF_WAIT;
-
- /* No CPUs can come up or down during this. */
- down(&cpucontrol);
-
- for (i = 0; i < NR_CPUS; i++) {
- if (i == cpu || !cpu_online(i))
- continue;
- ret = kernel_thread(stopref, (void *)(long)i, CLONE_KERNEL);
- if (ret < 0)
- break;
- stopref_num_threads++;
- }
-
- /* Wait for them all to come to life. */
- while (atomic_read(&stopref_thread_ack) != stopref_num_threads)
- yield();
-
- /* If some failed, kill them all. */
- if (ret < 0) {
- stopref_set_state(STOPREF_EXIT, 1);
- up(&cpucontrol);
- return ret;
- }
-
- /* Don't schedule us away at this point, please. */
- preempt_disable();
-
- /* Now they are all scheduled, make them hold the CPUs, ready. */
- stopref_set_state(STOPREF_PREPARE, 0);
-
- /* Make them disable irqs. */
- stopref_set_state(STOPREF_DISABLE_IRQ, 0);
-
- local_irq_disable();
- return 0;
-}
-
-/* Restart the machine. Re-enables irqs. */
-static void restart_refcounts(void)
-{
- stopref_set_state(STOPREF_EXIT, 0);
- local_irq_enable();
- preempt_enable();
- up(&cpucontrol);
-}
-#else /* ...!SMP */
-static inline int stop_refcounts(void)
-{
- local_irq_disable();
- return 0;
-}
-static inline void restart_refcounts(void)
-{
- local_irq_enable();
-}
-#endif
-
static unsigned int module_refcount(struct module *mod)
{
unsigned int i, total = 0;
@@ -381,12 +233,30 @@ void cleanup_module(void)
}
EXPORT_SYMBOL(cleanup_module);

+static spinlock_t unload_lock = SPIN_LOCK_UNLOCKED;
+
+int module_get_sync(struct module *module)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ spin_lock_irqsave(&unload_lock, flags);
+ if (!module_is_live(module)) {
+ local_dec(&module->ref[smp_processor_id()].count);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&unload_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(module_get_sync);
+
asmlinkage long
sys_delete_module(const char *name_user, unsigned int flags)
{
struct module *mod;
char name[MODULE_NAME_LEN];
int ret, forced = 0;
+ unsigned long flags;

if (!capable(CAP_SYS_MODULE))
return -EPERM;
@@ -439,22 +309,15 @@ sys_delete_module(const char *name_user,
goto out;
}
}
- /* Stop the machine so refcounts can't move: irqs disabled. */
- DEBUGP("Stopping refcounts...\n");
- ret = stop_refcounts();
- if (ret != 0)
- goto out;

- /* If it's not unused, quit unless we are told to block. */
- if ((flags & O_NONBLOCK) && module_refcount(mod) != 0) {
- forced = try_force(flags);
- if (!forced)
- ret = -EWOULDBLOCK;
- } else {
- mod->waiter = current;
- mod->state = MODULE_STATE_GOING;
- }
- restart_refcounts();
+ spin_lock_irqsave(&unload_lock, flags);
+ mod->state = MODULE_STATE_GOING;
+ smp_mb();
+ if (module_refcount(mod))
+ mod->state = MODULE_STATE_LIVE;
+ else
+ ret = -EBUSY;
+ spin_unlock_irqrestore(&unload_lock, flags);

if (ret != 0)
goto out;

2003-03-02 17:23:23

by Adam J. Richter

Subject: Re: Patch/resubmit linux-2.5.63-bk4 try_module_get simplification

On Sun, 2 Mar 2003, Roman Zippel wrote:
>On Fri, 28 Feb 2003, Adam J. Richter wrote:

>> The following patch shrinks and changes the implementation of
>> try_module_get() and friends to eliminate the special stopping of all
>> CPUs when a module is unloaded. Instead, it uses a read/write
>> semaphore in a perhaps slightly non-intuitive way.

>Hmm, I was waiting a bit for Rusty's comment, but there isn't any...
>Anyway the patch below does the same, but it gets the module ref
>speculatively and calls module_get_sync() if there is a problem.

That is a clever implementation!

I do have a few questions and comments though.

Is there enough traffic on the module reference counts to make
this trade-off worthwhile? On x86, the module_ref array is 512 bytes
per module (SMP_CACHE_BYTES=16 x NR_CPUS=32). For example, my gateway
machine has 49 modules loaded right now, so that would be 24kB. Even
in iptables, I would think that module reference counts should only be
modified when a rule is added or removed (because you still need to
maintain a separate usage count for each rule to know whether you can
remove it, even if it's not from a loadable module).

If it's worthwhile to trade off that amount of memory usage
for that amount of reduction in cross-CPU bus traffic, then you
should probably move unload_lock into each struct module rather than
having it be a single static variable, as it is not protecting any
statically allocated data.

I also see a bigger corollary of that trade-off. If there is
enough traffic to warrant a per-cpu approach for module reference
counts, surely there should be other rw_semaphore users that
experience more traffic on a smaller number of instances than module
references. So, perhaps your code should be generalized to "struct
big_fast_rw_sem". In particular, I think such a facility might be
useful for the semaphore guarding name lists, such as network device
names or filesystem type names (for example, file_systems_lock in
fs/filesystems.c).
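
To make that concrete, here is a rough, purely hypothetical sketch of
the kind of interface I mean (nothing like this exists; all names are
invented), modelled on the per-cpu module_ref array:

/* Hypothetical "big, fast" rwsem: readers only touch a local,
   cache-aligned counter; writers pay the cost of the slow path. */
struct bfr_percpu {
	atomic_t count;
} ____cacheline_aligned_in_smp;

struct big_fast_rw_sem {
	struct bfr_percpu readers[NR_CPUS];	/* read fast path */
	int slow_mode;				/* set while a writer is waiting */
	struct rw_semaphore slow_sem;		/* writers and slow-mode readers */
};

int bfr_read_trylock(struct big_fast_rw_sem *sem);	/* never sleeps */
void bfr_read_unlock(struct big_fast_rw_sem *sem);
void bfr_down_write(struct big_fast_rw_sem *sem);	/* may sleep */
void bfr_up_write(struct big_fast_rw_sem *sem);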

I posted a patch some time ago for a module_get() that never
failed but which could only be called while one of these semaphores
was held with at least a read lock, and which required registration of
the relevant semaphores during the module's initialization routine.
~90% of try_module_get users could use this interface and thereby
avoid rarely used, potentially buggy error branches; the remaining
users would continue to use try_module_get. It is precisely these
cases where big_fast_rw_sem might be useful.

One common characteristic of all of the big_fast_rw_sem uses
that I have in mind, including module reference counts, is that the
counter is statically allocated. This means that once per-cpu
variables are supported in modules, it will make sense to use
DEFINE_PER_CPU et al instead of declaring an array of NR_CPUS.
This has the advantage that it may use less memory if the platform
is able to determine a smaller maximum number of CPUs at run time,
and can potentially produce faster code if the platform implements
per-cpu data using different memory mappings per CPU.
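
For example (purely illustrative, assuming DEFINE_PER_CPU becomes
usable from modules; the counter and helper names below are made up),
the NR_CPUS array could be replaced by something like:

#include <linux/percpu.h>

/* Hypothetical replacement for "struct module_ref ref[NR_CPUS];" */
static DEFINE_PER_CPU(atomic_t, my_refcount);

static inline void my_ref_inc(void)
{
	/* get_cpu_var() disables preemption around the local access */
	atomic_inc(&get_cpu_var(my_refcount));
	put_cpu_var(my_refcount);
}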

Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."

2003-03-02 18:26:55

by Roman Zippel

Subject: Re: Patch/resubmit linux-2.5.63-bk4 try_module_get simplification

Hi,

On Sun, 2 Mar 2003, Adam J. Richter wrote:

> Is there enough traffic on the module reference counts to make
> this trade-off worthwhile?

I don't know, you have to ask Rusty that.
BTW the same trick is also possible with the old module count:

int try_inc_mod_count(struct module *mod)
{
	int res;

	if (mod) {
		__MOD_INC_USE_COUNT(mod);
		smp_mb__after_atomic_inc();	/* inc before checking MOD_DELETED */
		if (unlikely(mod->flags & MOD_DELETED))
			goto check;
	}
	return 1;

check:
	res = 1;
	spin_lock(&unload_lock);
	if (mod->flags & MOD_DELETED) {
		__MOD_DEC_USE_COUNT(mod);
		res = 0;
	}
	spin_unlock(&unload_lock);
	return res;
}

(and a similar change to sys_delete_module.)

bye, Roman

2003-03-07 06:56:14

by Rusty Russell

Subject: Re: Patch/resubmit linux-2.5.63-bk4 try_module_get simplification

On Sun, 2 Mar 2003 15:12:37 +0100 (CET)
Roman Zippel <[email protected]> wrote:

> Hi,
>
> On Fri, 28 Feb 2003, Adam J. Richter wrote:
>
> > The following patch shrinks and changes the implementation of
> > try_module_get() and friends to eliminate the special stopping of all
> > CPUs when a module is unloaded. Instead, it uses a read/write
> > semaphore in a perhaps slightly non-intuitive way.
>
> Hmm, I was waiting a bit for Rusty's comment, but there isn't any...

(Sorry, on holiday, skimming mail)

I like it *very* much! There's a small race, but it's easily fixed by
turning the local_dec into a module_put (or a variant): otherwise an
rmmod --wait can sleep forever, waking up, seeing a non-zero
refcount, and going back to sleep.
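
In other words (a sketch only, reusing the names from Roman's patch
above), module_get_sync() would drop the speculative reference via
module_put(), which wakes mod->waiter, instead of with a bare
local_dec():

int module_get_sync(struct module *module)
{
	unsigned long flags;
	int ret = 1;

	spin_lock_irqsave(&unload_lock, flags);
	if (!module_is_live(module)) {
		/* module_put() wakes the waiter when the count drops,
		   so a sleeping "rmmod --wait" is not missed. */
		module_put(module);
		ret = 0;
	}
	spin_unlock_irqrestore(&unload_lock, flags);
	return ret;
}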

Three other requests, if I may.

It'd be nice to have a comment for the two smp_mb()s, e.g. /* Must increment
before checking state */ and vice versa above the one in module.c. Secondly,
it's probably nicer to just rename modlist_lock to module_lock and use that,
and thirdly, merge with the racefix patch below if Linus hasn't already
taken it.

BTW, I'm away from the office for a couple of weeks, and even if I weren't,
my test box doesn't stay up for more than minutes under any recent kernel 8(
Can you stress test this and send to Linus?

> BTW making the module ref functions not inline saves about 5KB with the
> standard config.

Interesting. The "if (!module)" could theoretically be dropped, but I
just looked at the sizes of try_module_get as a standalone function
for various config options (x86, gcc 3.2.3):

UNLOAD=y  SMP=y  PREEMPT=y   try_module_get()  78 bytes
UNLOAD=y  SMP=y  PREEMPT=n   try_module_get()  48 bytes
UNLOAD=y  SMP=n  PREEMPT=n   try_module_get()  28 bytes
UNLOAD=n                     try_module_get()  21 bytes

These numbers are a bit exaggerated due to function prologues and
epilogues, but in the x86 PREEMPT=y case at least I definitely think
it's worth out-of-lining it (a straight function call costs about 11
bytes). Don't know about other archs.
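
Concretely (a sketch of the idea only, not a patch), out-of-lining
would mean leaving just a declaration in include/linux/module.h and
moving the existing inline body into kernel/module.c, exported so
modules can still use it:

/* include/linux/module.h */
extern int try_module_get(struct module *module);

/* kernel/module.c: same body as the current inline version */
int try_module_get(struct module *module)
{
	int ret = 1;

	if (module) {
		unsigned int cpu = get_cpu();
		if (likely(module_is_live(module)))
			local_inc(&module->ref[cpu].count);
		else
			ret = 0;
		put_cpu();
	}
	return ret;
}
EXPORT_SYMBOL(try_module_get);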

Thanks for the patch!
Rusty.
--
there are those who do and those who hang on and you don't see too
many doers quoting their contemporaries. -- Larry McVoy

Name: Fix two module races
Author: Bob Miller, Rusty Russell
Status: Trivial

D: Bob Miller points out that the try_module_get in use_module() can,
D: of course, fail. Secondly, there is a race between setting the module
D: live, and a simultaneous removal of it.

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.62-bk6/kernel/module.c working-2.5.62-bk6-modraces/kernel/module.c
--- linux-2.5.62-bk6/kernel/module.c 2003-02-18 11:18:57.000000000 +1100
+++ working-2.5.62-bk6-modraces/kernel/module.c 2003-02-24 13:42:44.000000000 +1100
@@ -173,16 +173,19 @@ static int use_module(struct module *a,
struct module_use *use;
if (b == NULL || already_uses(a, b)) return 1;

+ if (!strong_try_module_get(b))
+ return 0;
+
DEBUGP("Allocating new usage for %s.\n", a->name);
use = kmalloc(sizeof(*use), GFP_ATOMIC);
if (!use) {
printk("%s: out of memory loading\n", a->name);
+ module_put(b);
return 0;
}

use->module_which_uses = a;
list_add(&use->list, &b->modules_which_use_me);
- try_module_get(b); /* Can't fail */
return 1;
}

@@ -1456,10 +1459,12 @@ sys_init_module(void *umod,
}

/* Now it's a first class citizen! */
+ down(&module_mutex);
mod->state = MODULE_STATE_LIVE;
module_free(mod, mod->module_init);
mod->module_init = NULL;
mod->init_size = 0;
+ up(&module_mutex);

return 0;
}

2003-03-07 06:56:26

by Rusty Russell

Subject: Re: Patch/resubmit linux-2.5.63-bk4 try_module_get simplification

In message <[email protected]> you write:
> Is there enough traffic on the module reference counts to make
> this trade-off worthwhile? On x86, the module_ref array is 512 bytes
> per module (SMP_CACHE_BYTES=16 x NR_CPUS=32). For example, my gateway

Good question. NR_CPUS=32 is a bad default for x86, to begin with. I
have also been working on a better kmalloc_percpu routine, so modules
(and, of course, anything else which can benefit) can switch across to
that (if DECLARE_PER_CPU is to work in modules, we *need* something
like this, which was my motivation).

I did write a "bigref" implementation a while back (attached below,
probably bitrotted), which uses a fairly simple "go to single refcount
when someone is watching" approach. It uses synchronize_kernel() to
ensure that no one is currently doing the "if (!slow_mode)
local_inc()", but I think a cleverer implementation might use
something like Roman's speculative approach?

Cheers,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Name: Bigrefs Implementation
Author: Rusty Russell
Status: Tested on 2.5.38

D: This is an implementation of cache-friendly reference counts. The
D: refcounters work in two modes: the normal mode just incs and decs a
D: cache-aligned per-cpu counter. When someone is waiting for the
D: reference count to hit 0, a flag is set and a shared reference
D: counter is decremented (which is slower).
D:
D: This uses a simple non-intrusive synchronize_kernel() primitive.

diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.42/include/linux/bigref.h working-2.5.42-bigrefs/include/linux/bigref.h
--- linux-2.5.42/include/linux/bigref.h Thu Jan 1 10:00:00 1970
+++ working-2.5.42-bigrefs/include/linux/bigref.h Tue Oct 15 17:51:16 2002
@@ -0,0 +1,81 @@
+#ifndef _LINUX_BIGREF_H
+#define _LINUX_BIGREF_H
+#include <linux/cache.h>
+#include <linux/smp.h>
+#include <linux/compiler.h>
+#include <linux/thread_info.h>
+#include <linux/preempt.h>
+#include <asm/atomic.h>
+
+/* Big reference counts for Linux.
+ * (C) Copyright 2002 Paul Russell, IBM Corporation.
+ */
+
+struct bigref_percpu
+{
+ atomic_t counter;
+ int slow_mode;
+} ____cacheline_aligned_in_smp;
+
+struct bigref
+{
+ struct bigref_percpu ref[NR_CPUS];
+
+ atomic_t slow_count;
+ struct task_struct *waiter;
+};
+
+/* Initialize the bigref in slow mode */
+extern void bigref_init_slow(struct bigref *ref, int value);
+
+/* Move it to fast mode */
+extern void bigref_start_fast(struct bigref *ref);
+
+/* Get the approximate value */
+extern int bigref_val(struct bigref *ref);
+
+/* Wait for it refcount to hit zero (sleeps) */
+extern int bigref_wait_for_zero(struct bigref *ref, long task_state);
+
+/* Is it currently in fast mode? */
+static inline int bigref_is_fast(struct bigref *ref)
+{
+ /* Could use any cpu's, but this is cache-friendly if we're
+ about to frob the counter */
+ return !ref->ref[smp_processor_id()].slow_mode;
+}
+
+extern void __bigref_inc(struct bigref *ref);
+extern void __bigref_dec(struct bigref *ref);
+
+/* We only need protection against local interrupts. */
+#ifndef __HAVE_LOCAL_INC
+#define local_inc(x) atomic_inc(x)
+#define local_dec(x) atomic_dec(x)
+#endif
+
+static inline void bigref_inc(struct bigref *ref)
+{
+ struct bigref_percpu *cpu;
+
+ cpu = &ref->ref[get_cpu()];
+ if (likely(!cpu->slow_mode))
+ local_inc(&cpu->counter);
+ else
+ __bigref_inc(ref);
+ put_cpu();
+}
+
+/* Return true if we were the last one to decrement it */
+static inline void bigref_dec(struct bigref *ref)
+{
+ struct bigref_percpu *cpu;
+
+ cpu = &ref->ref[get_cpu()];
+ if (likely(!cpu->slow_mode))
+ local_dec(&cpu->counter);
+ else
+ __bigref_dec(ref);
+ put_cpu();
+}
+#endif /* _LINUX_BIGREF_H */
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.42/include/linux/sched.h working-2.5.42-bigrefs/include/linux/sched.h
--- linux-2.5.42/include/linux/sched.h Tue Oct 15 15:30:04 2002
+++ working-2.5.42-bigrefs/include/linux/sched.h Tue Oct 15 17:51:16 2002
@@ -449,8 +449,10 @@ do { if (atomic_dec_and_test(&(tsk)->usa

#if CONFIG_SMP
extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
+extern void synchronize_kernel(void);
#else
# define set_cpus_allowed(p, new_mask) do { } while (0)
+# define synchronize_kernel() do { } while (0)
#endif

extern void set_user_nice(task_t *p, long nice);
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.42/kernel/Makefile working-2.5.42-bigrefs/kernel/Makefile
--- linux-2.5.42/kernel/Makefile Tue Oct 15 15:30:05 2002
+++ working-2.5.42-bigrefs/kernel/Makefile Tue Oct 15 17:51:16 2002
@@ -3,12 +3,12 @@
#

export-objs = signal.o sys.o kmod.o workqueue.o ksyms.o pm.o exec_domain.o \
- printk.o platform.o suspend.o dma.o module.o cpufreq.o
+ printk.o platform.o suspend.o dma.o module.o cpufreq.o bigref.o

obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
module.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o capability.o ptrace.o timer.o user.o \
- signal.o sys.o kmod.o workqueue.o futex.o platform.o pid.o
+ signal.o sys.o kmod.o workqueue.o futex.o platform.o pid.o bigref.o

obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.42/kernel/bigref.c working-2.5.42-bigrefs/kernel/bigref.c
--- linux-2.5.42/kernel/bigref.c Thu Jan 1 10:00:00 1970
+++ working-2.5.42-bigrefs/kernel/bigref.c Tue Oct 15 18:02:43 2002
@@ -0,0 +1,122 @@
+/* Big reference counts for Linux.
+ * (C) Copyright 2002 Paul Russell, IBM Corporation.
+ */
+#include <linux/bigref.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+/* Atomic is 24 bits on sparc, so make this 23. */
+#define BIGREF_BIAS (1 << 23)
+
+void __bigref_inc(struct bigref *ref)
+{
+ /* They *must* have read slow_mode before they touch slow
+ count, which is not guaranteed on all architectures. */
+ rmb();
+ atomic_inc(&ref->slow_count);
+}
+
+void __bigref_dec(struct bigref *ref)
+{
+ /* They *must* have read slow_mode before they touch slow
+ count, which is not guaranteed on all architectures. */
+ rmb();
+ if (atomic_dec_and_test(&ref->slow_count))
+ wake_up_process(ref->waiter);
+}
+
+void bigref_init_slow(struct bigref *ref, int value)
+{
+ unsigned int i;
+
+ /* Bias by 1 so it doesn't fall to zero with noone waiting. */
+ atomic_set(&ref->slow_count, value+1);
+ for (i = 0; i < NR_CPUS; i++) {
+ atomic_set(&ref->ref[i].counter, 0);
+ ref->ref[i].slow_mode = 1;
+ }
+ ref->waiter = NULL; /* To trap bugs */
+}
+
+/* Start it in fast mode */
+void bigref_start_fast(struct bigref *ref)
+{
+ unsigned int i;
+
+ /* Remove bias. */
+ atomic_sub(1, &ref->slow_count);
+ for (i = 0; i < NR_CPUS; i++)
+ ref->ref[i].slow_mode = 0;
+}
+
+/* Get the approximate value */
+int bigref_val(struct bigref *ref)
+{
+ unsigned int i;
+ int total;
+
+ total = atomic_read(&ref->slow_count);
+ for (i = 0; i < NR_CPUS; i++)
+ total += atomic_read(&ref->ref[i].counter);
+
+ return total;
+}
+
+int bigref_wait_for_zero(struct bigref *ref, long task_state)
+{
+ unsigned int i;
+ int total;
+
+ /* Boost it high so noone drops it to zero. */
+ atomic_add(BIGREF_BIAS, &ref->slow_count);
+ wmb();
+ for (i = 0; i < NR_CPUS; i++)
+ ref->ref[i].slow_mode = 1;
+ wmb();
+
+ /* Wait for that to sink in everywhere... */
+ synchronize_kernel();
+
+ /* Sum all the (now inactive) per-cpu counters */
+ total = BIGREF_BIAS;
+ for (i = 0; i < NR_CPUS; i++)
+ total += atomic_read(&ref->ref[i].counter);
+
+ /* Now we move those counters into the slow counter, and take
+ away the bias again. Leave one refcount for us. */
+ atomic_sub(total - 1, &ref->slow_count);
+
+ /* Someone may dec to zero after the next step, so be ready. */
+ ref->waiter = current;
+ current->state = task_state;
+ wmb();
+
+ /* Drop (probably final) refcount */
+ __bigref_dec(ref);
+ schedule();
+
+ /* Not interrupted? */
+ if (atomic_read(&ref->slow_count) == 0)
+ return 0;
+
+ /* Revert the bigref to fast mode (assumes once it hits zero
+ it won't increase again, so code is safe) */
+ for (i = 0; i < NR_CPUS; i++) {
+ /* We already included previous per-cpu counters into
+ total, so reset them */
+ atomic_set(&ref->ref[i].counter, 0);
+ wmb();
+ ref->ref[i].slow_mode = 0;
+ }
+
+ /* Might have decremented to zero in the meantime. */
+ if (bigref_val(ref) == 0)
+ return 0;
+ else
+ return -EINTR;
+}
+
+EXPORT_SYMBOL(__bigref_inc);
+EXPORT_SYMBOL(__bigref_dec);
+EXPORT_SYMBOL(bigref_val);
+EXPORT_SYMBOL(bigref_wait_for_zero);
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.42/kernel/sched.c working-2.5.42-bigrefs/kernel/sched.c
--- linux-2.5.42/kernel/sched.c Tue Oct 15 15:31:06 2002
+++ working-2.5.42-bigrefs/kernel/sched.c Tue Oct 15 17:51:16 2002
@@ -1910,6 +1910,50 @@ void __init init_idle(task_t *idle, int
}

#if CONFIG_SMP
+/* This scales quite well (eg. 64 processors, average time to wait for
+ first schedule = jiffie/64. Total time for all processors =
+ jiffie/63 + jiffie/62...
+
+ At 1024 cpus, this is about 7.5 jiffies. And that assumes noone
+ schedules early. --RR */
+void synchronize_kernel(void)
+{
+ unsigned long cpus_allowed, old_cpus_allowed, old_prio, old_policy;
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ unsigned int i;
+
+ /* Save old values. */
+ read_lock_irq(&tasklist_lock);
+ old_cpus_allowed = current->cpus_allowed;
+ old_prio = current->rt_priority;
+ old_policy = current->policy;
+ read_unlock_irq(&tasklist_lock);
+
+ /* Highest priority we can manage. */
+ setscheduler(current->pid, SCHED_FIFO, &param);
+
+ /* Make us schedulable on all other online CPUs: if we get
+ preempted here it doesn't really matter, since it means we
+ *did* run on the cpu returned by smp_processor_id(), which
+ is all we care about. */
+ cpus_allowed = 0;
+ for (i = 0; i < NR_CPUS; i++)
+ if (cpu_online(i) && i != smp_processor_id())
+ cpus_allowed |= (1 << i);
+
+ while (cpus_allowed) {
+ /* Change CPUs */
+ set_cpus_allowed(current, cpus_allowed);
+ /* Eliminate this one */
+ cpus_allowed &= ~(1 << smp_processor_id());
+ }
+
+ /* Back to normal. */
+ set_cpus_allowed(current, old_cpus_allowed);
+ param.sched_priority = old_prio;
+ setscheduler(current->pid, old_policy, &param);
+}
+
/*
* This is how migration works:
*

2003-03-07 17:02:11

by Bob Miller

Subject: Re: Patch/resubmit linux-2.5.63-bk4 try_module_get simplification

The patch below was included by Linus around 2/14/03.

On Fri, Mar 07, 2003 at 05:34:18PM +1100, Rusty Russell wrote:
> Three other requests, if I may.
>
> It'd be nice to have a comment for the two smp_mb()s, e.g. /* Must increment
> before checking state */ and vice versa above the one in module.c. Secondly,
> it's probably nicer to just rename modlist_lock to module_lock and use that,
> and thirdly, merge with the racefix patch below if Linus hasn't already
> taken it.
>
>
> Name: Fix two module races
> Author: Bob Miller, Rusty Russell
> Status: Trivial
>
> D: Bob Miller points out that the try_module_get in use_module() can,
> D: of course, fail. Secondly, there is a race between setting the module
> D: live, and a simultaneous removal of it.
>
> diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.62-bk6/kernel/module.c working-2.5.62-bk6-modraces/kernel/module.c
> --- linux-2.5.62-bk6/kernel/module.c 2003-02-18 11:18:57.000000000 +1100
> +++ working-2.5.62-bk6-modraces/kernel/module.c 2003-02-24 13:42:44.000000000 +1100
> @@ -173,16 +173,19 @@ static int use_module(struct module *a,
> struct module_use *use;
> if (b == NULL || already_uses(a, b)) return 1;
>
> + if (!strong_try_module_get(b))
> + return 0;
> +
> DEBUGP("Allocating new usage for %s.\n", a->name);
> use = kmalloc(sizeof(*use), GFP_ATOMIC);
> if (!use) {
> printk("%s: out of memory loading\n", a->name);
> + module_put(b);
> return 0;
> }
>
> use->module_which_uses = a;
> list_add(&use->list, &b->modules_which_use_me);
> - try_module_get(b); /* Can't fail */
> return 1;
> }
>
> @@ -1456,10 +1459,12 @@ sys_init_module(void *umod,
> }
>
> /* Now it's a first class citizen! */
> + down(&module_mutex);
> mod->state = MODULE_STATE_LIVE;
> module_free(mod, mod->module_init);
> mod->module_init = NULL;
> mod->init_size = 0;
> + up(&module_mutex);
>
> return 0;
> }
>

--
Bob Miller Email: [email protected]
Open Source Development Lab Phone: 503.626.2455 Ext. 17