Here is a way to replace all of the specialized "stop CPU"
locking code in kernel/module.c with an rw_semaphore by using
down_read_trylock in try_module_get() and down_write when beginning to
unload the module.
The following UNTESTED patch, a net deletion of 136 lines,
illustrates the basic idea. I've only verified that the resulting
kernel/module.c compiles. I'm sorry that I can't test it right now as
I'm in the midst of working on something else.
By the way, I think this change could also help eliminate the
module->state variable, which I believe currently can change while
a reader could be trying to read it.
Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."
--- linux-2.5.54/include/linux/module.h 2003-01-01 19:22:53.000000000 -0800
+++ linux/include/linux/module.h 2003-01-06 17:17:17.000000000 -0800
@@ -13,12 +13,13 @@
#include <linux/stat.h>
#include <linux/compiler.h>
#include <linux/cache.h>
#include <linux/kmod.h>
#include <linux/elf.h>
#include <linux/stringify.h>
+#include <linux/rwsem.h>
#include <asm/module.h>
#include <asm/uaccess.h> /* For struct exception_table_entry */
/* Not Yet Implemented */
#define MODULE_LICENSE(name)
@@ -147,12 +148,14 @@
struct mod_arch_specific arch;
/* Am I unsafe to unload? */
int unsafe;
#ifdef CONFIG_MODULE_UNLOAD
+ struct rw_semaphore unload_rwsem;
+
/* Reference counts */
struct module_ref ref[NR_CPUS];
/* What modules depend on me? */
struct list_head modules_which_use_me;
@@ -198,15 +201,16 @@
static inline int try_module_get(struct module *module)
{
int ret = 1;
if (module) {
unsigned int cpu = get_cpu();
- if (likely(module_is_live(module)))
+ if (down_read_trylock(&module->unload_rwsem)) {
local_inc(&module->ref[cpu].count);
- else
+ up_read(&module->unload_rwsem);
+ } else
ret = 0;
put_cpu();
}
return ret;
}
--- linux-2.5.54/kernel/module.c 2003-01-01 19:22:35.000000000 -0800
+++ linux/kernel/module.c 2003-01-06 17:29:09.000000000 -0800
@@ -18,6 +18,7 @@
*/
#include <linux/config.h>
#include <linux/module.h>
+#include <linux/module_rwsem.h>
#include <linux/moduleloader.h>
#include <linux/init.h>
#include <linux/slab.h>
@@ -126,6 +127,10 @@
{
unsigned int i;
+ init_rwsem(&mod->unload_rwsem);
+ down_read(&mod->unload_rwsem);
+ /* Prevent unloading of module until module_init() completes. */
+
INIT_LIST_HEAD(&mod->modules_which_use_me);
for (i = 0; i < NR_CPUS; i++)
atomic_set(&mod->ref[i].count, 0);
@@ -195,154 +200,6 @@
}
}
-#ifdef CONFIG_SMP
-/* Thread to stop each CPU in user context. */
-enum stopref_state {
- STOPREF_WAIT,
- STOPREF_PREPARE,
- STOPREF_DISABLE_IRQ,
- STOPREF_EXIT,
-};
-
-static enum stopref_state stopref_state;
-static unsigned int stopref_num_threads;
-static atomic_t stopref_thread_ack;
-
-static int stopref(void *cpu)
-{
- int irqs_disabled = 0;
- int prepared = 0;
-
- sprintf(current->comm, "kmodule%lu\n", (unsigned long)cpu);
-
- /* Highest priority we can manage, and move to right CPU. */
-#if 0 /* FIXME */
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- setscheduler(current->pid, SCHED_FIFO, ¶m);
-#endif
- set_cpus_allowed(current, 1UL << (unsigned long)cpu);
-
- /* Ack: we are alive */
- atomic_inc(&stopref_thread_ack);
-
- /* Simple state machine */
- while (stopref_state != STOPREF_EXIT) {
- if (stopref_state == STOPREF_DISABLE_IRQ && !irqs_disabled) {
- local_irq_disable();
- irqs_disabled = 1;
- /* Ack: irqs disabled. */
- atomic_inc(&stopref_thread_ack);
- } else if (stopref_state == STOPREF_PREPARE && !prepared) {
- /* Everyone is in place, hold CPU. */
- preempt_disable();
- prepared = 1;
- atomic_inc(&stopref_thread_ack);
- }
- if (irqs_disabled || prepared)
- cpu_relax();
- else
- yield();
- }
-
- /* Ack: we are exiting. */
- atomic_inc(&stopref_thread_ack);
-
- if (irqs_disabled)
- local_irq_enable();
- if (prepared)
- preempt_enable();
-
- return 0;
-}
-
-/* Change the thread state */
-static void stopref_set_state(enum stopref_state state, int sleep)
-{
- atomic_set(&stopref_thread_ack, 0);
- wmb();
- stopref_state = state;
- while (atomic_read(&stopref_thread_ack) != stopref_num_threads) {
- if (sleep)
- yield();
- else
- cpu_relax();
- }
-}
-
-/* Stop the machine. Disables irqs. */
-static int stop_refcounts(void)
-{
- unsigned int i, cpu;
- unsigned long old_allowed;
- int ret = 0;
-
- /* One thread per cpu. We'll do our own. */
- cpu = smp_processor_id();
-
- /* FIXME: racy with set_cpus_allowed. */
- old_allowed = current->cpus_allowed;
- set_cpus_allowed(current, 1UL << (unsigned long)cpu);
-
- atomic_set(&stopref_thread_ack, 0);
- stopref_num_threads = 0;
- stopref_state = STOPREF_WAIT;
-
- /* No CPUs can come up or down during this. */
- down(&cpucontrol);
-
- for (i = 0; i < NR_CPUS; i++) {
- if (i == cpu || !cpu_online(i))
- continue;
- ret = kernel_thread(stopref, (void *)(long)i, CLONE_KERNEL);
- if (ret < 0)
- break;
- stopref_num_threads++;
- }
-
- /* Wait for them all to come to life. */
- while (atomic_read(&stopref_thread_ack) != stopref_num_threads)
- yield();
-
- /* If some failed, kill them all. */
- if (ret < 0) {
- stopref_set_state(STOPREF_EXIT, 1);
- up(&cpucontrol);
- return ret;
- }
-
- /* Don't schedule us away at this point, please. */
- preempt_disable();
-
- /* Now they are all scheduled, make them hold the CPUs, ready. */
- stopref_set_state(STOPREF_PREPARE, 0);
-
- /* Make them disable irqs. */
- stopref_set_state(STOPREF_DISABLE_IRQ, 0);
-
- local_irq_disable();
- return 0;
-}
-
-/* Restart the machine. Re-enables irqs. */
-static void restart_refcounts(void)
-{
- stopref_set_state(STOPREF_EXIT, 0);
- local_irq_enable();
- preempt_enable();
- up(&cpucontrol);
-}
-#else /* ...!SMP */
-static inline int stop_refcounts(void)
-{
- local_irq_disable();
- return 0;
-}
-static inline void restart_refcounts(void)
-{
- local_irq_enable();
-}
-#endif
-
static unsigned int module_refcount(struct module *mod)
{
unsigned int i, total = 0;
@@ -378,7 +235,8 @@
{
struct module *mod;
char name[MODULE_NAME_LEN];
- int ret, forced = 0;
+ int ret = 0;
+ int forced = 0;
if (!capable(CAP_SYS_MODULE))
return -EPERM;
@@ -431,25 +289,21 @@
goto out;
}
}
- /* Stop the machine so refcounts can't move: irqs disabled. */
- DEBUGP("Stopping refcounts...\n");
- ret = stop_refcounts();
- if (ret != 0)
- goto out;
+
+ down_write(&mod->unload_rwsem);
/* If it's not unused, quit unless we are told to block. */
if ((flags & O_NONBLOCK) && module_refcount(mod) != 0) {
forced = try_force(flags);
- if (!forced)
+ if (!forced) {
+ up_write(&mod->unload_rwsem);
ret = -EWOULDBLOCK;
+ goto out;
+ }
} else {
mod->waiter = current;
mod->state = MODULE_STATE_GOING;
}
- restart_refcounts();
-
- if (ret != 0)
- goto out;
if (forced)
goto destroy;
@@ -471,9 +325,12 @@
destroy:
/* Final destruction now noone is using it. */
mod->exit();
+ up_write(&mod->unload_rwsem);
free_module(mod);
+ mod = NULL;
out:
+
up(&module_mutex);
return ret;
}
@@ -1236,6 +1093,9 @@
/* Start the module */
ret = mod->init();
+#ifdef CONFIG_MODULE_UNLOAD
+ up_read(&mod->unload_rwsem);
+#endif
if (ret < 0) {
/* Init routine failed: abort. Try to protect us from
buggy refcounters. */
I wrote:
> Here is a way to replace all of the specialized "stop CPU"
>locking code in kernel/module.c with an rw_semaphore by using
>down_read_trylock in try_module_get() and down_write when beginning to
>unload the module.
>
> The following UNTESTED patch, a net deletion of 136 lines,
I am running that patch now on two computers. It seems to
be OK.
Rusty, I'd be interested in knowing what you think of the
patch (likewise for other lkml readers).
Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."
At 12:53 AM 1/7/2003 -0800, Adam J. Richter wrote:
>I wrote:
>> Here is a way to replace all of the specialized "stop CPU"
>>locking code in kernel/module.c with an rw_semaphore by using
>>down_read_trylock in try_module_get() and down_write when beginning to
>>unload the module.
>>
>> The following UNTESTED patch, a net deletion of 136 lines,
>
> I am running that patch now on two computers. It seems to
>be OK.
>
> Rusty, I'd be interested in knowing what you think of the
>patch (likewise for other lkml readers).
We have to be able to call try_module_get() from interrupt context.
Max
In message <[email protected]> you write:
> Here is a way to replace all of the specialized "stop CPU"
> locking code in kernel/module.c with an rw_semaphore by using
> down_read_trylock in try_module_get() and down_write when beginning to
> unload the module.
And now you can't modularize netfilter modules.
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
Rusty Russell wrote:
>In message <[email protected]> you write:
>> Here is a way to replace all of the specialized "stop CPU"
>> locking code in kernel/module.c with an rw_semaphore by using
>> down_read_trylock in try_module_get() and down_write when beginning to
>> unload the module.
>And now you can't modularize netfilter modules.
Why not? Last time you went looking in the networking code
for an example of something that had to increment a module reference
in a context where blocking was not allowed you ended up conceding
that you example was incorrect. If you've now found an example,
please let me know.
I just booted my gateway machine to a kernel using my
aforemetioned patch and various netfilter modules. I've surfed the
web, FTP'ed file and run irc through it. It seems to be okay.
Here is the lsmod listing from it.
Module Size Used by
iptable_nat 28324 1 [unsafe],
ip_conntrack 39788 2 iptable_nat,[unsafe],
ext2 60712 0 -
pcmcia_core 55744 0 -
i810 65816 0 -
atkbd 6820 0 -
i8042 8864 0 -
serio 4564 2 atkbd,i8042,
autofs 14992 1 -
ipt_TCPMSS 3568 1 [unsafe],
iptable_filter 2272 1 [unsafe],
ip_tables 17296 8 iptable_nat,ipt_TCPMSS,iptable_filter,[unsafe],
nfsd 111824 1 [unsafe],
exportfs 6160 1 nfsd,
nfs 132092 0 -
lockd 58736 3 nfsd,nfs,[unsafe],
sunrpc 114136 4 nfsd,nfs,lockd,[unsafe],
pppoe 14336 1 [unsafe],
pppox 3480 1 pppoe,
af_packet 20760 2 [unsafe],
ppp_async 11296 0 -
ppp_generic 30264 5 pppoe,pppox,ppp_async,
slhc 6576 1 ppp_generic,
ipv4 392804 88 iptable_nat,ip_conntrack,sunrpc,af_packet,[unsafe],
unix 23884 11 [unsafe],
sis_agp 4224 0 -
agpgart 23248 2 sis_agp,
ohci_hcd 28400 0 -
usbcore 101108 3 ohci_hcd,
ac97_codec 12432 0 -
sis900 16548 0 -
tulip 52960 2 [unsafe],
crc32 4272 2 sis900,tulip,
ext3 112232 3 -
jbd 65840 1 ext3,
mbcache 7764 2 ext2,ext3,
ide_disk 17332 5 -
ide_mod 143684 1 ide_disk,[unsafe],
Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."
On Thu, 09 Jan 2003, Max Krasnyansky wrote:
>At 12:53 AM 1/7/2003 -0800, Adam J. Richter wrote:
>>I wrote:
>>> Here is a way to replace all of the specialized "stop CPU"
>>>locking code in kernel/module.c with an rw_semaphore by using
>>>down_read_trylock in try_module_get() and down_write when beginning to
>>>unload the module.
>>>
>>> The following UNTESTED patch, a net deletion of 136 lines,
>>
>> I am running that patch now on two computers. It seems to
>>be OK.
>>
>> Rusty, I'd be interested in knowing what you think of the
>>patch (likewise for other lkml readers).
>We have to be able to call try_module_get() from interrupt context.
Where? Why? Please show me one or more examples.
Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."
I'll bite .... what the flip is [unsafe] ??
On Fri, 10 Jan 2003, Adam J. Richter wrote:
> Rusty Russell wrote:
> >In message <[email protected]> you write:
> >> Here is a way to replace all of the specialized "stop CPU"
> >> locking code in kernel/module.c with an rw_semaphore by using
> >> down_read_trylock in try_module_get() and down_write when beginning to
> >> unload the module.
>
> >And now you can't modularize netfilter modules.
>
> Why not? Last time you went looking in the networking code
> for an example of something that had to increment a module reference
> in a context where blocking was not allowed you ended up conceding
> that you example was incorrect. If you've now found an example,
> please let me know.
>
> I just booted my gateway machine to a kernel using my
> aforemetioned patch and various netfilter modules. I've surfed the
> web, FTP'ed file and run irc through it. It seems to be okay.
> Here is the lsmod listing from it.
>
> Module Size Used by
> iptable_nat 28324 1 [unsafe],
> ip_conntrack 39788 2 iptable_nat,[unsafe],
> ext2 60712 0 -
> pcmcia_core 55744 0 -
> i810 65816 0 -
> atkbd 6820 0 -
> i8042 8864 0 -
> serio 4564 2 atkbd,i8042,
> autofs 14992 1 -
> ipt_TCPMSS 3568 1 [unsafe],
> iptable_filter 2272 1 [unsafe],
> ip_tables 17296 8 iptable_nat,ipt_TCPMSS,iptable_filter,[unsafe],
> nfsd 111824 1 [unsafe],
> exportfs 6160 1 nfsd,
> nfs 132092 0 -
> lockd 58736 3 nfsd,nfs,[unsafe],
> sunrpc 114136 4 nfsd,nfs,lockd,[unsafe],
> pppoe 14336 1 [unsafe],
> pppox 3480 1 pppoe,
> af_packet 20760 2 [unsafe],
> ppp_async 11296 0 -
> ppp_generic 30264 5 pppoe,pppox,ppp_async,
> slhc 6576 1 ppp_generic,
> ipv4 392804 88 iptable_nat,ip_conntrack,sunrpc,af_packet,[unsafe],
> unix 23884 11 [unsafe],
> sis_agp 4224 0 -
> agpgart 23248 2 sis_agp,
> ohci_hcd 28400 0 -
> usbcore 101108 3 ohci_hcd,
> ac97_codec 12432 0 -
> sis900 16548 0 -
> tulip 52960 2 [unsafe],
> crc32 4272 2 sis900,tulip,
> ext3 112232 3 -
> jbd 65840 1 ext3,
> mbcache 7764 2 ext2,ext3,
> ide_disk 17332 5 -
> ide_mod 143684 1 ide_disk,[unsafe],
>
>
> Adam J. Richter __ ______________ 575 Oroville Road
> [email protected] \ / Milpitas, California 95035
> +1 408 309-6081 | g g d r a s i l United States of America
> "Free Software For The Rest Of Us."
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
Andre Hedrick
LAD Storage Consulting Group
In message <[email protected]> you
write:
>
> I'll bite .... what the flip is [unsafe] ??
Use of obsolete (racy) module reference count interface. ie. someone's
upping the reference count with __MOD_INC_USE_COUNT() or
MOD_INC_USE_COUNT(), not "try_module_get()", so we can't tell them
that the module is going away so they couldn't get a reference.
Going through and fixing these up is generally fairly easy.
Hope that clarifies,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
In message <[email protected]> you write:
> Rusty Russell wrote:
> >In message <[email protected]> you write:
> >> Here is a way to replace all of the specialized "stop CPU"
> >> locking code in kernel/module.c with an rw_semaphore by using
> >> down_read_trylock in try_module_get() and down_write when beginning to
> >> unload the module.
>
> >And now you can't modularize netfilter modules.
>
> Why not? Last time you went looking in the networking code
> for an example of something that had to increment a module reference
> in a context where blocking was not allowed you ended up conceding
> that you example was incorrect.
No, you're thinking of the IPv4 stack. I didn't use netfilter as an
example, because that opens me to "well, FIX NETFILTER then!". If it
were the only case, it's probably arguable.
The problems with netfilter modules are exactly why I started looking
at module locking over two years ago.
> I just booted my gateway machine to a kernel using my
> aforemetioned patch and various netfilter modules. I've surfed the
> web, FTP'ed file and run irc through it. It seems to be okay.
Sure! That's because the netfilter modules use a horrific hack, by
keeping their own "usage" counts and then spinning (potentially
forever) on unload until it hits zero.
Logically the skb->nfct would have an owner field in it.
Now, performance. You want a brlock, at least: the performance of
either the security infrastructure or netfilter modules is going to
suck horribly with anything else. And the bogolock used in module.c
is even lighter weight than a brlock, with its associated atomic ops.
Hope that clarifies...
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
I wrote:
>On Thu, 09 Jan 2003, Max Krasnyansky wrote:
>>We have to be able to call try_module_get() from interrupt context.
> Where? Why? Please show me one or more examples.
Come to think of it, I don't think you even have to answer
that question. You should be able to use my try_module_get() from
interrupt context. It never blocks.
Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."
Rusty Russell wrote:
>In message <[email protected]> you write:
>> Rusty Russell wrote:
>> >In message <[email protected]> you write:
>> >> Here is a way to replace all of the specialized "stop CPU"
>> >> locking code in kernel/module.c with an rw_semaphore by using
>> >> down_read_trylock in try_module_get() and down_write when beginning to
>> >> unload the module.
>>
>> >And now you can't modularize netfilter modules.
>>
>> Why not? Last time you went looking in the networking code
>> for an example of something that had to increment a module reference
>> in a context where blocking was not allowed you ended up conceding
>> that you example was incorrect.
>No, you're thinking of the IPv4 stack. I didn't use netfilter as an
>example, because that opens me to "well, FIX NETFILTER then!". If it
>were the only case, it's probably arguable.
>The problems with netfilter modules are exactly why I started looking
>at module locking over two years ago.
>> I just booted my gateway machine to a kernel using my
>> aforemetioned patch and various netfilter modules. I've surfed the
>> web, FTP'ed file and run irc through it. It seems to be okay.
>Sure! That's because the netfilter modules use a horrific hack, by
>keeping their own "usage" counts and then spinning (potentially
>forever) on unload until it hits zero.
[...]
Although I suspect that this could be fixed so that the
spinning is guanteed not to be forever, it happens that my
module_put(), which is the same as your module_put() is non-blocking
(as is my try_module_get(), by the way). So I don't see what the
problem is. You should still be able to call try_module_get and
module_put from an interrupt context.
Adam J. Richter __ ______________ 575 Oroville Road
[email protected] \ / Milpitas, California 95035
+1 408 309-6081 | g g d r a s i l United States of America
"Free Software For The Rest Of Us."
In message <[email protected]> you write:
> I wrote:
> >On Thu, 09 Jan 2003, Max Krasnyansky wrote:
> >>We have to be able to call try_module_get() from interrupt context.
>
> > Where? Why? Please show me one or more examples.
>
> Come to think of it, I don't think you even have to answer
> that question. You should be able to use my try_module_get() from
> interrupt context. It never blocks.
Yes, your try_module_get just gets spurious failures on SMP, as well
as thrashing the cacheline (why bother with one counter per CPU
then?).
I guess I just don't understand your solution?
Sorry,
Rusty.
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
On Fri, 10 Jan 2003, Andre Hedrick wrote:
>
> I'll bite .... what the flip is [unsafe] ??
Actually, I believe it means the modules doesn't use try_module_get
instead of the old I_cant_remember_the_name macro.
--
bill davidsen <[email protected]>
CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.