Right now there is a hole in the module ref counting system because
there is no proper ref counting for sysctl tables used by modules.
This means that if an application is holding /proc/sys/foo open and
module that created it is unloaded, then the application touches the
file the kernel will oops.
This patch fixes that by maintaining source compatibility via macro.
I am sure someone already thought of this, it just doesn't appear to
have made it in yet.
Signed-off-by: Stephen Hemminger <[email protected]>
--- net-2.6.16.orig/include/linux/sysctl.h
+++ net-2.6.16/include/linux/sysctl.h
@@ -21,6 +21,7 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/compiler.h>
+#include <linux/module.h>
struct file;
struct completion;
@@ -973,9 +974,13 @@ struct ctl_table_header
struct completion *unregistering;
};
-struct ctl_table_header * register_sysctl_table(ctl_table * table,
- int insert_at_head);
-void unregister_sysctl_table(struct ctl_table_header * table);
+extern struct ctl_table_header *__register_sysctl_table(ctl_table * table,
+ struct module *owner,
+ int insert_at_head);
+#define register_sysctl_table(table, insert_at_head) \
+ __register_sysctl_table(table, THIS_MODULE, insert_at_head)
+
+extern void unregister_sysctl_table(struct ctl_table_header * table);
#else /* __KERNEL__ */
--- net-2.6.16.orig/kernel/sysctl.c
+++ net-2.6.16/kernel/sysctl.c
@@ -169,7 +169,8 @@ struct file_operations proc_sys_file_ope
extern struct proc_dir_entry *proc_sys_root;
-static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
+static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *,
+ struct module *);
static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
#endif
@@ -1036,7 +1037,7 @@ static void start_unregistering(struct c
void __init sysctl_init(void)
{
#ifdef CONFIG_PROC_FS
- register_proc_table(root_table, proc_sys_root, &root_table_header);
+ register_proc_table(root_table, proc_sys_root, &root_table_header, NULL);
init_irq_proc();
#endif
}
@@ -1279,8 +1280,9 @@ int do_sysctl_strategy (ctl_table *table
* This routine returns %NULL on a failure to register, and a pointer
* to the table header on success.
*/
-struct ctl_table_header *register_sysctl_table(ctl_table * table,
- int insert_at_head)
+struct ctl_table_header *__register_sysctl_table(ctl_table * table,
+ struct module *owner,
+ int insert_at_head)
{
struct ctl_table_header *tmp;
tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
@@ -1297,7 +1299,7 @@ struct ctl_table_header *register_sysctl
list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
spin_unlock(&sysctl_lock);
#ifdef CONFIG_PROC_FS
- register_proc_table(table, proc_sys_root, tmp);
+ register_proc_table(table, proc_sys_root, tmp, owner);
#endif
return tmp;
}
@@ -1328,7 +1330,8 @@ void unregister_sysctl_table(struct ctl_
#ifdef CONFIG_PROC_FS
/* Scan the sysctl entries in table and add them all into /proc */
-static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
+static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
+ void *set, struct module *owner)
{
struct proc_dir_entry *de;
int len;
@@ -1366,12 +1369,13 @@ static void register_proc_table(ctl_tabl
continue;
de->set = set;
de->data = (void *) table;
+ de->owner = owner;
if (table->proc_handler)
de->proc_fops = &proc_sys_file_operations;
}
table->de = de;
if (de->mode & S_IFDIR)
- register_proc_table(table->child, de, set);
+ register_proc_table(table->child, de, set, owner);
}
}
@@ -2414,8 +2418,9 @@ int proc_doulongvec_ms_jiffies_minmax(ct
return -ENOSYS;
}
-struct ctl_table_header * register_sysctl_table(ctl_table * table,
- int insert_at_head)
+struct ctl_table_header * __register_sysctl_table(ctl_table * table,
+ struct module *owner,
+ int insert_at_head)
{
return NULL;
}
@@ -2438,7 +2443,7 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
-EXPORT_SYMBOL(register_sysctl_table);
+EXPORT_SYMBOL(__register_sysctl_table);
EXPORT_SYMBOL(sysctl_intvec);
EXPORT_SYMBOL(sysctl_jiffies);
EXPORT_SYMBOL(sysctl_ms_jiffies);
On Wed, 2005-12-21 at 10:35 -0800, Stephen Hemminger wrote:
>
> This patch fixes that by maintaining source compatibility via macro.
> I am sure someone already thought of this, it just doesn't appear to
> have made it in yet.
isn't it more consistent to give the sysctl table itself an .owner
field?
On Wed, 21 Dec 2005 19:42:02 +0100
Arjan van de Ven <[email protected]> wrote:
> On Wed, 2005-12-21 at 10:35 -0800, Stephen Hemminger wrote:
> >
> > This patch fixes that by maintaining source compatibility via macro.
> > I am sure someone already thought of this, it just doesn't appear to
> > have made it in yet.
>
> isn't it more consistent to give the sysctl table itself an .owner
> field?
yes, but that means changing more files.
--
Stephen Hemminger <[email protected]>
OSDL http://developer.osdl.org/~shemminger
On Wed, Dec 21, 2005 at 10:35:19AM -0800, Stephen Hemminger wrote:
> Right now there is a hole in the module ref counting system because
> there is no proper ref counting for sysctl tables used by modules.
> This means that if an application is holding /proc/sys/foo open and
> module that created it is unloaded, then the application touches the
> file the kernel will oops.
>
> This patch fixes that by maintaining source compatibility via macro.
> I am sure someone already thought of this, it just doesn't appear to
> have made it in yet.
NAK.
a) holding the file open will *NOT* pin any module structures down.
IO in progress will, but it unregistering sysctl table will block until it's
over. The same goes for sysctl(2) in progress. See use_table() and
friends in kernel/sysctl.c
b) you are not protecting any code in module; what needs protection
(and gets it) is a pile of data structures. With lifetimes that don't have
to be related to module lifetimes. IOW, use of reference to module is 100%
wrong here - it wouldn't fix anything.
As a general rule, when you pin something down, think what you are trying
to protect; if it's not just a bunch of function references - module is
the wrong thing to hold.
In particular, sysctl tables are dynamically created and removed in a
kernel that is not modular at all. Which kills any hope to get a solution
based on preventing rmmod.
Solution is fairly simple:
* put use counter into sysctl table head (i.e. object allocated by
kernel/sysctl.c)
* bump use counter when examining table in sysctl(2) and around the
actual IO in procfs access; put reference to table into proc_dir_entry to
be able to do the latter. Decrement when done with the table; if it had
hit zero _and_ there's unregistration waiting for completion - kick it.
* have unregistration kill all reference to table head and if use
counter is positive - wait for completion. Once we get it, we know that
we can safely proceed.
On Wed, Dec 21, 2005 at 07:08:49PM +0000, Al Viro wrote:
> Solution is fairly simple:
Just to clarify: said solution is already in the tree...
On Wed, 21 Dec 2005 19:08:49 +0000
Al Viro <[email protected]> wrote:
> On Wed, Dec 21, 2005 at 10:35:19AM -0800, Stephen Hemminger wrote:
> > Right now there is a hole in the module ref counting system because
> > there is no proper ref counting for sysctl tables used by modules.
> > This means that if an application is holding /proc/sys/foo open and
> > module that created it is unloaded, then the application touches the
> > file the kernel will oops.
> >
> > This patch fixes that by maintaining source compatibility via macro.
> > I am sure someone already thought of this, it just doesn't appear to
> > have made it in yet.
>
> NAK.
> a) holding the file open will *NOT* pin any module structures down.
> IO in progress will, but it unregistering sysctl table will block until it's
> over. The same goes for sysctl(2) in progress. See use_table() and
> friends in kernel/sysctl.c
> b) you are not protecting any code in module; what needs protection
> (and gets it) is a pile of data structures. With lifetimes that don't have
> to be related to module lifetimes. IOW, use of reference to module is 100%
> wrong here - it wouldn't fix anything.
>
> As a general rule, when you pin something down, think what you are trying
> to protect; if it's not just a bunch of function references - module is
> the wrong thing to hold.
>
> In particular, sysctl tables are dynamically created and removed in a
> kernel that is not modular at all. Which kills any hope to get a solution
> based on preventing rmmod.
>
> Solution is fairly simple:
> * put use counter into sysctl table head (i.e. object allocated by
> kernel/sysctl.c)
> * bump use counter when examining table in sysctl(2) and around the
> actual IO in procfs access; put reference to table into proc_dir_entry to
> be able to do the latter. Decrement when done with the table; if it had
> hit zero _and_ there's unregistration waiting for completion - kick it.
> * have unregistration kill all reference to table head and if use
> counter is positive - wait for completion. Once we get it, we know that
> we can safely proceed.
>
Yeah, that is better.
--
Stephen Hemminger <[email protected]>
OSDL http://developer.osdl.org/~shemminger