2011-02-04 04:37:41

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: [PATCH 0/5] net: sysctl: share ipv4/ipv6 sysctl tables


Each network device gets the same 25/24 sysctl entries for ipv4/ipv6
in /proc/sys/net/ipv4/conf/DEVNAME and /proc/sys/net/ipv6/conf/DEVNAME

Unfortunately, space is wasted holding very much similar data.
Fortunately, with some tricks these entries can be shared between all
network devices.


The single entry in 'struct ctl_table' that was modified at runtime
for leaf ctl_table nodes and prevented sharing was 'parent'. This
field was first introduces for selinux and then was used to implement
sysctl_check_table. Selinux recently removed the need for this field:
* http://thread.gmane.org/gmane.linux.kernel.lsm/12623
* LKML-Reference: [email protected]


Remove the need for 'parent' in sysctl_check_table and remove the
'parent' field:

[PATCH 1/5] sysctl: faster reimplementation of sysctl_check_table
[PATCH 2/5] sysctl: remove useless ctl_table->parent field


Pave the way for sharing of ipv4/6 tables: allow data to be stored in
the nodes above the leafs that will be shared:

[PATCH 3/5] sysctl: write ctl_table->extra2 to entries created from ctl_path


Finally share the leaf sysctl tables for ipv4/ipv6:

[PATCH 4/5] ipv4: share sysctl net/ipv4/conf/DEVNAME/ tables
[PATCH 5/5] ipv6: share sysctl net/ipv6/conf/DEVNAME/ tables


fs/proc/proc_sysctl.c | 16 +++-
include/linux/inetdevice.h | 12 +++-
include/linux/ipv6.h | 15 +++-
include/linux/sysctl.h | 3 +-
include/net/net_namespace.h | 2 +
kernel/sysctl.c | 18 +---
kernel/sysctl_check.c | 125 +++++++++++++--------------
net/ipv4/devinet.c | 203 ++++++++++++++++++++++++++++--------------
net/ipv6/addrconf.c | 192 +++++++++++++++++++++++++++-------------
net/sysctl_net.c | 20 +++--
10 files changed, 387 insertions(+), 219 deletions(-)

--
1.7.4.rc1.7.g2cf08.dirty


2011-02-04 04:37:54

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: [PATCH 1/5] sysctl: faster reimplementation of sysctl_check_table

Determining the parent of a node at depth d
- previous implementation: O(d)
- current implementation: O(1)

Printing the path to a node at depth d
- previous implementation: O(d^2)
- current implementation: O(d)

This comes to a cost: we use an array ('parents') holding as many
pointers as there can be sysctl levels (currently CTL_MAXNAME=10).

The 'parents' array of pointers holds the same values as the
ctl_table->parents field because the function that updates ->parents
(sysctl_set_parent) is called with either NULL (for root nodes) or
with sysctl_set_parent(table, table->child).

Signed-off-by: Lucian Adrian Grijincu <[email protected]>
---
kernel/sysctl_check.c | 121 ++++++++++++++++++++++++-------------------------
1 files changed, 60 insertions(+), 61 deletions(-)

diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8..9b4fecd 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -6,58 +6,34 @@
#include <net/ip_vs.h>


-static int sysctl_depth(struct ctl_table *table)
-{
- struct ctl_table *tmp;
- int depth;
-
- depth = 0;
- for (tmp = table; tmp->parent; tmp = tmp->parent)
- depth++;
-
- return depth;
-}
-
-static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
+static void sysctl_print_path(struct ctl_table *table,
+ struct ctl_table **parents, int depth)
{
+ struct ctl_table *p;
int i;
-
- for (i = 0; table && i < n; i++)
- table = table->parent;
-
- return table;
-}
-
-
-static void sysctl_print_path(struct ctl_table *table)
-{
- struct ctl_table *tmp;
- int depth, i;
- depth = sysctl_depth(table);
if (table->procname) {
- for (i = depth; i >= 0; i--) {
- tmp = sysctl_parent(table, i);
- printk("/%s", tmp->procname?tmp->procname:"");
+ for (i = 0; i < depth; i++) {
+ p = parents[i];
+ printk("/%s", p->procname ? p->procname : "");
}
+ printk("/%s", table->procname);
}
printk(" ");
}

static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
- struct ctl_table *table)
+ struct ctl_table *table, struct ctl_table **parents, int depth)
{
struct ctl_table_header *head;
struct ctl_table *ref, *test;
- int depth, cur_depth;
-
- depth = sysctl_depth(table);
+ int cur_depth;

for (head = __sysctl_head_next(namespaces, NULL); head;
head = __sysctl_head_next(namespaces, head)) {
cur_depth = depth;
ref = head->ctl_table;
repeat:
- test = sysctl_parent(table, cur_depth);
+ test = parents[depth - cur_depth];
for (; ref->procname; ref++) {
int match = 0;
if (cur_depth && !ref->child)
@@ -83,11 +59,12 @@ out:
return ref;
}

-static void set_fail(const char **fail, struct ctl_table *table, const char *str)
+static void set_fail(const char **fail, struct ctl_table *table,
+ const char *str, struct ctl_table **parents, int depth)
{
if (*fail) {
printk(KERN_ERR "sysctl table check failed: ");
- sysctl_print_path(table);
+ sysctl_print_path(table, parents, depth);
printk(" %s\n", *fail);
dump_stack();
}
@@ -95,16 +72,24 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
}

static void sysctl_check_leaf(struct nsproxy *namespaces,
- struct ctl_table *table, const char **fail)
+ struct ctl_table *table, const char **fail,
+ struct ctl_table **parents, int depth)
{
struct ctl_table *ref;

- ref = sysctl_check_lookup(namespaces, table);
- if (ref && (ref != table))
- set_fail(fail, table, "Sysctl already exists");
+ ref = sysctl_check_lookup(namespaces, table, parents, depth);
+ if (ref && (ref != table)) {
+ printk(KERN_ALERT "sysctl_check_leaf ref[%s], table[%s]\n", ref->procname, table->procname);
+ set_fail(fail, table, "Sysctl already exists", parents, depth);
+ }
}

-int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
+
+
+#define SET_FAIL(str) set_fail(&fail, table, str, parents, depth)
+
+static int __sysctl_check_table(struct nsproxy *namespaces,
+ struct ctl_table *table, struct ctl_table **parents, int depth)
{
int error = 0;
for (; table->procname; table++) {
@@ -112,23 +97,23 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)

if (table->parent) {
if (table->procname && !table->parent->procname)
- set_fail(&fail, table, "Parent without procname");
+ SET_FAIL("Parent without procname");
}
if (!table->procname)
- set_fail(&fail, table, "No procname");
+ SET_FAIL("No procname");
if (table->child) {
if (table->data)
- set_fail(&fail, table, "Directory with data?");
+ SET_FAIL("Directory with data?");
if (table->maxlen)
- set_fail(&fail, table, "Directory with maxlen?");
+ SET_FAIL("Directory with maxlen?");
if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
- set_fail(&fail, table, "Writable sysctl directory");
+ SET_FAIL("Writable sysctl directory");
if (table->proc_handler)
- set_fail(&fail, table, "Directory with proc_handler");
+ SET_FAIL("Directory with proc_handler");
if (table->extra1)
- set_fail(&fail, table, "Directory with extra1");
+ SET_FAIL("Directory with extra1");
if (table->extra2)
- set_fail(&fail, table, "Directory with extra2");
+ SET_FAIL("Directory with extra2");
} else {
if ((table->proc_handler == proc_dostring) ||
(table->proc_handler == proc_dointvec) ||
@@ -139,28 +124,42 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
(table->proc_handler == proc_doulongvec_minmax) ||
(table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
if (!table->data)
- set_fail(&fail, table, "No data");
+ SET_FAIL("No data");
if (!table->maxlen)
- set_fail(&fail, table, "No maxlen");
+ SET_FAIL("No maxlen");
}
#ifdef CONFIG_PROC_SYSCTL
if (table->procname && !table->proc_handler)
- set_fail(&fail, table, "No proc_handler");
-#endif
-#if 0
- if (!table->procname && table->proc_handler)
- set_fail(&fail, table, "proc_handler without procname");
+ SET_FAIL("No proc_handler");
#endif
- sysctl_check_leaf(namespaces, table, &fail);
+ parents[depth] = table;
+ sysctl_check_leaf(namespaces, table, &fail,
+ parents, depth);
}
if (table->mode > 0777)
- set_fail(&fail, table, "bogus .mode");
+ SET_FAIL("bogus .mode");
if (fail) {
- set_fail(&fail, table, NULL);
+ SET_FAIL(NULL);
error = -EINVAL;
}
- if (table->child)
- error |= sysctl_check_table(namespaces, table->child);
+ if (table->child) {
+ parents[depth] = table;
+ error |= __sysctl_check_table(namespaces, table->child,
+ parents, depth + 1);
+ }
}
return error;
}
+
+
+int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
+{
+ struct ctl_table *parents[CTL_MAXNAME];
+ /* Keep track of parents as we go down into the tree.
+ *
+ * parents[i-1] will be the parent for parents[i].
+ * The node at depth 'd' will have the parent at parents[d-1].
+ * The root node (depth=0) has no parent in this array.
+ */
+ return __sysctl_check_table(namespaces, table, parents, 0);
+}
--
1.7.4.rc1.7.g2cf08.dirty

2011-02-04 04:38:07

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: [PATCH 2/5] sysctl: remove useless ctl_table->parent field

The 'parent' field was added for selinux in:
commit d912b0cc1a617d7c590d57b7ea971d50c7f02503
[PATCH] sysctl: add a parent entry to ctl_table and set the parent entry

and then was used for sysctl_check_table.

Both of the users have found other implementations.

CC: Eric W. Biederman <[email protected]>
Signed-off-by: Lucian Adrian Grijincu <[email protected]>
---
include/linux/sysctl.h | 1 -
kernel/sysctl.c | 11 -----------
kernel/sysctl_check.c | 4 ++--
3 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7bb5cb6..1f1da4b 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1018,7 +1018,6 @@ struct ctl_table
int maxlen;
mode_t mode;
struct ctl_table *child;
- struct ctl_table *parent; /* Automatically set */
proc_handler *proc_handler; /* Callback for text formatting */
void *extra1;
void *extra2;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 56f6fc1..42025ec 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1695,18 +1695,8 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
return test_perm(mode, op);
}

-static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
-{
- for (; table->procname; table++) {
- table->parent = parent;
- if (table->child)
- sysctl_set_parent(table, table->child);
- }
-}
-
static __init int sysctl_init(void)
{
- sysctl_set_parent(NULL, root_table);
#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
sysctl_check_table(current->nsproxy, root_table);
#endif
@@ -1864,7 +1854,6 @@ struct ctl_table_header *__register_sysctl_paths(
header->used = 0;
header->unregistering = NULL;
header->root = root;
- sysctl_set_parent(NULL, header->ctl_table);
header->count = 1;
#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
if (sysctl_check_table(namespaces, header->ctl_table)) {
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 9b4fecd..b7d9c66 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -95,8 +95,8 @@ static int __sysctl_check_table(struct nsproxy *namespaces,
for (; table->procname; table++) {
const char *fail = NULL;

- if (table->parent) {
- if (table->procname && !table->parent->procname)
+ if (depth != 0) { /* has parent */
+ if (table->procname && !parents[depth - 1]->procname)
SET_FAIL("Parent without procname");
}
if (!table->procname)
--
1.7.4.rc1.7.g2cf08.dirty

2011-02-04 04:38:12

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: [PATCH 3/5] sysctl: write ctl_table->extra2 to entries created from ctl_path

For each entry in an array of 'struct ctl_path' we were registering a
'struct ctl_table' array with two entries:
- one to store the name + permissions,
- one as an end-of-array marker (completely blank).

We were not using any of the data storage fields
(data, extra1, extra2) in the first 'struct ctl_table'.

This patch adds possibility of storring some user provided
pointer in the 'extra2' field.

All users the next functions store NULL in the 'extra2'
field like they used to before this patch:
* register_sysctl_paths
* register_net_sysctl_table
* register_net_sysctl_rotable

Until now sysctl_check_table considered that the 'struct ctl_table' of
directories may not store anything in the 'extra2' field. We no longer
consider this a fault.

Signed-off-by: Lucian Adrian Grijincu <[email protected]>
---
include/linux/sysctl.h | 2 +-
include/net/net_namespace.h | 2 ++
kernel/sysctl.c | 7 +++++--
kernel/sysctl_check.c | 2 --
net/sysctl_net.c | 20 ++++++++++++++------
5 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 1f1da4b..090b9a3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1057,7 +1057,7 @@ struct ctl_path {
void register_sysctl_root(struct ctl_table_root *root);
struct ctl_table_header *__register_sysctl_paths(
struct ctl_table_root *root, struct nsproxy *namespaces,
- const struct ctl_path *path, struct ctl_table *table);
+ const struct ctl_path *path, struct ctl_table *table, void *pathdata);
struct ctl_table_header *register_sysctl_table(struct ctl_table * table);
struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
struct ctl_table *table);
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 1bf812b..42d4d61 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -272,6 +272,8 @@ struct ctl_table_header;

extern struct ctl_table_header *register_net_sysctl_table(struct net *net,
const struct ctl_path *path, struct ctl_table *table);
+struct ctl_table_header *register_net_sysctl_table_pathdata(struct net *net,
+ const struct ctl_path *path, struct ctl_table *table, void *pathdata);
extern struct ctl_table_header *register_net_sysctl_rotable(
const struct ctl_path *path, struct ctl_table *table);
extern void unregister_net_sysctl_table(struct ctl_table_header *header);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 42025ec..9b67c9e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1759,6 +1759,8 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
* @namespaces: Data to compute which lists of sysctl entries are visible
* @path: The path to the directory the sysctl table is in.
* @table: the top-level table structure
+ * @pathdata: user provided pointer to data that will be stored in ->extra2
+ * for every ctl_table node allocated for entries in @path
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
@@ -1809,7 +1811,7 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
struct ctl_table_header *__register_sysctl_paths(
struct ctl_table_root *root,
struct nsproxy *namespaces,
- const struct ctl_path *path, struct ctl_table *table)
+ const struct ctl_path *path, struct ctl_table *table, void *pathdata)
{
struct ctl_table_header *header;
struct ctl_table *new, **prevp;
@@ -1841,6 +1843,7 @@ struct ctl_table_header *__register_sysctl_paths(
/* Copy the procname */
new->procname = path->procname;
new->mode = 0555;
+ new->extra2 = pathdata;

*prevp = new;
prevp = &new->child;
@@ -1895,7 +1898,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
struct ctl_table *table)
{
return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
- path, table);
+ path, table, NULL);
}

/**
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b7d9c66..e09f47f 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -112,8 +112,6 @@ static int __sysctl_check_table(struct nsproxy *namespaces,
SET_FAIL("Directory with proc_handler");
if (table->extra1)
SET_FAIL("Directory with extra1");
- if (table->extra2)
- SET_FAIL("Directory with extra2");
} else {
if ((table->proc_handler == proc_dostring) ||
(table->proc_handler == proc_dointvec) ||
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index ca84212..9c92cac 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -103,22 +103,30 @@ out:
}
subsys_initcall(sysctl_init);

-struct ctl_table_header *register_net_sysctl_table(struct net *net,
- const struct ctl_path *path, struct ctl_table *table)
+struct ctl_table_header *register_net_sysctl_table_pathdata(struct net *net,
+ const struct ctl_path *path, struct ctl_table *table, void *pathdata)
{
struct nsproxy namespaces;
namespaces = *current->nsproxy;
namespaces.net_ns = net;
- return __register_sysctl_paths(&net_sysctl_root,
- &namespaces, path, table);
+ return __register_sysctl_paths(&net_sysctl_root, &namespaces,
+ path, table, pathdata);
+}
+EXPORT_SYMBOL_GPL(register_net_sysctl_table_pathdata);
+
+struct ctl_table_header *register_net_sysctl_table(struct net *net,
+ const struct ctl_path *path, struct ctl_table *table)
+{
+ return register_net_sysctl_table_pathdata(net, path, table, NULL);
}
EXPORT_SYMBOL_GPL(register_net_sysctl_table);

+
struct ctl_table_header *register_net_sysctl_rotable(const
struct ctl_path *path, struct ctl_table *table)
{
- return __register_sysctl_paths(&net_sysctl_ro_root,
- &init_nsproxy, path, table);
+ return __register_sysctl_paths(&net_sysctl_ro_root, &init_nsproxy,
+ path, table, NULL);
}
EXPORT_SYMBOL_GPL(register_net_sysctl_rotable);

--
1.7.4.rc1.7.g2cf08.dirty

2011-02-04 04:38:16

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: [PATCH 4/5] ipv4: share sysctl net/ipv4/conf/DEVNAME/ tables

Before this, for each network device DEVNAME that supports ipv4 a new
sysctl table was registered in $PROC/sys/net/ipv4/conf/DEVNAME/.

The sysctl table was identical for all network devices, except for:
* data: pointer to the data to be accessed in the sysctl
* extra1: the 'struct ipv4_devconf*' of the network device
* extra2: the 'struct net*' of the network namespace

Assuming we have a device name and a 'struct net*', we can get the
'struct net_device*'. From there we can compute:
* data: each entry corresponds to a position in 'struct ipv4_devconf*'
* extra1: 'struct ipv4_devconf*' can be reached from 'struct net_device*'
* extra2: the 'struct net*' that we assumed to have

The device name is determined from the path to the file (the name of
the parent dentry).

The 'struct net*' is stored in the parent 'struct ctl_table*' path by
register_net_sysctl_table_pathdata().

Signed-off-by: Lucian Adrian Grijincu <[email protected]>
---
fs/proc/proc_sysctl.c | 16 +++-
include/linux/inetdevice.h | 12 +++-
net/ipv4/devinet.c | 203 +++++++++++++++++++++++++++++---------------
3 files changed, 161 insertions(+), 70 deletions(-)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fb707e0..fe392f1 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -128,6 +128,11 @@ out:
return err;
}

+
+typedef int proc_handler_extended(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos,
+ struct file *filp);
+
static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
size_t count, loff_t *ppos, int write)
{
@@ -136,6 +141,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
ssize_t error;
size_t res;
+ proc_handler_extended *phx = (proc_handler_extended *) table->proc_handler;

if (IS_ERR(head))
return PTR_ERR(head);
@@ -155,7 +161,15 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,

/* careful: calling conventions are nasty here */
res = count;
- error = table->proc_handler(table, write, buf, &res, ppos);
+ /* Most handlers only use the first 5 arguments (without @filp).
+ * Changing all is too much of work, as, at the time of writting only
+ * the devinet.c proc_handlers know about and use the @filp.
+ *
+ * This is just a HACK for now, I did this this way to not
+ * waste time changing all the handlers, in the final version
+ * I'll change all the handlers if there's not other solution.
+ */
+ error = phx(table, write, buf, &res, ppos, filp);
if (!error)
error = res;
out:
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index ae8fdc5..caf06b3 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -43,8 +43,18 @@ enum

#define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1)

+
+struct devinet_sysctl {
+ /* dev_name holds a copy of dev_name, because '.procname' is
+ * regarded as const by sysctl and we wouldn't want anyone to
+ * change it under our feet (see SIOCSIFNAME). */
+ char *dev_name;
+ struct ctl_table_header *sysctl_header;
+};
+
+
struct ipv4_devconf {
- void *sysctl;
+ struct devinet_sysctl devinet_sysctl;
int data[IPV4_DEVCONF_MAX];
DECLARE_BITMAP(state, IPV4_DEVCONF_MAX);
};
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 748cb5b..774d347 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -147,7 +147,7 @@ void in_dev_finish_destroy(struct in_device *idev)
}
EXPORT_SYMBOL(in_dev_finish_destroy);

-static struct in_device *inetdev_init(struct net_device *dev)
+struct in_device *inetdev_init(struct net_device *dev)
{
struct in_device *in_dev;

@@ -158,7 +158,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
goto out;
memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
sizeof(in_dev->cnf));
- in_dev->cnf.sysctl = NULL;
+ in_dev->cnf.devinet_sysctl.dev_name = NULL;
+ in_dev->cnf.devinet_sysctl.sysctl_header = NULL;
in_dev->dev = dev;
in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
if (!in_dev->arp_parms)
@@ -1375,6 +1376,67 @@ static void inet_forward_change(struct net *net)
}
}

+
+
+static int devinet_conf_handler(ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ struct file *filp,
+ proc_handler *proc_handler)
+{
+ /* The path to this file is of the form:
+ * $PROC_MOUNT/sys/net/ipv4/conf/$DEVNAME/$CTL
+ *
+ * The array of 'struct ctl_table' of devinet entries is
+ * shared between all ipv4 network devices and the 'data'
+ * field of each structure only hold the offset into the
+ * 'data' field of 'struct ipv4_devconf'.
+ *
+ * To find the propper location of the data that must be
+ * accessed by this handler we need the device name and the
+ * network namespace in which it belongs.
+ */
+
+ /* We store the network namespace in the parent table's ->extra2 */
+ struct inode *parent_inode = filp->f_path.dentry->d_parent->d_inode;
+ struct ctl_table *parent_table = PROC_I(parent_inode)->sysctl_entry;
+ struct net *net = parent_table->extra2;
+
+ const char *dev_name = filp->f_path.dentry->d_parent->d_name.name;
+ struct ctl_table tmp_ctl;
+ struct net_device *dev = NULL;
+ struct in_device *in_dev = NULL;
+ struct ipv4_devconf *cnf;
+ int ret;
+
+ if (strcmp(dev_name, "all") == 0) {
+ cnf = net->ipv4.devconf_all;
+ } else if (strcmp(dev_name, "default") == 0) {
+ cnf = net->ipv4.devconf_dflt;
+ } else {
+ /* the device could have been renamed (SIOCSIFADDR) or
+ * deleted since we started accessing it's proc sysctl */
+ dev = dev_get_by_name(net, dev_name);
+ if (dev == NULL)
+ return -ENOENT;
+ in_dev = in_dev_get(dev);
+ cnf = &in_dev->cnf;
+ }
+
+ tmp_ctl = *ctl;
+ tmp_ctl.data += (char *)cnf - (char *)&ipv4_devconf;
+ tmp_ctl.extra1 = cnf;
+ tmp_ctl.extra2 = net;
+
+ ret = proc_handler(&tmp_ctl, write, buffer, lenp, ppos);
+
+ if (in_dev)
+ in_dev_put(in_dev);
+ if (dev)
+ dev_put(dev);
+ return ret;
+}
+
static int devinet_conf_proc(ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
@@ -1445,6 +1507,33 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
return ret;
}

+static int devinet_conf_proc__(ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ struct file *filp)
+{
+ return devinet_conf_handler(ctl, write, buffer, lenp, ppos, filp,
+ devinet_conf_proc);
+}
+
+static int devinet_sysctl_forward__(ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ struct file *filp)
+{
+ return devinet_conf_handler(ctl, write, buffer, lenp, ppos, filp,
+ devinet_sysctl_forward);
+}
+
+static int ipv4_doint_and_flush__(ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ struct file *filp)
+{
+ return devinet_conf_handler(ctl, write, buffer, lenp, ppos, filp,
+ ipv4_doint_and_flush);
+}
+
#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
{ \
.procname = name, \
@@ -1452,67 +1541,60 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
IPV4_DEVCONF_ ## attr - 1, \
.maxlen = sizeof(int), \
.mode = mval, \
- .proc_handler = proc, \
- .extra1 = &ipv4_devconf, \
+ .proc_handler = (proc_handler *) proc, \
}

#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
- DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc__)

#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
- DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
+ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc__)

#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)

#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
- DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
-
-static struct devinet_sysctl_table {
- struct ctl_table_header *sysctl_header;
- struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
- char *dev_name;
-} devinet_sysctl = {
- .devinet_vars = {
- DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
- devinet_sysctl_forward),
- DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
-
- DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
- DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
- DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
- DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
- DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
- DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
- "accept_source_route"),
- DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
- DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
- DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
- DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
- DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
- DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
- DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
- DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
- DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
- DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
- DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
- DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
- DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
-
- DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
- DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
- DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
- "force_igmp_version"),
- DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
- "promote_secondaries"),
- },
+ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush__)
+
+const struct ctl_table ipv4_devinet_sysctl_table[__IPV4_DEVCONF_MAX] = {
+ DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
+ devinet_sysctl_forward__),
+ DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
+ DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
+ "accept_source_route"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
+ DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
+ DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
+ DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
+ DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
+ DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
+ DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
+ "force_igmp_version"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
+ "promote_secondaries"),
+ { }
};

static int __devinet_sysctl_register(struct net *net, char *dev_name,
- struct ipv4_devconf *p)
+ struct ipv4_devconf *cnf)
{
- int i;
- struct devinet_sysctl_table *t;
+ struct devinet_sysctl *t = &cnf->devinet_sysctl;

#define DEVINET_CTL_PATH_DEV 3

@@ -1524,16 +1606,6 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
{ },
};

- t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
- if (!t)
- goto out;
-
- for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
- t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
- t->devinet_vars[i].extra1 = p;
- t->devinet_vars[i].extra2 = net;
- }
-
/*
* Make a copy of dev_name, because '.procname' is regarded as const
* by sysctl and we wouldn't want anyone to change it under our feet
@@ -1541,37 +1613,32 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
*/
t->dev_name = kstrdup(dev_name, GFP_KERNEL);
if (!t->dev_name)
- goto free;
+ goto out;

devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;

- t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
- t->devinet_vars);
+ t->sysctl_header = register_net_sysctl_table_pathdata(net,
+ devinet_ctl_path, ipv4_devinet_sysctl_table, net);
if (!t->sysctl_header)
goto free_procname;

- p->sysctl = t;
return 0;

free_procname:
kfree(t->dev_name);
-free:
- kfree(t);
out:
return -ENOBUFS;
}

static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
{
- struct devinet_sysctl_table *t = cnf->sysctl;
+ struct devinet_sysctl *t = &cnf->devinet_sysctl;

if (t == NULL)
return;

- cnf->sysctl = NULL;
unregister_sysctl_table(t->sysctl_header);
kfree(t->dev_name);
- kfree(t);
}

static void devinet_sysctl_register(struct in_device *idev)
--
1.7.4.rc1.7.g2cf08.dirty

2011-02-04 04:38:21

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: [PATCH 5/5] ipv6: share sysctl net/ipv6/conf/DEVNAME/ tables

Similar to the ipv4 patch:

Before this, for each network device DEVNAME that supports ipv4 a new
sysctl table was registered in $PROC/sys/net/ipv6/conf/DEVNAME/.

The sysctl table was identical for all network devices, except for:
* data: pointer to the data to be accessed in the sysctl
* extra1: the 'struct inet6_dev*' of the network device
* extra2: the 'struct net*' of the network namespace

Assuming we have a device name and a 'struct net*', we can get the
'struct net_device*'. From there we can compute:
* data: each entry corresponds to a position in 'struct ipv6_devconf*'
* extra1: 'struct inet6_dev*' can be reached from 'struct net_device*'
* extra2: the 'struct net*' that we assume we have

The device name is determined from the path to the file (the name of
the parent dentry).

The 'struct net*' is stored in the parent 'struct ctl_table*' path by
register_net_sysctl_table_pathdata().

Signed-off-by: Lucian Adrian Grijincu <[email protected]>
---
include/linux/ipv6.h | 15 ++++-
net/ipv6/addrconf.c | 192 +++++++++++++++++++++++++++++++++----------------
2 files changed, 143 insertions(+), 64 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 0c99776..623761d 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -129,6 +129,17 @@ struct ipv6hdr {
};

#ifdef __KERNEL__
+
+#ifdef CONFIG_SYSCTL
+struct addrconf_sysctl {
+ /* dev_name holds a copy of dev_name, because '.procname' is
+ * regarded as const by sysctl and we wouldn't want anyone to
+ * change it under our feet (see SIOCSIFNAME). */
+ char *dev_name;
+ struct ctl_table_header *sysctl_header;
+};
+#endif
+
/*
* This structure contains configuration options per IPv6 link.
*/
@@ -172,7 +183,9 @@ struct ipv6_devconf {
__s32 disable_ipv6;
__s32 accept_dad;
__s32 force_tllao;
- void *sysctl;
+#ifdef CONFIG_SYSCTL
+ struct addrconf_sysctl addrconf_sysctl;
+#endif
};

struct ipv6_params {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index fd6782e..27fd8a1 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -364,7 +364,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)

memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
ndev->cnf.mtu6 = dev->mtu;
- ndev->cnf.sysctl = NULL;
+ ndev->cnf.addrconf_sysctl.dev_name = NULL;
+ ndev->cnf.addrconf_sysctl.sysctl_header = NULL;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
if (ndev->nd_parms == NULL) {
kfree(ndev);
@@ -4249,90 +4250,176 @@ int addrconf_sysctl_disable(ctl_table *ctl, int write,
return ret;
}

-static struct addrconf_sysctl_table
+static int addrconf_handler(ctl_table *ctl, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ struct file *filp,
+ proc_handler *proc_handler)
{
- struct ctl_table_header *sysctl_header;
- ctl_table addrconf_vars[DEVCONF_MAX+1];
- char *dev_name;
-} addrconf_sysctl __read_mostly = {
- .sysctl_header = NULL,
- .addrconf_vars = {
+ /* The path to this file is of the form:
+ * $PROC_MOUNT/sys/net/ipv6/conf/$DEVNAME/$CTL
+ *
+ * The array of 'struct ctl_table' of devinet entries is
+ * shared between all ipv6 network devices and the 'data'
+ * field of each structure only hold the offset into the
+ * 'data' field of 'struct ipv6_devconf'.
+ *
+ * To find the propper location of the data that must be
+ * accessed by this handler we need the device name and the
+ * network namespace in which it belongs.
+ */
+
+ /* We store the network namespace in the parent table's ->extra2 */
+ struct inode *parent_inode = filp->f_path.dentry->d_parent->d_inode;
+ struct ctl_table *parent_table = PROC_I(parent_inode)->sysctl_entry;
+ struct net *net = parent_table->extra2;
+
+ const char *dev_name = filp->f_path.dentry->d_parent->d_name.name;
+ struct ctl_table tmp_ctl;
+ struct net_device *dev = NULL;
+ struct inet6_dev *in6_dev = NULL;
+ struct ipv6_devconf *cnf;
+ int ret;
+
+ if (strcmp(dev_name, "all") == 0) {
+ cnf = net->ipv6.devconf_all;
+ } else if (strcmp(dev_name, "default") == 0) {
+ cnf = net->ipv6.devconf_dflt;
+ } else {
+ /* the device could have been renamed (SIOCSIFADDR) or
+ * deleted since we started accessing it's proc sysctl */
+ dev = dev_get_by_name(net, dev_name);
+ if (dev == NULL)
+ return -ENOENT;
+ in6_dev = in6_dev_get(dev);
+ cnf = &in6_dev->cnf;
+ }
+
+ tmp_ctl = *ctl;
+ tmp_ctl.data += (char *)cnf - (char *)&ipv6_devconf;
+ tmp_ctl.extra1 = in6_dev;
+ tmp_ctl.extra2 = net;
+
+ ret = proc_handler(&tmp_ctl, write, buffer, lenp, ppos);
+
+ if (in6_dev)
+ in6_dev_put(in6_dev);
+ if (dev)
+ dev_put(dev);
+ return ret;
+}
+
+
+static int addrconf_proc_dointvec(ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos, struct file *filp)
+{
+ return addrconf_handler(ctl, write, buffer, lenp, ppos, filp,
+ proc_dointvec);
+}
+
+static int addrconf_proc_dointvec_jiffies(ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos, struct file *filp)
+{
+ return addrconf_handler(ctl, write, buffer, lenp, ppos, filp,
+ proc_dointvec_jiffies);
+}
+
+static int addrconf_sysctl_forward__(ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos, struct file *filp)
+{
+ return addrconf_handler(ctl, write, buffer, lenp, ppos, filp,
+ addrconf_sysctl_forward);
+}
+
+
+static int addrconf_sysctl_disable__(ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos, struct file *filp)
+{
+ return addrconf_handler(ctl, write, buffer, lenp, ppos, filp,
+ addrconf_sysctl_disable);
+}
+
+static const struct ctl_table ipv6_addrconf_sysctl_table[DEVCONF_MAX+1] = {
{
.procname = "forwarding",
.data = &ipv6_devconf.forwarding,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = addrconf_sysctl_forward,
+ .proc_handler = addrconf_sysctl_forward__,
},
{
.procname = "hop_limit",
.data = &ipv6_devconf.hop_limit,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "mtu",
.data = &ipv6_devconf.mtu6,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "accept_ra",
.data = &ipv6_devconf.accept_ra,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "accept_redirects",
.data = &ipv6_devconf.accept_redirects,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "autoconf",
.data = &ipv6_devconf.autoconf,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "dad_transmits",
.data = &ipv6_devconf.dad_transmits,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "router_solicitations",
.data = &ipv6_devconf.rtr_solicits,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "router_solicitation_interval",
.data = &ipv6_devconf.rtr_solicit_interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
+ .proc_handler = addrconf_proc_dointvec_jiffies,
},
{
.procname = "router_solicitation_delay",
.data = &ipv6_devconf.rtr_solicit_delay,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
+ .proc_handler = addrconf_proc_dointvec_jiffies,
},
{
.procname = "force_mld_version",
.data = &ipv6_devconf.force_mld_version,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
#ifdef CONFIG_IPV6_PRIVACY
{
@@ -4340,35 +4427,35 @@ static struct addrconf_sysctl_table
.data = &ipv6_devconf.use_tempaddr,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "temp_valid_lft",
.data = &ipv6_devconf.temp_valid_lft,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "temp_prefered_lft",
.data = &ipv6_devconf.temp_prefered_lft,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "regen_max_retry",
.data = &ipv6_devconf.regen_max_retry,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "max_desync_factor",
.data = &ipv6_devconf.max_desync_factor,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
#endif
{
@@ -4376,21 +4463,21 @@ static struct addrconf_sysctl_table
.data = &ipv6_devconf.max_addresses,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "accept_ra_defrtr",
.data = &ipv6_devconf.accept_ra_defrtr,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "accept_ra_pinfo",
.data = &ipv6_devconf.accept_ra_pinfo,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
#ifdef CONFIG_IPV6_ROUTER_PREF
{
@@ -4398,14 +4485,14 @@ static struct addrconf_sysctl_table
.data = &ipv6_devconf.accept_ra_rtr_pref,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "router_probe_interval",
.data = &ipv6_devconf.rtr_probe_interval,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
+ .proc_handler = addrconf_proc_dointvec_jiffies,
},
#ifdef CONFIG_IPV6_ROUTE_INFO
{
@@ -4413,7 +4500,7 @@ static struct addrconf_sysctl_table
.data = &ipv6_devconf.accept_ra_rt_info_max_plen,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
#endif
#endif
@@ -4422,14 +4509,14 @@ static struct addrconf_sysctl_table
.data = &ipv6_devconf.proxy_ndp,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "accept_source_route",
.data = &ipv6_devconf.accept_source_route,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
{
@@ -4437,7 +4524,7 @@ static struct addrconf_sysctl_table
.data = &ipv6_devconf.optimistic_dad,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,

},
#endif
@@ -4447,7 +4534,7 @@ static struct addrconf_sysctl_table
.data = &ipv6_devconf.mc_forwarding,
.maxlen = sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
#endif
{
@@ -4455,33 +4542,31 @@ static struct addrconf_sysctl_table
.data = &ipv6_devconf.disable_ipv6,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = addrconf_sysctl_disable,
+ .proc_handler = addrconf_sysctl_disable__,
},
{
.procname = "accept_dad",
.data = &ipv6_devconf.accept_dad,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_proc_dointvec,
},
{
.procname = "force_tllao",
.data = &ipv6_devconf.force_tllao,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = addrconf_proc_dointvec
},
{
/* sentinel */
}
- },
};

static int __addrconf_sysctl_register(struct net *net, char *dev_name,
struct inet6_dev *idev, struct ipv6_devconf *p)
{
- int i;
- struct addrconf_sysctl_table *t;
+ struct addrconf_sysctl *t = &p->addrconf_sysctl;

#define ADDRCONF_CTL_PATH_DEV 3

@@ -4494,16 +4579,6 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name,
};


- t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL);
- if (t == NULL)
- goto out;
-
- for (i = 0; t->addrconf_vars[i].data; i++) {
- t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf;
- t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
- t->addrconf_vars[i].extra2 = net;
- }
-
/*
* Make a copy of dev_name, because '.procname' is regarded as const
* by sysctl and we wouldn't want anyone to change it under our feet
@@ -4511,38 +4586,29 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name,
*/
t->dev_name = kstrdup(dev_name, GFP_KERNEL);
if (!t->dev_name)
- goto free;
+ goto out;

addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].procname = t->dev_name;

- t->sysctl_header = register_net_sysctl_table(net, addrconf_ctl_path,
- t->addrconf_vars);
+ t->sysctl_header = register_net_sysctl_table_pathdata(net,
+ addrconf_ctl_path, ipv6_addrconf_sysctl_table, net);
if (t->sysctl_header == NULL)
goto free_procname;

- p->sysctl = t;
return 0;

free_procname:
kfree(t->dev_name);
-free:
- kfree(t);
out:
return -ENOBUFS;
}

static void __addrconf_sysctl_unregister(struct ipv6_devconf *p)
{
- struct addrconf_sysctl_table *t;
-
- if (p->sysctl == NULL)
- return;
+ struct addrconf_sysctl *t = &p->addrconf_sysctl;

- t = p->sysctl;
- p->sysctl = NULL;
unregister_sysctl_table(t->sysctl_header);
kfree(t->dev_name);
- kfree(t);
}

static void addrconf_sysctl_register(struct inet6_dev *idev)
--
1.7.4.rc1.7.g2cf08.dirty

2011-02-04 10:49:45

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [PATCH 0/5] net: sysctl: share ipv4/ipv6 sysctl tables

On Fri, Feb 4, 2011 at 6:37 AM, Lucian Adrian Grijincu
<[email protected]> wrote:
> Each network device gets the same 25/24 sysctl entries for ipv4/ipv6
> in /proc/sys/net/ipv4/conf/DEVNAME and /proc/sys/net/ipv6/conf/DEVNAME
>
> Unfortunately, space is wasted holding very much similar data.
> Fortunately, with some tricks these entries can be shared between all
> network devices.
>
>
> The single entry in 'struct ctl_table' that was modified at runtime
> for leaf ctl_table nodes and prevented sharing was 'parent'. This
> field was first introduces for selinux and then was used to implement
> sysctl_check_table. Selinux recently removed the need for this field:
> * http://thread.gmane.org/gmane.linux.kernel.lsm/12623
> * LKML-Reference: [email protected]
>
>
> Remove the need for 'parent' in sysctl_check_table and remove the
> 'parent' field:
>
> ?[PATCH 1/5] sysctl: faster reimplementation of sysctl_check_table
> ?[PATCH 2/5] sysctl: remove useless ctl_table->parent field
>
>
> Pave the way for sharing of ipv4/6 tables: allow data to be stored in
> the nodes above the leafs that will be shared:
>
> ?[PATCH 3/5] sysctl: write ctl_table->extra2 to entries created from ctl_path
>
>
> Finally share the leaf sysctl tables for ipv4/ipv6:
>
> ?[PATCH 4/5] ipv4: share sysctl net/ipv4/conf/DEVNAME/ tables
> ?[PATCH 5/5] ipv6: share sysctl net/ipv6/conf/DEVNAME/ tables

Meh.

First you remove ->parent, then heroically pass "struct file *"
to sysctl handlers which duplicates all information already passed
and brings dcache into picture.

Binary sysctl rewrite confused you into thinking that d_name.name
is the way, but it isn't.
For binary sysctl(2) you wouldn't get d_name.name.

2011-02-04 16:19:40

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: Re: [PATCH 0/5] net: sysctl: share ipv4/ipv6 sysctl tables

On Fri, Feb 4, 2011 at 12:49 PM, Alexey Dobriyan <[email protected]> wrote:
>> Finally share the leaf sysctl tables for ipv4/ipv6:
>>
>>  [PATCH 4/5] ipv4: share sysctl net/ipv4/conf/DEVNAME/ tables
>>  [PATCH 5/5] ipv6: share sysctl net/ipv6/conf/DEVNAME/ tables
>
> Meh.
>
> First you remove ->parent, then heroically pass "struct file *"
> to sysctl handlers which duplicates all information already passed
> and brings dcache into picture.
>
> Binary sysctl rewrite confused you into thinking that d_name.name
> is the way, but it isn't.
> For binary sysctl(2) you wouldn't get d_name.name.


Are you really sure?

I ran this code on a machine with and without these patches. It seems
to work fine.

It reads the value from /proc/sys/net/ipv4/conf/default/tag and writes 42 back.

I'm not sure what I have to do to pass the name of a device (e.g.
"eth0") instead of "default" but at least "default" and "all" work and
have valid dentries.


#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <linux/sysctl.h>


int main(void)
{
struct __sysctl_args args;
int oldtag, newtag;
size_t oldtaglen, newtaglen;

int name[] = { CTL_NET, NET_IPV4, NET_IPV4_CONF,
NET_PROTO_CONF_DEFAULT, NET_IPV4_CONF_TAG };

memset(&args, 0, sizeof(struct __sysctl_args));
args.name = name;
args.nlen = sizeof(name)/sizeof(name[0]);

oldtag = -1;
oldtaglen = sizeof(oldtag);
args.oldval = &oldtag;
args.oldlenp = &oldtaglen;

newtag = 42;
newtaglen = sizeof(newtag);
args.newval = &newtag;
args.newlen = newtaglen;

if (syscall(SYS__sysctl, &args) == -1) {
perror("_sysctl");
exit(EXIT_FAILURE);
}
printf("Old tag was %d, new tag is %d\n", oldtag, newtag);
exit(EXIT_SUCCESS);
}


--
 .
..: Lucian

2011-02-04 19:40:39

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 1/5] sysctl: faster reimplementation of sysctl_check_table

Lucian Adrian Grijincu <[email protected]> writes:

> Determining the parent of a node at depth d
> - previous implementation: O(d)
> - current implementation: O(1)
>
> Printing the path to a node at depth d
> - previous implementation: O(d^2)
> - current implementation: O(d)
>
> This comes to a cost: we use an array ('parents') holding as many
> pointers as there can be sysctl levels (currently CTL_MAXNAME=10).
>
> The 'parents' array of pointers holds the same values as the
> ctl_table->parents field because the function that updates ->parents
> (sysctl_set_parent) is called with either NULL (for root nodes) or
> with sysctl_set_parent(table, table->child).

Overall this looks reasonable.

Can you include a check for going down too deeply, and overflowing your
array. Otherwise I expect overflowing the array will be a nasty failure
mode that will be hard to discover.

Eric



> Signed-off-by: Lucian Adrian Grijincu <[email protected]>
> ---
> kernel/sysctl_check.c | 121 ++++++++++++++++++++++++-------------------------
> 1 files changed, 60 insertions(+), 61 deletions(-)
>
> diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
> index 10b90d8..9b4fecd 100644
> --- a/kernel/sysctl_check.c
> +++ b/kernel/sysctl_check.c
> @@ -6,58 +6,34 @@
> #include <net/ip_vs.h>
>
>
> -static int sysctl_depth(struct ctl_table *table)
> -{
> - struct ctl_table *tmp;
> - int depth;
> -
> - depth = 0;
> - for (tmp = table; tmp->parent; tmp = tmp->parent)
> - depth++;
> -
> - return depth;
> -}
> -
> -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
> +static void sysctl_print_path(struct ctl_table *table,
> + struct ctl_table **parents, int depth)
> {
> + struct ctl_table *p;
> int i;
> -
> - for (i = 0; table && i < n; i++)
> - table = table->parent;
> -
> - return table;
> -}
> -
> -
> -static void sysctl_print_path(struct ctl_table *table)
> -{
> - struct ctl_table *tmp;
> - int depth, i;
> - depth = sysctl_depth(table);
> if (table->procname) {
> - for (i = depth; i >= 0; i--) {
> - tmp = sysctl_parent(table, i);
> - printk("/%s", tmp->procname?tmp->procname:"");
> + for (i = 0; i < depth; i++) {
> + p = parents[i];
> + printk("/%s", p->procname ? p->procname : "");
> }
> + printk("/%s", table->procname);
> }
> printk(" ");
> }
>
> static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
> - struct ctl_table *table)
> + struct ctl_table *table, struct ctl_table **parents, int depth)
> {
> struct ctl_table_header *head;
> struct ctl_table *ref, *test;
> - int depth, cur_depth;
> -
> - depth = sysctl_depth(table);
> + int cur_depth;
>
> for (head = __sysctl_head_next(namespaces, NULL); head;
> head = __sysctl_head_next(namespaces, head)) {
> cur_depth = depth;
> ref = head->ctl_table;
> repeat:
> - test = sysctl_parent(table, cur_depth);
> + test = parents[depth - cur_depth];
> for (; ref->procname; ref++) {
> int match = 0;
> if (cur_depth && !ref->child)
> @@ -83,11 +59,12 @@ out:
> return ref;
> }
>
> -static void set_fail(const char **fail, struct ctl_table *table, const char *str)
> +static void set_fail(const char **fail, struct ctl_table *table,
> + const char *str, struct ctl_table **parents, int depth)
> {
> if (*fail) {
> printk(KERN_ERR "sysctl table check failed: ");
> - sysctl_print_path(table);
> + sysctl_print_path(table, parents, depth);
> printk(" %s\n", *fail);
> dump_stack();
> }
> @@ -95,16 +72,24 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
> }
>
> static void sysctl_check_leaf(struct nsproxy *namespaces,
> - struct ctl_table *table, const char **fail)
> + struct ctl_table *table, const char **fail,
> + struct ctl_table **parents, int depth)
> {
> struct ctl_table *ref;
>
> - ref = sysctl_check_lookup(namespaces, table);
> - if (ref && (ref != table))
> - set_fail(fail, table, "Sysctl already exists");
> + ref = sysctl_check_lookup(namespaces, table, parents, depth);
> + if (ref && (ref != table)) {
> + printk(KERN_ALERT "sysctl_check_leaf ref[%s], table[%s]\n", ref->procname, table->procname);
> + set_fail(fail, table, "Sysctl already exists", parents, depth);
> + }
> }
>
> -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
> +
> +
> +#define SET_FAIL(str) set_fail(&fail, table, str, parents, depth)
> +
> +static int __sysctl_check_table(struct nsproxy *namespaces,
> + struct ctl_table *table, struct ctl_table **parents, int depth)
> {
> int error = 0;
> for (; table->procname; table++) {
> @@ -112,23 +97,23 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
>
> if (table->parent) {
> if (table->procname && !table->parent->procname)
> - set_fail(&fail, table, "Parent without procname");
> + SET_FAIL("Parent without procname");
> }
> if (!table->procname)
> - set_fail(&fail, table, "No procname");
> + SET_FAIL("No procname");
> if (table->child) {
> if (table->data)
> - set_fail(&fail, table, "Directory with data?");
> + SET_FAIL("Directory with data?");
> if (table->maxlen)
> - set_fail(&fail, table, "Directory with maxlen?");
> + SET_FAIL("Directory with maxlen?");
> if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
> - set_fail(&fail, table, "Writable sysctl directory");
> + SET_FAIL("Writable sysctl directory");
> if (table->proc_handler)
> - set_fail(&fail, table, "Directory with proc_handler");
> + SET_FAIL("Directory with proc_handler");
> if (table->extra1)
> - set_fail(&fail, table, "Directory with extra1");
> + SET_FAIL("Directory with extra1");
> if (table->extra2)
> - set_fail(&fail, table, "Directory with extra2");
> + SET_FAIL("Directory with extra2");
> } else {
> if ((table->proc_handler == proc_dostring) ||
> (table->proc_handler == proc_dointvec) ||
> @@ -139,28 +124,42 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
> (table->proc_handler == proc_doulongvec_minmax) ||
> (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
> if (!table->data)
> - set_fail(&fail, table, "No data");
> + SET_FAIL("No data");
> if (!table->maxlen)
> - set_fail(&fail, table, "No maxlen");
> + SET_FAIL("No maxlen");
> }
> #ifdef CONFIG_PROC_SYSCTL
> if (table->procname && !table->proc_handler)
> - set_fail(&fail, table, "No proc_handler");
> -#endif
> -#if 0
> - if (!table->procname && table->proc_handler)
> - set_fail(&fail, table, "proc_handler without procname");
> + SET_FAIL("No proc_handler");
> #endif
> - sysctl_check_leaf(namespaces, table, &fail);
> + parents[depth] = table;
> + sysctl_check_leaf(namespaces, table, &fail,
> + parents, depth);
> }
> if (table->mode > 0777)
> - set_fail(&fail, table, "bogus .mode");
> + SET_FAIL("bogus .mode");
> if (fail) {
> - set_fail(&fail, table, NULL);
> + SET_FAIL(NULL);
> error = -EINVAL;
> }
> - if (table->child)
> - error |= sysctl_check_table(namespaces, table->child);
> + if (table->child) {
> + parents[depth] = table;
> + error |= __sysctl_check_table(namespaces, table->child,
> + parents, depth + 1);
> + }
> }
> return error;
> }
> +
> +
> +int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
> +{
> + struct ctl_table *parents[CTL_MAXNAME];
> + /* Keep track of parents as we go down into the tree.
> + *
> + * parents[i-1] will be the parent for parents[i].
> + * The node at depth 'd' will have the parent at parents[d-1].
> + * The root node (depth=0) has no parent in this array.
> + */
> + return __sysctl_check_table(namespaces, table, parents, 0);
> +}

2011-02-04 19:41:32

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 2/5] sysctl: remove useless ctl_table->parent field

Lucian Adrian Grijincu <[email protected]> writes:

> The 'parent' field was added for selinux in:
> commit d912b0cc1a617d7c590d57b7ea971d50c7f02503
> [PATCH] sysctl: add a parent entry to ctl_table and set the parent entry
>
> and then was used for sysctl_check_table.
>
> Both of the users have found other implementations.

This seems reasonable but we need to be careful in how we merge this so
the individual trees are correct.

> CC: Eric W. Biederman <[email protected]>
> Signed-off-by: Lucian Adrian Grijincu <[email protected]>
> ---
> include/linux/sysctl.h | 1 -
> kernel/sysctl.c | 11 -----------
> kernel/sysctl_check.c | 4 ++--
> 3 files changed, 2 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 7bb5cb6..1f1da4b 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -1018,7 +1018,6 @@ struct ctl_table
> int maxlen;
> mode_t mode;
> struct ctl_table *child;
> - struct ctl_table *parent; /* Automatically set */
> proc_handler *proc_handler; /* Callback for text formatting */
> void *extra1;
> void *extra2;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 56f6fc1..42025ec 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1695,18 +1695,8 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
> return test_perm(mode, op);
> }
>
> -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
> -{
> - for (; table->procname; table++) {
> - table->parent = parent;
> - if (table->child)
> - sysctl_set_parent(table, table->child);
> - }
> -}
> -
> static __init int sysctl_init(void)
> {
> - sysctl_set_parent(NULL, root_table);
> #ifdef CONFIG_SYSCTL_SYSCALL_CHECK
> sysctl_check_table(current->nsproxy, root_table);
> #endif
> @@ -1864,7 +1854,6 @@ struct ctl_table_header *__register_sysctl_paths(
> header->used = 0;
> header->unregistering = NULL;
> header->root = root;
> - sysctl_set_parent(NULL, header->ctl_table);
> header->count = 1;
> #ifdef CONFIG_SYSCTL_SYSCALL_CHECK
> if (sysctl_check_table(namespaces, header->ctl_table)) {
> diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
> index 9b4fecd..b7d9c66 100644
> --- a/kernel/sysctl_check.c
> +++ b/kernel/sysctl_check.c
> @@ -95,8 +95,8 @@ static int __sysctl_check_table(struct nsproxy *namespaces,
> for (; table->procname; table++) {
> const char *fail = NULL;
>
> - if (table->parent) {
> - if (table->procname && !table->parent->procname)
> + if (depth != 0) { /* has parent */
> + if (table->procname && !parents[depth - 1]->procname)
> SET_FAIL("Parent without procname");
> }
> if (!table->procname)

2011-02-04 20:31:44

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: [PATCH 1/6] sysctl: faster reimplementation of sysctl_check_table

Determining the parent of a node at depth d
- previous implementation: O(d)
- current implementation: O(1)

Printing the path to a node at depth d
- previous implementation: O(d^2)
- current implementation: O(d)

This comes to a cost: we use an array ('parents') holding as many
pointers as there can be sysctl levels (currently CTL_MAXNAME=10).

The 'parents' array of pointers holds the same values as the
ctl_table->parents field because the function that updates ->parents
(sysctl_set_parent) is called with either NULL (for root nodes) or
with sysctl_set_parent(table, table->child).

Signed-off-by: Lucian Adrian Grijincu <[email protected]>
---
kernel/sysctl_check.c | 130 +++++++++++++++++++++++++-----------------------
1 files changed, 68 insertions(+), 62 deletions(-)

diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8..d23b085 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -6,58 +6,34 @@
#include <net/ip_vs.h>


-static int sysctl_depth(struct ctl_table *table)
-{
- struct ctl_table *tmp;
- int depth;
-
- depth = 0;
- for (tmp = table; tmp->parent; tmp = tmp->parent)
- depth++;
-
- return depth;
-}
-
-static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
+static void sysctl_print_path(struct ctl_table *table,
+ struct ctl_table **parents, int depth)
{
+ struct ctl_table *p;
int i;
-
- for (i = 0; table && i < n; i++)
- table = table->parent;
-
- return table;
-}
-
-
-static void sysctl_print_path(struct ctl_table *table)
-{
- struct ctl_table *tmp;
- int depth, i;
- depth = sysctl_depth(table);
if (table->procname) {
- for (i = depth; i >= 0; i--) {
- tmp = sysctl_parent(table, i);
- printk("/%s", tmp->procname?tmp->procname:"");
+ for (i = 0; i < depth; i++) {
+ p = parents[i];
+ printk("/%s", p->procname ? p->procname : "");
}
+ printk("/%s", table->procname);
}
printk(" ");
}

static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
- struct ctl_table *table)
+ struct ctl_table *table, struct ctl_table **parents, int depth)
{
struct ctl_table_header *head;
struct ctl_table *ref, *test;
- int depth, cur_depth;
-
- depth = sysctl_depth(table);
+ int cur_depth;

for (head = __sysctl_head_next(namespaces, NULL); head;
head = __sysctl_head_next(namespaces, head)) {
cur_depth = depth;
ref = head->ctl_table;
repeat:
- test = sysctl_parent(table, cur_depth);
+ test = parents[depth - cur_depth];
for (; ref->procname; ref++) {
int match = 0;
if (cur_depth && !ref->child)
@@ -83,11 +59,12 @@ out:
return ref;
}

-static void set_fail(const char **fail, struct ctl_table *table, const char *str)
+static void set_fail(const char **fail, struct ctl_table *table,
+ const char *str, struct ctl_table **parents, int depth)
{
if (*fail) {
printk(KERN_ERR "sysctl table check failed: ");
- sysctl_print_path(table);
+ sysctl_print_path(table, parents, depth);
printk(" %s\n", *fail);
dump_stack();
}
@@ -95,40 +72,55 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
}

static void sysctl_check_leaf(struct nsproxy *namespaces,
- struct ctl_table *table, const char **fail)
+ struct ctl_table *table, const char **fail,
+ struct ctl_table **parents, int depth)
{
struct ctl_table *ref;

- ref = sysctl_check_lookup(namespaces, table);
- if (ref && (ref != table))
- set_fail(fail, table, "Sysctl already exists");
+ ref = sysctl_check_lookup(namespaces, table, parents, depth);
+ if (ref && (ref != table)) {
+ printk(KERN_ALERT "sysctl_check_leaf ref[%s], table[%s]\n", ref->procname, table->procname);
+ set_fail(fail, table, "Sysctl already exists", parents, depth);
+ }
}

-int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
+
+
+#define SET_FAIL(str) set_fail(&fail, table, str, parents, depth)
+
+static int __sysctl_check_table(struct nsproxy *namespaces,
+ struct ctl_table *table, struct ctl_table **parents, int depth)
{
+ const char *fail = NULL;
int error = 0;
+
+ if (depth >= CTL_MAXNAME) {
+ SET_FAIL("Sysctl tree too deep");
+ return -EINVAL;
+ }
+
for (; table->procname; table++) {
- const char *fail = NULL;
+ fail = NULL;

if (table->parent) {
if (table->procname && !table->parent->procname)
- set_fail(&fail, table, "Parent without procname");
+ SET_FAIL("Parent without procname");
}
if (!table->procname)
- set_fail(&fail, table, "No procname");
+ SET_FAIL("No procname");
if (table->child) {
if (table->data)
- set_fail(&fail, table, "Directory with data?");
+ SET_FAIL("Directory with data?");
if (table->maxlen)
- set_fail(&fail, table, "Directory with maxlen?");
+ SET_FAIL("Directory with maxlen?");
if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
- set_fail(&fail, table, "Writable sysctl directory");
+ SET_FAIL("Writable sysctl directory");
if (table->proc_handler)
- set_fail(&fail, table, "Directory with proc_handler");
+ SET_FAIL("Directory with proc_handler");
if (table->extra1)
- set_fail(&fail, table, "Directory with extra1");
+ SET_FAIL("Directory with extra1");
if (table->extra2)
- set_fail(&fail, table, "Directory with extra2");
+ SET_FAIL("Directory with extra2");
} else {
if ((table->proc_handler == proc_dostring) ||
(table->proc_handler == proc_dointvec) ||
@@ -139,28 +131,42 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
(table->proc_handler == proc_doulongvec_minmax) ||
(table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
if (!table->data)
- set_fail(&fail, table, "No data");
+ SET_FAIL("No data");
if (!table->maxlen)
- set_fail(&fail, table, "No maxlen");
+ SET_FAIL("No maxlen");
}
#ifdef CONFIG_PROC_SYSCTL
if (table->procname && !table->proc_handler)
- set_fail(&fail, table, "No proc_handler");
-#endif
-#if 0
- if (!table->procname && table->proc_handler)
- set_fail(&fail, table, "proc_handler without procname");
+ SET_FAIL("No proc_handler");
#endif
- sysctl_check_leaf(namespaces, table, &fail);
+ parents[depth] = table;
+ sysctl_check_leaf(namespaces, table, &fail,
+ parents, depth);
}
if (table->mode > 0777)
- set_fail(&fail, table, "bogus .mode");
+ SET_FAIL("bogus .mode");
if (fail) {
- set_fail(&fail, table, NULL);
+ SET_FAIL(NULL);
error = -EINVAL;
}
- if (table->child)
- error |= sysctl_check_table(namespaces, table->child);
+ if (table->child) {
+ parents[depth] = table;
+ error |= __sysctl_check_table(namespaces, table->child,
+ parents, depth + 1);
+ }
}
return error;
}
+
+
+int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
+{
+ struct ctl_table *parents[CTL_MAXNAME];
+ /* Keep track of parents as we go down into the tree.
+ *
+ * parents[i-1] will be the parent for parents[i].
+ * The node at depth 'd' will have the parent at parents[d-1].
+ * The root node (depth=0) has no parent in this array.
+ */
+ return __sysctl_check_table(namespaces, table, parents, 0);
+}
--
1.7.4.rc1.7.g2cf08.dirty

2011-02-04 21:11:17

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 1/6] sysctl: faster reimplementation of sysctl_check_table

Lucian Adrian Grijincu <[email protected]> writes:

> Determining the parent of a node at depth d
> - previous implementation: O(d)
> - current implementation: O(1)
>
> Printing the path to a node at depth d
> - previous implementation: O(d^2)
> - current implementation: O(d)
>
> This comes to a cost: we use an array ('parents') holding as many
> pointers as there can be sysctl levels (currently CTL_MAXNAME=10).
>
> The 'parents' array of pointers holds the same values as the
> ctl_table->parents field because the function that updates ->parents
> (sysctl_set_parent) is called with either NULL (for root nodes) or
> with sysctl_set_parent(table, table->child).

>
> Signed-off-by: Lucian Adrian Grijincu <[email protected]>
> ---
> kernel/sysctl_check.c | 130 +++++++++++++++++++++++++-----------------------
> 1 files changed, 68 insertions(+), 62 deletions(-)
>
> diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
> index 10b90d8..d23b085 100644
> --- a/kernel/sysctl_check.c
> +++ b/kernel/sysctl_check.c
> @@ -6,58 +6,34 @@
> #include <net/ip_vs.h>
>
>
> -static int sysctl_depth(struct ctl_table *table)
> -{
> - struct ctl_table *tmp;
> - int depth;
> -
> - depth = 0;
> - for (tmp = table; tmp->parent; tmp = tmp->parent)
> - depth++;
> -
> - return depth;
> -}
> -
> -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
> +static void sysctl_print_path(struct ctl_table *table,
> + struct ctl_table **parents, int depth)
> {
> + struct ctl_table *p;
> int i;
> -
> - for (i = 0; table && i < n; i++)
> - table = table->parent;
> -
> - return table;
> -}
> -
> -
> -static void sysctl_print_path(struct ctl_table *table)
> -{
> - struct ctl_table *tmp;
> - int depth, i;
> - depth = sysctl_depth(table);
> if (table->procname) {
> - for (i = depth; i >= 0; i--) {
> - tmp = sysctl_parent(table, i);
> - printk("/%s", tmp->procname?tmp->procname:"");
> + for (i = 0; i < depth; i++) {
> + p = parents[i];
> + printk("/%s", p->procname ? p->procname : "");
> }
> + printk("/%s", table->procname);
> }
> printk(" ");
> }
>
> static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
> - struct ctl_table *table)
> + struct ctl_table *table, struct ctl_table **parents, int depth)
> {
> struct ctl_table_header *head;
> struct ctl_table *ref, *test;
> - int depth, cur_depth;
> -
> - depth = sysctl_depth(table);
> + int cur_depth;
>
> for (head = __sysctl_head_next(namespaces, NULL); head;
> head = __sysctl_head_next(namespaces, head)) {
> cur_depth = depth;
> ref = head->ctl_table;
> repeat:
> - test = sysctl_parent(table, cur_depth);
> + test = parents[depth - cur_depth];
> for (; ref->procname; ref++) {
> int match = 0;
> if (cur_depth && !ref->child)
> @@ -83,11 +59,12 @@ out:
> return ref;
> }
>
> -static void set_fail(const char **fail, struct ctl_table *table, const char *str)
> +static void set_fail(const char **fail, struct ctl_table *table,
> + const char *str, struct ctl_table **parents, int depth)
> {
> if (*fail) {
> printk(KERN_ERR "sysctl table check failed: ");
> - sysctl_print_path(table);
> + sysctl_print_path(table, parents, depth);
> printk(" %s\n", *fail);
> dump_stack();
> }
> @@ -95,40 +72,55 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
> }
>
> static void sysctl_check_leaf(struct nsproxy *namespaces,
> - struct ctl_table *table, const char **fail)
> + struct ctl_table *table, const char **fail,
> + struct ctl_table **parents, int depth)
> {
> struct ctl_table *ref;
>
> - ref = sysctl_check_lookup(namespaces, table);
> - if (ref && (ref != table))
> - set_fail(fail, table, "Sysctl already exists");
> + ref = sysctl_check_lookup(namespaces, table, parents, depth);
> + if (ref && (ref != table)) {
> + printk(KERN_ALERT "sysctl_check_leaf ref[%s], table[%s]\n", ref->procname, table->procname);
> + set_fail(fail, table, "Sysctl already exists", parents, depth);
> + }
> }
>
> -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
> +
> +
> +#define SET_FAIL(str) set_fail(&fail, table, str, parents, depth)
> +
> +static int __sysctl_check_table(struct nsproxy *namespaces,
> + struct ctl_table *table, struct ctl_table **parents, int depth)
> {
> + const char *fail = NULL;
> int error = 0;
> +
> + if (depth >= CTL_MAXNAME) {

This should be depth > CTL_MAXNAME. Because there are only CTL_MAXNAME
entries in the array.

Eric


> + SET_FAIL("Sysctl tree too deep");
> + return -EINVAL;
> + }
> +
> for (; table->procname; table++) {
> - const char *fail = NULL;
> + fail = NULL;
>
> if (table->parent) {
> if (table->procname && !table->parent->procname)
> - set_fail(&fail, table, "Parent without procname");
> + SET_FAIL("Parent without procname");
> }
> if (!table->procname)
> - set_fail(&fail, table, "No procname");
> + SET_FAIL("No procname");
> if (table->child) {
> if (table->data)
> - set_fail(&fail, table, "Directory with data?");
> + SET_FAIL("Directory with data?");
> if (table->maxlen)
> - set_fail(&fail, table, "Directory with maxlen?");
> + SET_FAIL("Directory with maxlen?");
> if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
> - set_fail(&fail, table, "Writable sysctl directory");
> + SET_FAIL("Writable sysctl directory");
> if (table->proc_handler)
> - set_fail(&fail, table, "Directory with proc_handler");
> + SET_FAIL("Directory with proc_handler");
> if (table->extra1)
> - set_fail(&fail, table, "Directory with extra1");
> + SET_FAIL("Directory with extra1");
> if (table->extra2)
> - set_fail(&fail, table, "Directory with extra2");
> + SET_FAIL("Directory with extra2");
> } else {
> if ((table->proc_handler == proc_dostring) ||
> (table->proc_handler == proc_dointvec) ||
> @@ -139,28 +131,42 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
> (table->proc_handler == proc_doulongvec_minmax) ||
> (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
> if (!table->data)
> - set_fail(&fail, table, "No data");
> + SET_FAIL("No data");
> if (!table->maxlen)
> - set_fail(&fail, table, "No maxlen");
> + SET_FAIL("No maxlen");
> }
> #ifdef CONFIG_PROC_SYSCTL
> if (table->procname && !table->proc_handler)
> - set_fail(&fail, table, "No proc_handler");
> -#endif
> -#if 0
> - if (!table->procname && table->proc_handler)
> - set_fail(&fail, table, "proc_handler without procname");
> + SET_FAIL("No proc_handler");
> #endif
> - sysctl_check_leaf(namespaces, table, &fail);
> + parents[depth] = table;
> + sysctl_check_leaf(namespaces, table, &fail,
> + parents, depth);
> }
> if (table->mode > 0777)
> - set_fail(&fail, table, "bogus .mode");
> + SET_FAIL("bogus .mode");
> if (fail) {
> - set_fail(&fail, table, NULL);
> + SET_FAIL(NULL);
> error = -EINVAL;
> }
> - if (table->child)
> - error |= sysctl_check_table(namespaces, table->child);
> + if (table->child) {
> + parents[depth] = table;
> + error |= __sysctl_check_table(namespaces, table->child,
> + parents, depth + 1);
> + }
> }
> return error;
> }
> +
> +
> +int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
> +{
> + struct ctl_table *parents[CTL_MAXNAME];
> + /* Keep track of parents as we go down into the tree.
> + *
> + * parents[i-1] will be the parent for parents[i].
> + * The node at depth 'd' will have the parent at parents[d-1].
> + * The root node (depth=0) has no parent in this array.
> + */
> + return __sysctl_check_table(namespaces, table, parents, 0);
> +}

2011-02-04 21:34:37

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: Re: [PATCH 1/6] sysctl: faster reimplementation of sysctl_check_table

On Fri, Feb 4, 2011 at 11:11 PM, Eric W. Biederman
<[email protected]> wrote:
>> +static int __sysctl_check_table(struct nsproxy *namespaces,
>> +     struct ctl_table *table, struct ctl_table **parents, int depth)
>>  {
>> +     const char *fail = NULL;
>>       int error = 0;
>> +
>> +     if (depth >= CTL_MAXNAME) {
>
> This should be depth > CTL_MAXNAME.  Because there are only CTL_MAXNAME
> entries in the array.


A bit lower in the array we access 'parents[depth]'.
So the correct check should be (depth >= CTL_MAXNAME) => error.


>> -                     sysctl_check_leaf(namespaces, table, &fail);
>> +                     parents[depth] = table;
>> +                     sysctl_check_leaf(namespaces, table, &fail,
>> +                                       parents, depth);
>>               }

>> +             if (table->child) {
>> +                     parents[depth] = table;
>> +                     error |= __sysctl_check_table(namespaces, table->child,
>> +                                                   parents, depth + 1);
>> +             }



--
 .
..: Lucian
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m???? ????????I?

2011-02-05 14:27:00

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [PATCH 0/5] net: sysctl: share ipv4/ipv6 sysctl tables

On Fri, Feb 04, 2011 at 05:59:24PM +0200, Lucian Adrian Grijincu wrote:
> On Fri, Feb 4, 2011 at 12:49 PM, Alexey Dobriyan <[email protected]> wrote:
> >> Finally share the leaf sysctl tables for ipv4/ipv6:
> >>
> >> ?[PATCH 4/5] ipv4: share sysctl net/ipv4/conf/DEVNAME/ tables
> >> ?[PATCH 5/5] ipv6: share sysctl net/ipv6/conf/DEVNAME/ tables
> >
> > Meh.
> >
> > First you remove ->parent, then heroically pass "struct file *"
> > to sysctl handlers which duplicates all information already passed
> > and brings dcache into picture.
> >
> > Binary sysctl rewrite confused you into thinking that d_name.name
> > is the way, but it isn't.
> > For binary sysctl(2) you wouldn't get d_name.name.
>
>
> Are you really sure?
>
> I ran this code on a machine with and without these patches. It seems
> to work fine.
>
> It reads the value from /proc/sys/net/ipv4/conf/default/tag and writes 42 back.
>
> I'm not sure what I have to do to pass the name of a device (e.g.
> "eth0") instead of "default" but at least "default" and "all" work and
> have valid dentries.

You do ->parent->procname.

But, but you removed parent.

2011-02-05 14:59:27

by Lucian Adrian Grijincu

[permalink] [raw]
Subject: Re: [PATCH 0/5] net: sysctl: share ipv4/ipv6 sysctl tables

On Sat, Feb 5, 2011 at 4:26 PM, Alexey Dobriyan <[email protected]> wrote:
>> I'm not sure what I have to do to pass the name of a device (e.g.
>> "eth0") instead of "default" but at least "default" and "all" work and
>> have valid dentries.
>
> You do ->parent->procname.
>
> But, but you removed parent.


To be more exact I do:
const char *dev_name = filp->f_path.dentry->d_parent->d_name.name;


If the sysctl table is shared between all network devices, the last
registered node will set the ->parent.

So accessing /proc/sys/net/ipv4/conf/$DEVNAME/$CTL will access $CTL
for the last network device registered not for $DEVNAME.

--
 .
..: Lucian

2011-02-05 21:17:06

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 4/5] ipv4: share sysctl net/ipv4/conf/DEVNAME/ tables

Lucian Adrian Grijincu <[email protected]> writes:

> Before this, for each network device DEVNAME that supports ipv4 a new
> sysctl table was registered in $PROC/sys/net/ipv4/conf/DEVNAME/.
>
> The sysctl table was identical for all network devices, except for:
> * data: pointer to the data to be accessed in the sysctl
> * extra1: the 'struct ipv4_devconf*' of the network device
> * extra2: the 'struct net*' of the network namespace
>
> Assuming we have a device name and a 'struct net*', we can get the
> 'struct net_device*'. From there we can compute:
> * data: each entry corresponds to a position in 'struct ipv4_devconf*'
> * extra1: 'struct ipv4_devconf*' can be reached from 'struct net_device*'
> * extra2: the 'struct net*' that we assumed to have
>
> The device name is determined from the path to the file (the name of
> the parent dentry).
>
> The 'struct net*' is stored in the parent 'struct ctl_table*' path by
> register_net_sysctl_table_pathdata().

I don't like this direction. This makes the code more complicated and
probably racy (think what happens when you are running this sysctl when
someone is renaming the network device) for a questionable gain in space
efficiency.

Size of the sysctl data structures is interesting especially when we
have a lot of instances of this data structure but so is algorithmic
complexity. The ugly problem is right now inserts of N network devices
is O(N^2) where it could/should be O(Nlog(N)).

I think these last three patches increase reliance on slightly dubious
properties of the sysctl interface and are likely to make it hard to fix
the O(N^2) complexity that increases rtnl_lock hold times and is
otherwise a real pain when adding and deleting network devices.

Eric




> Signed-off-by: Lucian Adrian Grijincu <[email protected]>
> ---
> fs/proc/proc_sysctl.c | 16 +++-
> include/linux/inetdevice.h | 12 +++-
> net/ipv4/devinet.c | 203 +++++++++++++++++++++++++++++---------------
> 3 files changed, 161 insertions(+), 70 deletions(-)
>
> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> index fb707e0..fe392f1 100644
> --- a/fs/proc/proc_sysctl.c
> +++ b/fs/proc/proc_sysctl.c
> @@ -128,6 +128,11 @@ out:
> return err;
> }
>
> +
> +typedef int proc_handler_extended(struct ctl_table *ctl, int write,
> + void __user *buffer, size_t *lenp, loff_t *ppos,
> + struct file *filp);
> +
> static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
> size_t count, loff_t *ppos, int write)
> {
> @@ -136,6 +141,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
> struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> ssize_t error;
> size_t res;
> + proc_handler_extended *phx = (proc_handler_extended *) table->proc_handler;
>
> if (IS_ERR(head))
> return PTR_ERR(head);
> @@ -155,7 +161,15 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
>
> /* careful: calling conventions are nasty here */
> res = count;
> - error = table->proc_handler(table, write, buf, &res, ppos);
> + /* Most handlers only use the first 5 arguments (without @filp).
> + * Changing all is too much of work, as, at the time of writting only
> + * the devinet.c proc_handlers know about and use the @filp.
> + *
> + * This is just a HACK for now, I did this this way to not
> + * waste time changing all the handlers, in the final version
> + * I'll change all the handlers if there's not other solution.
> + */
> + error = phx(table, write, buf, &res, ppos, filp);
> if (!error)
> error = res;
> out:
> diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
> index ae8fdc5..caf06b3 100644
> --- a/include/linux/inetdevice.h
> +++ b/include/linux/inetdevice.h
> @@ -43,8 +43,18 @@ enum
>
> #define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1)
>
> +
> +struct devinet_sysctl {
> + /* dev_name holds a copy of dev_name, because '.procname' is
> + * regarded as const by sysctl and we wouldn't want anyone to
> + * change it under our feet (see SIOCSIFNAME). */
> + char *dev_name;
> + struct ctl_table_header *sysctl_header;
> +};
> +
> +
> struct ipv4_devconf {
> - void *sysctl;
> + struct devinet_sysctl devinet_sysctl;
> int data[IPV4_DEVCONF_MAX];
> DECLARE_BITMAP(state, IPV4_DEVCONF_MAX);
> };
> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index 748cb5b..774d347 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -147,7 +147,7 @@ void in_dev_finish_destroy(struct in_device *idev)
> }
> EXPORT_SYMBOL(in_dev_finish_destroy);
>
> -static struct in_device *inetdev_init(struct net_device *dev)
> +struct in_device *inetdev_init(struct net_device *dev)
> {
> struct in_device *in_dev;
>
> @@ -158,7 +158,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
> goto out;
> memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
> sizeof(in_dev->cnf));
> - in_dev->cnf.sysctl = NULL;
> + in_dev->cnf.devinet_sysctl.dev_name = NULL;
> + in_dev->cnf.devinet_sysctl.sysctl_header = NULL;
> in_dev->dev = dev;
> in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
> if (!in_dev->arp_parms)
> @@ -1375,6 +1376,67 @@ static void inet_forward_change(struct net *net)
> }
> }
>
> +
> +
> +static int devinet_conf_handler(ctl_table *ctl, int write,
> + void __user *buffer,
> + size_t *lenp, loff_t *ppos,
> + struct file *filp,
> + proc_handler *proc_handler)
> +{
> + /* The path to this file is of the form:
> + * $PROC_MOUNT/sys/net/ipv4/conf/$DEVNAME/$CTL
> + *
> + * The array of 'struct ctl_table' of devinet entries is
> + * shared between all ipv4 network devices and the 'data'
> + * field of each structure only hold the offset into the
> + * 'data' field of 'struct ipv4_devconf'.
> + *
> + * To find the propper location of the data that must be
> + * accessed by this handler we need the device name and the
> + * network namespace in which it belongs.
> + */
> +
> + /* We store the network namespace in the parent table's ->extra2 */
> + struct inode *parent_inode = filp->f_path.dentry->d_parent->d_inode;
> + struct ctl_table *parent_table = PROC_I(parent_inode)->sysctl_entry;
> + struct net *net = parent_table->extra2;
> +
> + const char *dev_name = filp->f_path.dentry->d_parent->d_name.name;
> + struct ctl_table tmp_ctl;
> + struct net_device *dev = NULL;
> + struct in_device *in_dev = NULL;
> + struct ipv4_devconf *cnf;
> + int ret;
> +
> + if (strcmp(dev_name, "all") == 0) {
> + cnf = net->ipv4.devconf_all;
> + } else if (strcmp(dev_name, "default") == 0) {
> + cnf = net->ipv4.devconf_dflt;
> + } else {
> + /* the device could have been renamed (SIOCSIFADDR) or
> + * deleted since we started accessing it's proc sysctl */
> + dev = dev_get_by_name(net, dev_name);
> + if (dev == NULL)
> + return -ENOENT;
> + in_dev = in_dev_get(dev);
> + cnf = &in_dev->cnf;
> + }
> +
> + tmp_ctl = *ctl;
> + tmp_ctl.data += (char *)cnf - (char *)&ipv4_devconf;
> + tmp_ctl.extra1 = cnf;
> + tmp_ctl.extra2 = net;
> +
> + ret = proc_handler(&tmp_ctl, write, buffer, lenp, ppos);
> +
> + if (in_dev)
> + in_dev_put(in_dev);
> + if (dev)
> + dev_put(dev);
> + return ret;
> +}
> +
> static int devinet_conf_proc(ctl_table *ctl, int write,
> void __user *buffer,
> size_t *lenp, loff_t *ppos)
> @@ -1445,6 +1507,33 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
> return ret;
> }
>
> +static int devinet_conf_proc__(ctl_table *ctl, int write,
> + void __user *buffer,
> + size_t *lenp, loff_t *ppos,
> + struct file *filp)
> +{
> + return devinet_conf_handler(ctl, write, buffer, lenp, ppos, filp,
> + devinet_conf_proc);
> +}
> +
> +static int devinet_sysctl_forward__(ctl_table *ctl, int write,
> + void __user *buffer,
> + size_t *lenp, loff_t *ppos,
> + struct file *filp)
> +{
> + return devinet_conf_handler(ctl, write, buffer, lenp, ppos, filp,
> + devinet_sysctl_forward);
> +}
> +
> +static int ipv4_doint_and_flush__(ctl_table *ctl, int write,
> + void __user *buffer,
> + size_t *lenp, loff_t *ppos,
> + struct file *filp)
> +{
> + return devinet_conf_handler(ctl, write, buffer, lenp, ppos, filp,
> + ipv4_doint_and_flush);
> +}
> +
> #define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
> { \
> .procname = name, \
> @@ -1452,67 +1541,60 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write,
> IPV4_DEVCONF_ ## attr - 1, \
> .maxlen = sizeof(int), \
> .mode = mval, \
> - .proc_handler = proc, \
> - .extra1 = &ipv4_devconf, \
> + .proc_handler = (proc_handler *) proc, \
> }
>
> #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
> - DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
> + DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc__)
>
> #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
> - DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
> + DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc__)
>
> #define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
> DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
>
> #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
> - DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
> -
> -static struct devinet_sysctl_table {
> - struct ctl_table_header *sysctl_header;
> - struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
> - char *dev_name;
> -} devinet_sysctl = {
> - .devinet_vars = {
> - DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
> - devinet_sysctl_forward),
> - DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
> -
> - DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
> - DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
> - DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
> - DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
> - DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
> - DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
> - "accept_source_route"),
> - DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
> - DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
> - DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
> - DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
> - DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
> - DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
> - DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
> - DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
> - DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
> - DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
> - DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
> - DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
> - DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
> -
> - DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
> - DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
> - DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
> - "force_igmp_version"),
> - DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
> - "promote_secondaries"),
> - },
> + DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush__)
> +
> +const struct ctl_table ipv4_devinet_sysctl_table[__IPV4_DEVCONF_MAX] = {
> + DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
> + devinet_sysctl_forward__),
> + DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
> +
> + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
> + DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
> + DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
> + DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
> + DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
> + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
> + "accept_source_route"),
> + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
> + DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
> + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
> + DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
> + DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
> + DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
> + DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
> + DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
> + DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
> + DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
> + DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
> + DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
> + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
> +
> + DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
> + DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
> + DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
> + "force_igmp_version"),
> + DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
> + "promote_secondaries"),
> + { }
> };
>
> static int __devinet_sysctl_register(struct net *net, char *dev_name,
> - struct ipv4_devconf *p)
> + struct ipv4_devconf *cnf)
> {
> - int i;
> - struct devinet_sysctl_table *t;
> + struct devinet_sysctl *t = &cnf->devinet_sysctl;
>
> #define DEVINET_CTL_PATH_DEV 3
>
> @@ -1524,16 +1606,6 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
> { },
> };
>
> - t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
> - if (!t)
> - goto out;
> -
> - for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
> - t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
> - t->devinet_vars[i].extra1 = p;
> - t->devinet_vars[i].extra2 = net;
> - }
> -
> /*
> * Make a copy of dev_name, because '.procname' is regarded as const
> * by sysctl and we wouldn't want anyone to change it under our feet
> @@ -1541,37 +1613,32 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
> */
> t->dev_name = kstrdup(dev_name, GFP_KERNEL);
> if (!t->dev_name)
> - goto free;
> + goto out;
>
> devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
>
> - t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
> - t->devinet_vars);
> + t->sysctl_header = register_net_sysctl_table_pathdata(net,
> + devinet_ctl_path, ipv4_devinet_sysctl_table, net);
> if (!t->sysctl_header)
> goto free_procname;
>
> - p->sysctl = t;
> return 0;
>
> free_procname:
> kfree(t->dev_name);
> -free:
> - kfree(t);
> out:
> return -ENOBUFS;
> }
>
> static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
> {
> - struct devinet_sysctl_table *t = cnf->sysctl;
> + struct devinet_sysctl *t = &cnf->devinet_sysctl;
>
> if (t == NULL)
> return;
>
> - cnf->sysctl = NULL;
> unregister_sysctl_table(t->sysctl_header);
> kfree(t->dev_name);
> - kfree(t);
> }
>
> static void devinet_sysctl_register(struct in_device *idev)