2008-03-05 12:20:47

by Pavel Emelyanov

[permalink] [raw]
Subject: [PATCH] Make /proc/net a symlink on /proc/self/net

Current /proc/net is done with so called "shadows", but current
implementation is broken and has little chances to get fixed.

The problem is that dentries subtree of /proc/net directory has
fancy revalidation rules to make processes living in different
net namespaces see different entries in /proc/net subtree, but
currently, tasks see in the /proc/net subdir the contents of any
other namespace, depending on who opened the file first.

The proposed fix is to turn /proc/net into a symlink, which points
to /proc/self/net, which in turn shows what previously was in
/proc/net - the network-related info, from the net namespace the
appropriate task lives in.

# ls -l /proc/net
lrwxrwxrwx 1 root root 8 Mar 5 15:17 /proc/net -> self/net

In other words - this behaves like /proc/mounts, but unlike
"mounts", "net" is not a file, but a directory.

Signed-off-by: Pavel Emelyanov <[email protected]>

---

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 96ee899..cc43cf0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2274,6 +2274,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, task),
DIR("fd", S_IRUSR|S_IXUSR, fd),
DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo),
+ DIR("net", S_IRUGO|S_IXUSR, net),
REG("environ", S_IRUSR, environ),
INF("auxv", S_IRUSR, pid_auxv),
ONE("status", S_IRUGO, pid_status),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 68971e6..a36ad3c 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -377,15 +377,14 @@ static struct dentry_operations proc_dentry_operations =
* Don't create negative dentries here, return -ENOENT by hand
* instead.
*/
-struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
+ struct dentry *dentry)
{
struct inode *inode = NULL;
- struct proc_dir_entry * de;
int error = -ENOENT;

lock_kernel();
spin_lock(&proc_subdir_lock);
- de = PDE(dir);
if (de) {
for (de = de->subdir; de ; de = de->next) {
if (de->namelen != dentry->d_name.len)
@@ -393,8 +392,6 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
unsigned int ino;

- if (de->shadow_proc)
- de = de->shadow_proc(current, de);
ino = de->low_ino;
de_get(de);
spin_unlock(&proc_subdir_lock);
@@ -417,6 +414,12 @@ out_unlock:
return ERR_PTR(error);
}

+struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ return proc_lookup_de(PDE(dir), dir, dentry);
+}
+
/*
* This returns non-zero if at EOF, so that the /proc
* root directory can use this and check if it should
@@ -426,10 +429,9 @@ out_unlock:
* value of the readdir() call, as long as it's non-negative
* for success..
*/
-int proc_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
+ filldir_t filldir)
{
- struct proc_dir_entry * de;
unsigned int ino;
int i;
struct inode *inode = filp->f_path.dentry->d_inode;
@@ -438,7 +440,6 @@ int proc_readdir(struct file * filp,
lock_kernel();

ino = inode->i_ino;
- de = PDE(inode);
if (!de) {
ret = -EINVAL;
goto out;
@@ -499,6 +500,13 @@ out: unlock_kernel();
return ret;
}

+int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct inode *inode = filp->f_path.dentry->d_inode;
+
+ return proc_readdir_de(PDE(inode), filp, dirent, filldir);
+}
+
/*
* These are the generic /proc directory operations. They
* use the in-memory "struct proc_dir_entry" tree to parse
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1c81c8f..bc72f5c 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -64,6 +64,8 @@ extern const struct file_operations proc_numa_maps_operations;
extern const struct file_operations proc_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_net_operations;
+extern const struct inode_operations proc_net_inode_operations;

void free_proc_entry(struct proc_dir_entry *de);

@@ -83,3 +85,8 @@ static inline int proc_fd(struct inode *inode)
{
return PROC_I(inode)->fd;
}
+
+struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
+ struct dentry *dentry);
+int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
+ filldir_t filldir);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 14e9b5a..3372c2e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -63,6 +63,63 @@ int seq_release_net(struct inode *ino, struct file *f)
}
EXPORT_SYMBOL_GPL(seq_release_net);

+static struct net *get_proc_task_net(struct inode *dir)
+{
+ struct task_struct *task;
+ struct nsproxy *ns;
+ struct net *net = NULL;
+
+ rcu_read_lock();
+ task = get_proc_task(dir);
+ if (task != NULL) {
+ ns = task_nsproxy(task);
+ if (ns != NULL)
+ net = get_net(ns->net_ns);
+ }
+ rcu_read_unlock();
+
+ return net;
+}
+
+static struct dentry *proc_tgid_net_lookup(struct inode *dir,
+ struct dentry *dentry, struct nameidata *nd)
+{
+ struct dentry *de;
+ struct net *net;
+
+ de = ERR_PTR(-ENOENT);
+ net = get_proc_task_net(dir);
+ if (net != NULL) {
+ de = proc_lookup_de(net->proc_net, dir, dentry);
+ put_net(net);
+ }
+ return de;
+}
+
+const struct inode_operations proc_net_inode_operations = {
+ .lookup = proc_tgid_net_lookup,
+};
+
+static int proc_tgid_net_readdir(struct file *filp, void *dirent,
+ filldir_t filldir)
+{
+ int ret;
+ struct net *net;
+
+ ret = -EINVAL;
+ net = get_proc_task_net(filp->f_path.dentry->d_inode);
+ if (net != NULL) {
+ ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
+ put_net(net);
+ }
+ return ret;
+}
+
+const struct file_operations proc_net_operations = {
+ .read = generic_read_dir,
+ .readdir = proc_tgid_net_readdir,
+};
+

struct proc_dir_entry *proc_net_fops_create(struct net *net,
const char *name, mode_t mode, const struct file_operations *fops)
@@ -83,14 +140,6 @@ struct net *get_proc_net(const struct inode *inode)
}
EXPORT_SYMBOL_GPL(get_proc_net);

-static struct proc_dir_entry *shadow_pde;
-
-static struct proc_dir_entry *proc_net_shadow(struct task_struct *task,
- struct proc_dir_entry *de)
-{
- return task->nsproxy->net_ns->proc_net;
-}
-
struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
struct proc_dir_entry *parent)
{
@@ -104,45 +153,35 @@ EXPORT_SYMBOL_GPL(proc_net_mkdir);

static __net_init int proc_net_ns_init(struct net *net)
{
- struct proc_dir_entry *root, *netd, *net_statd;
+ struct proc_dir_entry *netd, *net_statd;
int err;

err = -ENOMEM;
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root)
+ netd = kzalloc(sizeof(*netd), GFP_KERNEL);
+ if (!netd)
goto out;

- err = -EEXIST;
- netd = proc_net_mkdir(net, "net", root);
- if (!netd)
- goto free_root;
+ netd->data = net;

err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
if (!net_statd)
goto free_net;

- root->data = net;
-
- net->proc_net_root = root;
net->proc_net = netd;
net->proc_net_stat = net_statd;
- err = 0;
+ return 0;

+free_net:
+ kfree(netd);
out:
return err;
-free_net:
- remove_proc_entry("net", root);
-free_root:
- kfree(root);
- goto out;
}

static __net_exit void proc_net_ns_exit(struct net *net)
{
remove_proc_entry("stat", net->proc_net);
- remove_proc_entry("net", net->proc_net_root);
- kfree(net->proc_net_root);
+ kfree(net->proc_net);
}

static struct pernet_operations __net_initdata proc_net_ns_ops = {
@@ -152,8 +191,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {

int __init proc_net_init(void)
{
- shadow_pde = proc_mkdir("net", NULL);
- shadow_pde->shadow_proc = proc_net_shadow;
+ proc_symlink("net", NULL, "self/net");

return register_pernet_subsys(&proc_net_ns_ops);
}
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d9a9e71..9b6c935 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -50,8 +50,6 @@ typedef int (read_proc_t)(char *page, char **start, off_t off,
typedef int (write_proc_t)(struct file *file, const char __user *buffer,
unsigned long count, void *data);
typedef int (get_info_t)(char *, char **, off_t, int);
-typedef struct proc_dir_entry *(shadow_proc_t)(struct task_struct *task,
- struct proc_dir_entry *pde);

struct proc_dir_entry {
unsigned int low_ino;
@@ -82,7 +80,6 @@ struct proc_dir_entry {
int pde_users; /* number of callers into module in progress */
spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
struct completion *pde_unload_completion;
- shadow_proc_t *shadow_proc;
};

struct kcore_list {
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 28738b7..923f2b8 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -31,7 +31,6 @@ struct net {

struct proc_dir_entry *proc_net;
struct proc_dir_entry *proc_net_stat;
- struct proc_dir_entry *proc_net_root;

struct list_head sysctl_table_headers;


2008-03-05 19:15:31

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

On Wed, Mar 05, 2008 at 03:20:07PM +0300, Pavel Emelyanov wrote:
> Current /proc/net is done with so called "shadows", but current
> implementation is broken and has little chances to get fixed.
>
> The problem is that dentries subtree of /proc/net directory has
> fancy revalidation rules to make processes living in different
> net namespaces see different entries in /proc/net subtree, but
> currently, tasks see in the /proc/net subdir the contents of any
> other namespace, depending on who opened the file first.
>
> The proposed fix is to turn /proc/net into a symlink, which points
> to /proc/self/net, which in turn shows what previously was in
> /proc/net - the network-related info, from the net namespace the
> appropriate task lives in.
>
> # ls -l /proc/net
> lrwxrwxrwx 1 root root 8 Mar 5 15:17 /proc/net -> self/net
>
> In other words - this behaves like /proc/mounts, but unlike
> "mounts", "net" is not a file, but a directory.

One question below for get_proc_task_net() -- I don't see how the
reference to a task struct is released.

Thanx, Paul

> Signed-off-by: Pavel Emelyanov <[email protected]>
>
> ---
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 96ee899..cc43cf0 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2274,6 +2274,7 @@ static const struct pid_entry tgid_base_stuff[] = {
> DIR("task", S_IRUGO|S_IXUGO, task),
> DIR("fd", S_IRUSR|S_IXUSR, fd),
> DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo),
> + DIR("net", S_IRUGO|S_IXUSR, net),
> REG("environ", S_IRUSR, environ),
> INF("auxv", S_IRUSR, pid_auxv),
> ONE("status", S_IRUGO, pid_status),
> diff --git a/fs/proc/generic.c b/fs/proc/generic.c
> index 68971e6..a36ad3c 100644
> --- a/fs/proc/generic.c
> +++ b/fs/proc/generic.c
> @@ -377,15 +377,14 @@ static struct dentry_operations proc_dentry_operations =
> * Don't create negative dentries here, return -ENOENT by hand
> * instead.
> */
> -struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
> +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
> + struct dentry *dentry)
> {
> struct inode *inode = NULL;
> - struct proc_dir_entry * de;
> int error = -ENOENT;
>
> lock_kernel();
> spin_lock(&proc_subdir_lock);
> - de = PDE(dir);
> if (de) {
> for (de = de->subdir; de ; de = de->next) {
> if (de->namelen != dentry->d_name.len)
> @@ -393,8 +392,6 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
> if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
> unsigned int ino;
>
> - if (de->shadow_proc)
> - de = de->shadow_proc(current, de);
> ino = de->low_ino;
> de_get(de);
> spin_unlock(&proc_subdir_lock);
> @@ -417,6 +414,12 @@ out_unlock:
> return ERR_PTR(error);
> }
>
> +struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
> + struct nameidata *nd)
> +{
> + return proc_lookup_de(PDE(dir), dir, dentry);
> +}
> +
> /*
> * This returns non-zero if at EOF, so that the /proc
> * root directory can use this and check if it should
> @@ -426,10 +429,9 @@ out_unlock:
> * value of the readdir() call, as long as it's non-negative
> * for success..
> */
> -int proc_readdir(struct file * filp,
> - void * dirent, filldir_t filldir)
> +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
> + filldir_t filldir)
> {
> - struct proc_dir_entry * de;
> unsigned int ino;
> int i;
> struct inode *inode = filp->f_path.dentry->d_inode;
> @@ -438,7 +440,6 @@ int proc_readdir(struct file * filp,
> lock_kernel();
>
> ino = inode->i_ino;
> - de = PDE(inode);
> if (!de) {
> ret = -EINVAL;
> goto out;
> @@ -499,6 +500,13 @@ out: unlock_kernel();
> return ret;
> }
>
> +int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> + struct inode *inode = filp->f_path.dentry->d_inode;
> +
> + return proc_readdir_de(PDE(inode), filp, dirent, filldir);
> +}
> +
> /*
> * These are the generic /proc directory operations. They
> * use the in-memory "struct proc_dir_entry" tree to parse
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 1c81c8f..bc72f5c 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -64,6 +64,8 @@ extern const struct file_operations proc_numa_maps_operations;
> extern const struct file_operations proc_smaps_operations;
> extern const struct file_operations proc_clear_refs_operations;
> extern const struct file_operations proc_pagemap_operations;
> +extern const struct file_operations proc_net_operations;
> +extern const struct inode_operations proc_net_inode_operations;
>
> void free_proc_entry(struct proc_dir_entry *de);
>
> @@ -83,3 +85,8 @@ static inline int proc_fd(struct inode *inode)
> {
> return PROC_I(inode)->fd;
> }
> +
> +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
> + struct dentry *dentry);
> +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
> + filldir_t filldir);
> diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
> index 14e9b5a..3372c2e 100644
> --- a/fs/proc/proc_net.c
> +++ b/fs/proc/proc_net.c
> @@ -63,6 +63,63 @@ int seq_release_net(struct inode *ino, struct file *f)
> }
> EXPORT_SYMBOL_GPL(seq_release_net);
>
> +static struct net *get_proc_task_net(struct inode *dir)
> +{
> + struct task_struct *task;
> + struct nsproxy *ns;
> + struct net *net = NULL;
> +
> + rcu_read_lock();
> + task = get_proc_task(dir);

This implies a get_task_struct(), as get_proc_task() invokes get_proc_task()
which invokes get_pid_task() which invokes get_task_struct().

I don't see the corresponding put_task_struct() -- what am I missing here?

> + if (task != NULL) {
> + ns = task_nsproxy(task);
> + if (ns != NULL)
> + net = get_net(ns->net_ns);
> + }
> + rcu_read_unlock();
> +
> + return net;
> +}
> +
> +static struct dentry *proc_tgid_net_lookup(struct inode *dir,
> + struct dentry *dentry, struct nameidata *nd)
> +{
> + struct dentry *de;
> + struct net *net;
> +
> + de = ERR_PTR(-ENOENT);
> + net = get_proc_task_net(dir);
> + if (net != NULL) {
> + de = proc_lookup_de(net->proc_net, dir, dentry);
> + put_net(net);
> + }
> + return de;
> +}
> +
> +const struct inode_operations proc_net_inode_operations = {
> + .lookup = proc_tgid_net_lookup,
> +};
> +
> +static int proc_tgid_net_readdir(struct file *filp, void *dirent,
> + filldir_t filldir)
> +{
> + int ret;
> + struct net *net;
> +
> + ret = -EINVAL;
> + net = get_proc_task_net(filp->f_path.dentry->d_inode);
> + if (net != NULL) {
> + ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
> + put_net(net);
> + }
> + return ret;
> +}
> +
> +const struct file_operations proc_net_operations = {
> + .read = generic_read_dir,
> + .readdir = proc_tgid_net_readdir,
> +};
> +
>
> struct proc_dir_entry *proc_net_fops_create(struct net *net,
> const char *name, mode_t mode, const struct file_operations *fops)
> @@ -83,14 +140,6 @@ struct net *get_proc_net(const struct inode *inode)
> }
> EXPORT_SYMBOL_GPL(get_proc_net);
>
> -static struct proc_dir_entry *shadow_pde;
> -
> -static struct proc_dir_entry *proc_net_shadow(struct task_struct *task,
> - struct proc_dir_entry *de)
> -{
> - return task->nsproxy->net_ns->proc_net;
> -}
> -
> struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
> struct proc_dir_entry *parent)
> {
> @@ -104,45 +153,35 @@ EXPORT_SYMBOL_GPL(proc_net_mkdir);
>
> static __net_init int proc_net_ns_init(struct net *net)
> {
> - struct proc_dir_entry *root, *netd, *net_statd;
> + struct proc_dir_entry *netd, *net_statd;
> int err;
>
> err = -ENOMEM;
> - root = kzalloc(sizeof(*root), GFP_KERNEL);
> - if (!root)
> + netd = kzalloc(sizeof(*netd), GFP_KERNEL);
> + if (!netd)
> goto out;
>
> - err = -EEXIST;
> - netd = proc_net_mkdir(net, "net", root);
> - if (!netd)
> - goto free_root;
> + netd->data = net;
>
> err = -EEXIST;
> net_statd = proc_net_mkdir(net, "stat", netd);
> if (!net_statd)
> goto free_net;
>
> - root->data = net;
> -
> - net->proc_net_root = root;
> net->proc_net = netd;
> net->proc_net_stat = net_statd;
> - err = 0;
> + return 0;
>
> +free_net:
> + kfree(netd);
> out:
> return err;
> -free_net:
> - remove_proc_entry("net", root);
> -free_root:
> - kfree(root);
> - goto out;
> }
>
> static __net_exit void proc_net_ns_exit(struct net *net)
> {
> remove_proc_entry("stat", net->proc_net);
> - remove_proc_entry("net", net->proc_net_root);
> - kfree(net->proc_net_root);
> + kfree(net->proc_net);
> }
>
> static struct pernet_operations __net_initdata proc_net_ns_ops = {
> @@ -152,8 +191,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {
>
> int __init proc_net_init(void)
> {
> - shadow_pde = proc_mkdir("net", NULL);
> - shadow_pde->shadow_proc = proc_net_shadow;
> + proc_symlink("net", NULL, "self/net");
>
> return register_pernet_subsys(&proc_net_ns_ops);
> }
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index d9a9e71..9b6c935 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -50,8 +50,6 @@ typedef int (read_proc_t)(char *page, char **start, off_t off,
> typedef int (write_proc_t)(struct file *file, const char __user *buffer,
> unsigned long count, void *data);
> typedef int (get_info_t)(char *, char **, off_t, int);
> -typedef struct proc_dir_entry *(shadow_proc_t)(struct task_struct *task,
> - struct proc_dir_entry *pde);
>
> struct proc_dir_entry {
> unsigned int low_ino;
> @@ -82,7 +80,6 @@ struct proc_dir_entry {
> int pde_users; /* number of callers into module in progress */
> spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
> struct completion *pde_unload_completion;
> - shadow_proc_t *shadow_proc;
> };
>
> struct kcore_list {
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 28738b7..923f2b8 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -31,7 +31,6 @@ struct net {
>
> struct proc_dir_entry *proc_net;
> struct proc_dir_entry *proc_net_stat;
> - struct proc_dir_entry *proc_net_root;
>
> struct list_head sysctl_table_headers;
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2008-03-05 21:29:57

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

From: "Paul E. McKenney" <[email protected]>
Date: Wed, 5 Mar 2008 11:15:01 -0800

> One question below for get_proc_task_net() -- I don't see how the
> reference to a task struct is released.

Yes, it definitely seems to leak.

2008-03-06 00:03:27

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

>>
>> +static struct net *get_proc_task_net(struct inode *dir)
>> +{
>> + struct task_struct *task;
>> + struct nsproxy *ns;
>> + struct net *net = NULL;
>> +
>> + rcu_read_lock();
>> + task = get_proc_task(dir);
>
> This implies a get_task_struct(), as get_proc_task() invokes get_proc_task()
> which invokes get_pid_task() which invokes get_task_struct().
>
> I don't see the corresponding put_task_struct() -- what am I missing here?

You aren't. However we can easily make this:
task = pid_task(proc_pid(dir), PIDTYPE_PID);
And we won't need to increment the count on the task_struct.

Good catch.

>> + if (task != NULL) {
>> + ns = task_nsproxy(task);
>> + if (ns != NULL)
>> + net = get_net(ns->net_ns);
>> + }
>> + rcu_read_unlock();
>> +
>> + return net;
>> +}
>> +

2008-03-06 00:22:23

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

On Wed, Mar 05, 2008 at 04:59:56PM -0700, Eric W. Biederman wrote:
> >>
> >> +static struct net *get_proc_task_net(struct inode *dir)
> >> +{
> >> + struct task_struct *task;
> >> + struct nsproxy *ns;
> >> + struct net *net = NULL;
> >> +
> >> + rcu_read_lock();
> >> + task = get_proc_task(dir);
> >
> > This implies a get_task_struct(), as get_proc_task() invokes get_proc_task()
> > which invokes get_pid_task() which invokes get_task_struct().
> >
> > I don't see the corresponding put_task_struct() -- what am I missing here?
>
> You aren't. However we can easily make this:
> task = pid_task(proc_pid(dir), PIDTYPE_PID);
> And we won't need to increment the count on the task_struct.

That looks like it should do the trick! And the net_get() below
will allow you to safely export the "net" pointer out of an RCU
read-side critical section, so with that change looks good to me!

Thanx, Paul

> Good catch.
>
> >> + if (task != NULL) {
> >> + ns = task_nsproxy(task);
> >> + if (ns != NULL)
> >> + net = get_net(ns->net_ns);
> >> + }
> >> + rcu_read_unlock();
> >> +
> >> + return net;
> >> +}
> >> +

2008-03-06 00:25:42

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

Pavel Emelyanov <[email protected]> writes:

> Current /proc/net is done with so called "shadows", but current
> implementation is broken and has little chances to get fixed.
>
> The problem is that dentries subtree of /proc/net directory has
> fancy revalidation rules to make processes living in different
> net namespaces see different entries in /proc/net subtree, but
> currently, tasks see in the /proc/net subdir the contents of any
> other namespace, depending on who opened the file first.
>
> The proposed fix is to turn /proc/net into a symlink, which points
> to /proc/self/net, which in turn shows what previously was in
> /proc/net - the network-related info, from the net namespace the
> appropriate task lives in.
>
> # ls -l /proc/net
> lrwxrwxrwx 1 root root 8 Mar 5 15:17 /proc/net -> self/net
>
> In other words - this behaves like /proc/mounts, but unlike
> "mounts", "net" is not a file, but a directory.

Overall this looks good, thanks.


As a follow on patch this should be moved from /proc/<pid>/net
into /proc/<pid>/task/<tid>/net.

We don't have anything that currently forces network namespaces to be
the same between different tasks of the same task group (I just
looked), nor do we have a technical reason to require that.

So we should fix our infrastructure to include the companion of
/proc/self, a /proc/current (which points at the current task)
after which it should be about a two line change to move this
from the tgid to the task directory.

Eric

2008-03-06 08:26:58

by Pavel Emelyanov

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

Paul E. McKenney wrote:
> On Wed, Mar 05, 2008 at 03:20:07PM +0300, Pavel Emelyanov wrote:
>> Current /proc/net is done with so called "shadows", but current
>> implementation is broken and has little chances to get fixed.
>>
>> The problem is that dentries subtree of /proc/net directory has
>> fancy revalidation rules to make processes living in different
>> net namespaces see different entries in /proc/net subtree, but
>> currently, tasks see in the /proc/net subdir the contents of any
>> other namespace, depending on who opened the file first.
>>
>> The proposed fix is to turn /proc/net into a symlink, which points
>> to /proc/self/net, which in turn shows what previously was in
>> /proc/net - the network-related info, from the net namespace the
>> appropriate task lives in.
>>
>> # ls -l /proc/net
>> lrwxrwxrwx 1 root root 8 Mar 5 15:17 /proc/net -> self/net
>>
>> In other words - this behaves like /proc/mounts, but unlike
>> "mounts", "net" is not a file, but a directory.
>
> One question below for get_proc_task_net() -- I don't see how the
> reference to a task struct is released.

Shame on me :( Thanks Paul, I'll prepare the fixed patch shortly,
all the more so Eric finally agreed with it...

> Thanx, Paul
>
>> Signed-off-by: Pavel Emelyanov <[email protected]>
>>
>> ---
>>
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index 96ee899..cc43cf0 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -2274,6 +2274,7 @@ static const struct pid_entry tgid_base_stuff[] = {
>> DIR("task", S_IRUGO|S_IXUGO, task),
>> DIR("fd", S_IRUSR|S_IXUSR, fd),
>> DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo),
>> + DIR("net", S_IRUGO|S_IXUSR, net),
>> REG("environ", S_IRUSR, environ),
>> INF("auxv", S_IRUSR, pid_auxv),
>> ONE("status", S_IRUGO, pid_status),
>> diff --git a/fs/proc/generic.c b/fs/proc/generic.c
>> index 68971e6..a36ad3c 100644
>> --- a/fs/proc/generic.c
>> +++ b/fs/proc/generic.c
>> @@ -377,15 +377,14 @@ static struct dentry_operations proc_dentry_operations =
>> * Don't create negative dentries here, return -ENOENT by hand
>> * instead.
>> */
>> -struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
>> +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
>> + struct dentry *dentry)
>> {
>> struct inode *inode = NULL;
>> - struct proc_dir_entry * de;
>> int error = -ENOENT;
>>
>> lock_kernel();
>> spin_lock(&proc_subdir_lock);
>> - de = PDE(dir);
>> if (de) {
>> for (de = de->subdir; de ; de = de->next) {
>> if (de->namelen != dentry->d_name.len)
>> @@ -393,8 +392,6 @@ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nam
>> if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
>> unsigned int ino;
>>
>> - if (de->shadow_proc)
>> - de = de->shadow_proc(current, de);
>> ino = de->low_ino;
>> de_get(de);
>> spin_unlock(&proc_subdir_lock);
>> @@ -417,6 +414,12 @@ out_unlock:
>> return ERR_PTR(error);
>> }
>>
>> +struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
>> + struct nameidata *nd)
>> +{
>> + return proc_lookup_de(PDE(dir), dir, dentry);
>> +}
>> +
>> /*
>> * This returns non-zero if at EOF, so that the /proc
>> * root directory can use this and check if it should
>> @@ -426,10 +429,9 @@ out_unlock:
>> * value of the readdir() call, as long as it's non-negative
>> * for success..
>> */
>> -int proc_readdir(struct file * filp,
>> - void * dirent, filldir_t filldir)
>> +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
>> + filldir_t filldir)
>> {
>> - struct proc_dir_entry * de;
>> unsigned int ino;
>> int i;
>> struct inode *inode = filp->f_path.dentry->d_inode;
>> @@ -438,7 +440,6 @@ int proc_readdir(struct file * filp,
>> lock_kernel();
>>
>> ino = inode->i_ino;
>> - de = PDE(inode);
>> if (!de) {
>> ret = -EINVAL;
>> goto out;
>> @@ -499,6 +500,13 @@ out: unlock_kernel();
>> return ret;
>> }
>>
>> +int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
>> +{
>> + struct inode *inode = filp->f_path.dentry->d_inode;
>> +
>> + return proc_readdir_de(PDE(inode), filp, dirent, filldir);
>> +}
>> +
>> /*
>> * These are the generic /proc directory operations. They
>> * use the in-memory "struct proc_dir_entry" tree to parse
>> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
>> index 1c81c8f..bc72f5c 100644
>> --- a/fs/proc/internal.h
>> +++ b/fs/proc/internal.h
>> @@ -64,6 +64,8 @@ extern const struct file_operations proc_numa_maps_operations;
>> extern const struct file_operations proc_smaps_operations;
>> extern const struct file_operations proc_clear_refs_operations;
>> extern const struct file_operations proc_pagemap_operations;
>> +extern const struct file_operations proc_net_operations;
>> +extern const struct inode_operations proc_net_inode_operations;
>>
>> void free_proc_entry(struct proc_dir_entry *de);
>>
>> @@ -83,3 +85,8 @@ static inline int proc_fd(struct inode *inode)
>> {
>> return PROC_I(inode)->fd;
>> }
>> +
>> +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
>> + struct dentry *dentry);
>> +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
>> + filldir_t filldir);
>> diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
>> index 14e9b5a..3372c2e 100644
>> --- a/fs/proc/proc_net.c
>> +++ b/fs/proc/proc_net.c
>> @@ -63,6 +63,63 @@ int seq_release_net(struct inode *ino, struct file *f)
>> }
>> EXPORT_SYMBOL_GPL(seq_release_net);
>>
>> +static struct net *get_proc_task_net(struct inode *dir)
>> +{
>> + struct task_struct *task;
>> + struct nsproxy *ns;
>> + struct net *net = NULL;
>> +
>> + rcu_read_lock();
>> + task = get_proc_task(dir);
>
> This implies a get_task_struct(), as get_proc_task() invokes get_proc_task()
> which invokes get_pid_task() which invokes get_task_struct().
>
> I don't see the corresponding put_task_struct() -- what am I missing here?
>
>> + if (task != NULL) {
>> + ns = task_nsproxy(task);
>> + if (ns != NULL)
>> + net = get_net(ns->net_ns);
>> + }
>> + rcu_read_unlock();
>> +
>> + return net;
>> +}
>> +
>> +static struct dentry *proc_tgid_net_lookup(struct inode *dir,
>> + struct dentry *dentry, struct nameidata *nd)
>> +{
>> + struct dentry *de;
>> + struct net *net;
>> +
>> + de = ERR_PTR(-ENOENT);
>> + net = get_proc_task_net(dir);
>> + if (net != NULL) {
>> + de = proc_lookup_de(net->proc_net, dir, dentry);
>> + put_net(net);
>> + }
>> + return de;
>> +}
>> +
>> +const struct inode_operations proc_net_inode_operations = {
>> + .lookup = proc_tgid_net_lookup,
>> +};
>> +
>> +static int proc_tgid_net_readdir(struct file *filp, void *dirent,
>> + filldir_t filldir)
>> +{
>> + int ret;
>> + struct net *net;
>> +
>> + ret = -EINVAL;
>> + net = get_proc_task_net(filp->f_path.dentry->d_inode);
>> + if (net != NULL) {
>> + ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
>> + put_net(net);
>> + }
>> + return ret;
>> +}
>> +
>> +const struct file_operations proc_net_operations = {
>> + .read = generic_read_dir,
>> + .readdir = proc_tgid_net_readdir,
>> +};
>> +
>>
>> struct proc_dir_entry *proc_net_fops_create(struct net *net,
>> const char *name, mode_t mode, const struct file_operations *fops)
>> @@ -83,14 +140,6 @@ struct net *get_proc_net(const struct inode *inode)
>> }
>> EXPORT_SYMBOL_GPL(get_proc_net);
>>
>> -static struct proc_dir_entry *shadow_pde;
>> -
>> -static struct proc_dir_entry *proc_net_shadow(struct task_struct *task,
>> - struct proc_dir_entry *de)
>> -{
>> - return task->nsproxy->net_ns->proc_net;
>> -}
>> -
>> struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
>> struct proc_dir_entry *parent)
>> {
>> @@ -104,45 +153,35 @@ EXPORT_SYMBOL_GPL(proc_net_mkdir);
>>
>> static __net_init int proc_net_ns_init(struct net *net)
>> {
>> - struct proc_dir_entry *root, *netd, *net_statd;
>> + struct proc_dir_entry *netd, *net_statd;
>> int err;
>>
>> err = -ENOMEM;
>> - root = kzalloc(sizeof(*root), GFP_KERNEL);
>> - if (!root)
>> + netd = kzalloc(sizeof(*netd), GFP_KERNEL);
>> + if (!netd)
>> goto out;
>>
>> - err = -EEXIST;
>> - netd = proc_net_mkdir(net, "net", root);
>> - if (!netd)
>> - goto free_root;
>> + netd->data = net;
>>
>> err = -EEXIST;
>> net_statd = proc_net_mkdir(net, "stat", netd);
>> if (!net_statd)
>> goto free_net;
>>
>> - root->data = net;
>> -
>> - net->proc_net_root = root;
>> net->proc_net = netd;
>> net->proc_net_stat = net_statd;
>> - err = 0;
>> + return 0;
>>
>> +free_net:
>> + kfree(netd);
>> out:
>> return err;
>> -free_net:
>> - remove_proc_entry("net", root);
>> -free_root:
>> - kfree(root);
>> - goto out;
>> }
>>
>> static __net_exit void proc_net_ns_exit(struct net *net)
>> {
>> remove_proc_entry("stat", net->proc_net);
>> - remove_proc_entry("net", net->proc_net_root);
>> - kfree(net->proc_net_root);
>> + kfree(net->proc_net);
>> }
>>
>> static struct pernet_operations __net_initdata proc_net_ns_ops = {
>> @@ -152,8 +191,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {
>>
>> int __init proc_net_init(void)
>> {
>> - shadow_pde = proc_mkdir("net", NULL);
>> - shadow_pde->shadow_proc = proc_net_shadow;
>> + proc_symlink("net", NULL, "self/net");
>>
>> return register_pernet_subsys(&proc_net_ns_ops);
>> }
>> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
>> index d9a9e71..9b6c935 100644
>> --- a/include/linux/proc_fs.h
>> +++ b/include/linux/proc_fs.h
>> @@ -50,8 +50,6 @@ typedef int (read_proc_t)(char *page, char **start, off_t off,
>> typedef int (write_proc_t)(struct file *file, const char __user *buffer,
>> unsigned long count, void *data);
>> typedef int (get_info_t)(char *, char **, off_t, int);
>> -typedef struct proc_dir_entry *(shadow_proc_t)(struct task_struct *task,
>> - struct proc_dir_entry *pde);
>>
>> struct proc_dir_entry {
>> unsigned int low_ino;
>> @@ -82,7 +80,6 @@ struct proc_dir_entry {
>> int pde_users; /* number of callers into module in progress */
>> spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
>> struct completion *pde_unload_completion;
>> - shadow_proc_t *shadow_proc;
>> };
>>
>> struct kcore_list {
>> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
>> index 28738b7..923f2b8 100644
>> --- a/include/net/net_namespace.h
>> +++ b/include/net/net_namespace.h
>> @@ -31,7 +31,6 @@ struct net {
>>
>> struct proc_dir_entry *proc_net;
>> struct proc_dir_entry *proc_net_stat;
>> - struct proc_dir_entry *proc_net_root;
>>
>> struct list_head sysctl_table_headers;
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

2008-03-06 08:40:45

by Pavel Emelyanov

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

Eric W. Biederman wrote:
> Pavel Emelyanov <[email protected]> writes:
>
>> Current /proc/net is done with so called "shadows", but current
>> implementation is broken and has little chances to get fixed.
>>
>> The problem is that dentries subtree of /proc/net directory has
>> fancy revalidation rules to make processes living in different
>> net namespaces see different entries in /proc/net subtree, but
>> currently, tasks see in the /proc/net subdir the contents of any
>> other namespace, depending on who opened the file first.
>>
>> The proposed fix is to turn /proc/net into a symlink, which points
>> to /proc/self/net, which in turn shows what previously was in
>> /proc/net - the network-related info, from the net namespace the
>> appropriate task lives in.
>>
>> # ls -l /proc/net
>> lrwxrwxrwx 1 root root 8 Mar 5 15:17 /proc/net -> self/net
>>
>> In other words - this behaves like /proc/mounts, but unlike
>> "mounts", "net" is not a file, but a directory.
>
> Overall this looks good, thanks.
>
>
> As a follow on patch this should be moved from /proc/<pid>/net
> into /proc/<pid>/task/<tid>/net.

I knew you would request for it :)

> We don't have anything that currently forces network namespaces to be
> the same between different tasks of the same task group (I just
> looked), nor do we have a technical reason to require that.
>
> So we should fix our infrastructure to include the companion of
> /proc/self, a /proc/current (which points at the current task)
> after which it should be about a two line change to move this
> from the tgid to the task directory.

This is right what I intended to tell you when you would propose
to tune the /proc/net link. I'm glad hearing, that you already
agree with that.

> Eric
>

2008-03-23 13:02:25

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

On Wednesday 05 March 2008 13:20, Pavel Emelyanov wrote:
> Current /proc/net is done with so called "shadows", but current
> implementation is broken and has little chances to get fixed.
>
> The problem is that dentries subtree of /proc/net directory has
> fancy revalidation rules to make processes living in different
> net namespaces see different entries in /proc/net subtree, but
> currently, tasks see in the /proc/net subdir the contents of any
> other namespace, depending on who opened the file first.
>
> The proposed fix is to turn /proc/net into a symlink, which points
> to /proc/self/net, which in turn shows what previously was in
> /proc/net - the network-related info, from the net namespace the
> appropriate task lives in.
>
> # ls -l /proc/net
> lrwxrwxrwx 1 root root 8 Mar 5 15:17 /proc/net -> self/net

This broke tools which read /proc/net/dev. Under non-root,
they are no longer working. This is a regression.
--
vda

2008-03-23 13:05:44

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

On Sunday 23 March 2008 14:00, Denys Vlasenko wrote:
> > The proposed fix is to turn /proc/net into a symlink, which points
> > to /proc/self/net, which in turn shows what previously was in
> > /proc/net - the network-related info, from the net namespace the
> > appropriate task lives in.
> >
> > # ls -l /proc/net
> > lrwxrwxrwx 1 root root 8 Mar 5 15:17 /proc/net -> self/net
>
> This broke tools which read /proc/net/dev. Under non-root,
> they are no longer working. This is a regression.

I see that this is already reported & fix is in queue.

Sorry for the noise.
--
vda

2008-03-23 13:16:47

by David Miller

[permalink] [raw]
Subject: Re: [PATCH] Make /proc/net a symlink on /proc/self/net

From: Denys Vlasenko <[email protected]>
Date: Sun, 23 Mar 2008 14:00:41 +0100

> On Wednesday 05 March 2008 13:20, Pavel Emelyanov wrote:
> > Current /proc/net is done with so called "shadows", but current
> > implementation is broken and has little chances to get fixed.
> >
> > The problem is that dentries subtree of /proc/net directory has
> > fancy revalidation rules to make processes living in different
> > net namespaces see different entries in /proc/net subtree, but
> > currently, tasks see in the /proc/net subdir the contents of any
> > other namespace, depending on who opened the file first.
> >
> > The proposed fix is to turn /proc/net into a symlink, which points
> > to /proc/self/net, which in turn shows what previously was in
> > /proc/net - the network-related info, from the net namespace the
> > appropriate task lives in.
> >
> > # ls -l /proc/net
> > lrwxrwxrwx 1 root root 8 Mar 5 15:17 /proc/net -> self/net
>
> This broke tools which read /proc/net/dev. Under non-root,
> they are no longer working. This is a regression.

This is fixed with a commit that was made yesterday.