LinuxLists.cc - [PATCH -mm 1/5] New system call, unshare

2005-12-08 22:09:51

Subject: [PATCH -mm 1/5] New system call, unshare

[PATCH -mm 1/5] unshare system call: System call handler function
sys_unshare

Signed-off-by: Janak Desai

fs/namespace.c | 55 +++++++++-----
include/linux/namespace.h | 1
kernel/fork.c | 175
++++++++++++++++++++++++++++++++++++++--------
3 files changed, 185 insertions(+), 46 deletions(-)

diff -Naurp 2.6.15-rc5-mm1/fs/namespace.c
2.6.15-rc5-mm1+unshare/fs/namespace.c
--- 2.6.15-rc5-mm1/fs/namespace.c 2005-12-06 21:06:14.000000000 +0000
+++ 2.6.15-rc5-mm1+unshare/fs/namespace.c 2005-12-07 15:42:03.000000000
+0000
@@ -1314,7 +1314,11 @@ dput_out:
return retval;
}

-int copy_namespace(int flags, struct task_struct *tsk)
+/*
+ * Allocate a new namespace structure and populate it with contents
+ * copied from the namespace of the passed in task structure.
+ */
+struct namespace *dup_namespace(struct task_struct *tsk)
{
struct namespace *namespace = tsk->namespace;
struct namespace *new_ns;
@@ -1322,19 +1326,6 @@ int copy_namespace(int flags, struct tas
struct fs_struct *fs = tsk->fs;
struct vfsmount *p, *q;

- if (!namespace)
- return 0;
-
- get_namespace(namespace);
-
- if (!(flags & CLONE_NEWNS))
- return 0;
-
- if (!capable(CAP_SYS_ADMIN)) {
- put_namespace(namespace);
- return -EPERM;
- }
-
new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
if (!new_ns)
goto out;
@@ -1385,8 +1376,6 @@ int copy_namespace(int flags, struct tas
}
up_write(&namespace_sem);

- tsk->namespace = new_ns;
-
if (rootmnt)
mntput(rootmnt);
if (pwdmnt)
@@ -1394,12 +1383,40 @@ int copy_namespace(int flags, struct tas
if (altrootmnt)
mntput(altrootmnt);

- put_namespace(namespace);
- return 0;
+out:
+ return new_ns;
+}
+
+int copy_namespace(int flags, struct task_struct *tsk)
+{
+ struct namespace *namespace = tsk->namespace;
+ struct namespace *new_ns;
+ int err = 0;
+
+ if (!namespace)
+ return 0;
+
+ get_namespace(namespace);
+
+ if (!(flags & CLONE_NEWNS))
+ return 0;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ new_ns = dup_namespace(tsk);
+ if (!new_ns) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ tsk->namespace = new_ns;

out:
put_namespace(namespace);
- return -ENOMEM;
+ return err;
}

asmlinkage long sys_mount(char __user * dev_name, char __user *
dir_name,
diff -Naurp 2.6.15-rc5-mm1/include/linux/namespace.h
2.6.15-rc5-mm1+unshare/include/linux/namespace.h
--- 2.6.15-rc5-mm1/include/linux/namespace.h 2005-12-06
21:06:21.000000000 +0000
+++ 2.6.15-rc5-mm1+unshare/include/linux/namespace.h 2005-12-07
15:40:54.000000000 +0000
@@ -15,6 +15,7 @@ struct namespace {

extern int copy_namespace(int, struct task_struct *);
extern void __put_namespace(struct namespace *namespace);
+extern struct namespace *dup_namespace(struct task_struct *);

static inline void put_namespace(struct namespace *namespace)
{
diff -Naurp 2.6.15-rc5-mm1/kernel/fork.c
2.6.15-rc5-mm1+unshare/kernel/fork.c
--- 2.6.15-rc5-mm1/kernel/fork.c 2005-12-06 21:06:22.000000000 +0000
+++ 2.6.15-rc5-mm1+unshare/kernel/fork.c 2005-12-07 16:51:37.000000000
+0000
@@ -445,6 +445,55 @@ void mm_release(struct task_struct *tsk,
}
}

+/*
+ * Allocate a new mm structure and copy contents from the
+ * mm structure of the passed in task structure.
+ */
+static struct mm_struct *dup_mm(struct task_struct *tsk)
+{
+ struct mm_struct *mm, *oldmm = current->mm;
+ int err;
+
+ if (!oldmm)
+ return NULL;
+
+ mm = allocate_mm();
+ if (!mm)
+ goto fail_nomem;
+
+ memcpy(mm, oldmm, sizeof(*mm));
+
+ if (!mm_init(mm))
+ goto fail_nomem;
+
+ if (init_new_context(tsk, mm))
+ goto fail_nocontext;
+
+ err = dup_mmap(mm, oldmm);
+ if (err)
+ goto free_pt;
+
+ mm->hiwater_rss = get_mm_rss(mm);
+ mm->hiwater_vm = mm->total_vm;
+
+ return mm;
+
+free_pt:
+ mmput(mm);
+
+fail_nomem:
+ return NULL;
+
+fail_nocontext:
+ /*
+ * If init_new_context() failed, we cannot use mmput() to free the mm
+ * because it calls destroy_context()
+ */
+ mm_free_pgd(mm);
+ free_mm(mm);
+ return NULL;
+}
+
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
struct mm_struct * mm, *oldmm;
@@ -472,43 +521,17 @@ static int copy_mm(unsigned long clone_f
}

retval = -ENOMEM;
- mm = allocate_mm();
+ mm = dup_mm(tsk);
if (!mm)
goto fail_nomem;

- /* Copy the current MM stuff.. */
- memcpy(mm, oldmm, sizeof(*mm));
- if (!mm_init(mm))
- goto fail_nomem;
-
- if (init_new_context(tsk,mm))
- goto fail_nocontext;
-
- retval = dup_mmap(mm, oldmm);
- if (retval)
- goto free_pt;
-
- mm->hiwater_rss = get_mm_rss(mm);
- mm->hiwater_vm = mm->total_vm;
-
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
return 0;

-free_pt:
- mmput(mm);
fail_nomem:
return retval;
-
-fail_nocontext:
- /*
- * If init_new_context() failed, we cannot use mmput() to free the mm
- * because it calls destroy_context()
- */
- mm_free_pgd(mm);
- free_mm(mm);
- return retval;
}

static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
@@ -1311,3 +1334,101 @@ void __init proc_caches_init(void)
sizeof(struct mm_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
}
+
+/*
+ * Performs sanity checks on the flags passed to the unshare system
+ * call.
+ */
+static inline int check_unshare_flags(unsigned long unshare_flags)
+{
+ int err = -EINVAL;
+
+ if (unshare_flags & ~(CLONE_NEWNS | CLONE_VM))
+ goto errout;
+
+ /*
+ * Cannot unshare namespace if the fs structure is being shared
+ * through a previous call to clone()
+ */
+ if ((unshare_flags & CLONE_NEWNS) &&
+ (atomic_read(&current->fs->count) > 1))
+ goto errout;
+
+ /*
+ * Cannot unshare vm if sighnal handlers are being shared through
+ * a previous call to clone()
+ */
+ if ((unshare_flags & CLONE_VM) &&
+ (atomic_read(&current->sighand->count) > 1))
+ goto errout;
+
+ return 0;
+
+errout:
+ return err;
+
+}
+
+/*
+ * unshare allows a process to 'unshare' part of the process
+ * context which was originally shared using clone. copy_*
+ * functions used by do_fork() cannot be used here directly
+ * because they modify an inactive task_struct that is being
+ * constructed. Here we are modifying the current, active,
+ * task_struct.
+ */
+asmlinkage long sys_unshare(unsigned long unshare_flags)
+{
+ int err = 0;
+ struct namespace *new_ns = NULL, *ns = current->namespace;
+ struct mm_struct *new_mm = NULL, *active_mm = NULL, *mm = current->mm;
+
+ err = check_unshare_flags(unshare_flags);
+ if (err)
+ goto unshare_out;
+
+ if ((unshare_flags & CLONE_NEWNS) &&
+ (ns && atomic_read(&ns->count) > 1)) {
+ err = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto unshare_out;
+
+ err = -ENOMEM;
+ new_ns = dup_namespace(current);
+ if (!new_ns)
+ goto unshare_out;
+ }
+
+ if ((unshare_flags & CLONE_VM) && (atomic_read(&mm->mm_users) > 1)) {
+ err = -ENOMEM;
+ new_mm = dup_mm(current);
+ if (!new_mm)
+ goto unshare_cleanup_ns;
+ }
+
+ if (new_ns) {
+ task_lock(current);
+ current->namespace = new_ns;
+ task_unlock(current);
+ put_namespace(ns);
+ }
+
+ if (new_mm) {
+ task_lock(current);
+ active_mm = current->active_mm;
+ current->mm = new_mm;
+ current->active_mm = new_mm;
+ activate_mm(active_mm, new_mm);
+ task_unlock(current);
+ mmput(mm);
+ }
+
+ return 0;
+
+unshare_cleanup_ns:
+ if (new_ns)
+ put_namespace(new_ns);
+
+unshare_out:
+ return err;
+}

2005-12-09 10:01:53

by Ingo Molnar

[permalink] [raw]

Subject: Re: [PATCH -mm 1/5] New system call, unshare

* JANAK DESAI <[email protected]> wrote:

> [PATCH -mm 1/5] unshare system call: System call handler function
> sys_unshare

>+ if (unshare_flags & ~(CLONE_NEWNS | CLONE_VM))
>+ goto errout;

just curious, did you consider all the other CLONE_* flags as well, to
see whether it makes sense to add unshare support for them?

Ingo

2005-12-09 12:03:05

by Al Viro

[permalink] [raw]

Subject: Re: [PATCH -mm 1/5] New system call, unshare

On Fri, Dec 09, 2005 at 11:55:02AM +0100, Ingo Molnar wrote:
>
> * JANAK DESAI <[email protected]> wrote:
>
> > [PATCH -mm 1/5] unshare system call: System call handler function
> > sys_unshare
>
> >+ if (unshare_flags & ~(CLONE_NEWNS | CLONE_VM))
> >+ goto errout;
>
> just curious, did you consider all the other CLONE_* flags as well, to
> see whether it makes sense to add unshare support for them?

IMO the right thing to do is
* accept *all* flags from the very beginning
* check constraints ("CLONE_NEWNS must be accompanied by CLONE_FS")
and either -EINVAL if they are not satisfied or silently force them.
* for each unimplemented flag check if we corresponding thing
is shared; -EINVAL otherwise.

Then for each flag we care to implement we should replace such check with
actual unsharing - a patch per flag.

CLONE_FS and CLONE_FILES are *definitely* worth implementing and are
trivial to implement. The only thing we must take care of is doing
all replacements under task_lock, without dropping it between updates.
I would say that CLONE_SIGHAND is also an obvious candidate for adding.

2005-12-09 14:16:05

by Janak Desai

[permalink] [raw]

Subject: Re: [PATCH -mm 1/5] New system call, unshare

Al Viro wrote:

>On Fri, Dec 09, 2005 at 11:55:02AM +0100, Ingo Molnar wrote:
>
>
>>* JANAK DESAI <[email protected]> wrote:
>>
>>
>>
>>>[PATCH -mm 1/5] unshare system call: System call handler function
>>>sys_unshare
>>>
>>>
>>>+ if (unshare_flags & ~(CLONE_NEWNS | CLONE_VM))
>>>+ goto errout;
>>>
>>>
>>just curious, did you consider all the other CLONE_* flags as well, to
>>see whether it makes sense to add unshare support for them?
>>
>>
>
>IMO the right thing to do is
> * accept *all* flags from the very beginning
> * check constraints ("CLONE_NEWNS must be accompanied by CLONE_FS")
>and either -EINVAL if they are not satisfied or silently force them.
> * for each unimplemented flag check if we corresponding thing
>is shared; -EINVAL otherwise.
>
>Then for each flag we care to implement we should replace such check with
>actual unsharing - a patch per flag.
>
>CLONE_FS and CLONE_FILES are *definitely* worth implementing and are
>trivial to implement. The only thing we must take care of is doing
>all replacements under task_lock, without dropping it between updates.
>
>

Ok, thanks. I will restructure code and reorganize patches accordingly
and post
updated patches.

To answer Ingo's question, we did look at other flags when I started.
However,
I wanted to keep the system call simple enough, with atleast namespace
unsharing,
so it would get accepted. In the original discussion on fsdevel,
unsharing of vm
was mentioned as useful so I added that in addition to namespace unsharing.

>I would say that CLONE_SIGHAND is also an obvious candidate for adding.
>
>
I did have signal handler unsharing in one of the earlier incarnation of
the patch,
however Chris Wright alerted me (on IRC) to a possible problem with posix
real time signals if we allow unsharing of signal handlers. He pointed
me to the
way send_sigqueue is stashing sighand lock for later use and since
timers are
flushed on exec and exit, it may lead to an oops. Since my primary
interest was
in namespace unsharing, I disallowed unsharing of signal handler. I will
take a
look at it more detail and come back with specific issues with sighand
unsharing.

Thanks.

>-
>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>the body of a message to [email protected]
>More majordomo info at http://vger.kernel.org/majordomo-info.html
>Please read the FAQ at http://www.tux.org/lkml/
>
>
>
>

2005-12-09 14:34:59

by Al Viro

[permalink] [raw]

Subject: Re: [PATCH -mm 1/5] New system call, unshare

On Fri, Dec 09, 2005 at 09:15:53AM -0500, JANAK DESAI wrote:
> To answer Ingo's question, we did look at other flags when I started.
> However,
> I wanted to keep the system call simple enough, with atleast namespace
> unsharing,
> so it would get accepted. In the original discussion on fsdevel,
> unsharing of vm
> was mentioned as useful so I added that in addition to namespace unsharing.

So make that a series... Note that it can be merged gradually - adding
and debugging the unsharing of fs, files, etc. can be done independently
and with no ABI changes.

2005-12-09 14:48:45

by Janak Desai

[permalink] [raw]

Subject: Re: [PATCH -mm 1/5] New system call, unshare

Al Viro wrote:

>On Fri, Dec 09, 2005 at 09:15:53AM -0500, JANAK DESAI wrote:
>
>
>>To answer Ingo's question, we did look at other flags when I started.
>>However,
>>I wanted to keep the system call simple enough, with atleast namespace
>>unsharing,
>>so it would get accepted. In the original discussion on fsdevel,
>>unsharing of vm
>>was mentioned as useful so I added that in addition to namespace unsharing.
>>
>>
>
>So make that a series... Note that it can be merged gradually - adding
>and debugging the unsharing of fs, files, etc. can be done independently
>and with no ABI changes.
>
>

Ok, thanks. Will do.

>-
>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>the body of a message to [email protected]
>More majordomo info at http://vger.kernel.org/majordomo-info.html
>Please read the FAQ at http://www.tux.org/lkml/
>
>
>
>

2005-12-09 19:02:46

by Ingo Molnar

[permalink] [raw]

Subject: Re: [PATCH -mm 1/5] New system call, unshare

* JANAK DESAI <[email protected]> wrote:

> To answer Ingo's question, we did look at other flags when I started.
> However, I wanted to keep the system call simple enough, with atleast
> namespace unsharing, so it would get accepted. In the original
> discussion on fsdevel, unsharing of vm was mentioned as useful so I
> added that in addition to namespace unsharing.

i think the only sane way to do this is to support unsharing for all
flags. Internally (i.e. in the patch-queue) it can still be structured
in a finegrained way, but in terms of exposing it to applications, it
should be an all or nothing solution i think.

Ingo

2005-12-09 19:57:12

by Janak Desai

[permalink] [raw]

Subject: Re: [PATCH -mm 1/5] New system call, unshare

Ingo Molnar wrote:

>* JANAK DESAI <[email protected]> wrote:
>
>
>
>>To answer Ingo's question, we did look at other flags when I started.
>>However, I wanted to keep the system call simple enough, with atleast
>>namespace unsharing, so it would get accepted. In the original
>>discussion on fsdevel, unsharing of vm was mentioned as useful so I
>>added that in addition to namespace unsharing.
>>
>>
>
>i think the only sane way to do this is to support unsharing for all
>flags. Internally (i.e. in the patch-queue) it can still be structured
>in a finegrained way, but in terms of exposing it to applications, it
>should be an all or nothing solution i think.
>
> Ingo
>
>
>
>
yes, I understand that now. I am currently restructuring the code based
on Al Viro's
post. The system call will ..
* accept all flags
* check for illiegal combination of flags and return -EINVAL
* check for implied missing flags and silently force them
* for each flag, call a corresponding unshare function. The flag
specific unshare function will
* return -EINVAL if the structure is being shared but the
corresponding unsharing code is not
implemented yet
* return success and pass back a pointer to a newly allocated and
duplicated structure if the
structure is being shared and the unshare code is implemented
for that flag
* return success and pass back a null pointer if the structure is
not being shared to begin with
* return -ENOMEM or appropriate error if there is a failure in
allocating/duplicating the structure
* Error returns will appropriately cleanup previously duplicated
structures and return with an error
from the syscall
* If no errors, then after going through all flags, we will be left
with appropriate new structures.
* acquire appropriate (task_lock) locks and update current task
struct with non-null, duplicated,
structures
* relinquish locks and return success from the syscall

This will allow us to incrementally add unshare functionality for
different flags without requiring
and changes to apps.

Please let me know if you see any gotchas in the above.

-Janak