2005-10-12 16:07:34

by Janak Desai

Subject: [PATCH] New System call unshare (try 2)


Hi Chris,

Here is the reworked unshare system call patch which attempts to fix
problems that you identified with the first incarnation. The patch, against
the latest mm tree 2.6.14-rc2-mm2, was tested on a single-processor i386
system for the following:
    Don't allow namespace unsharing, if sharing fs (CLONE_FS)
    Don't allow sighand unsharing if not unsharing vm
    Don't allow vm unsharing if task cloned with CLONE_THREAD
    Don't allow vm unsharing if the task is performing async io
    Successful unsharing of namespace (see the sketch after this list)
        Cloned a task without sharing fs. Performed mounts before
        and after a call to unshare to verify that the namespace
        was indeed unshared.
    Successful unsharing of vm
        Cloned a task using CLONE_VM, CLONE_SIGHAND and CLONE_FILES.
        Used a shared memory segment and a futex within it to
        synchronize execution of threads. Verified shared and
        unshared vm before and after a successful unshare call.
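
For illustration, a namespace check of this sort can be driven from
userspace roughly as below. This is a sketch only, not the test code
used here, and it assumes the call is wired up; __NR_unshare is just a
placeholder since the syscall table change is not shown in this patch.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef __NR_unshare
#define __NR_unshare -1    /* placeholder: depends on how the syscall is wired up */
#endif

int main(void)
{
        pid_t pid = fork();    /* plain fork(): namespace shared, fs_struct not shared */

        if (pid == 0) {
                if (syscall(__NR_unshare, CLONE_NEWNS) < 0) {
                        perror("unshare(CLONE_NEWNS)");
                        _exit(1);
                }
                /* should now be visible in the child's /proc/mounts only */
                if (mount("none", "/mnt", "tmpfs", 0, NULL) < 0)
                        perror("mount");
                sleep(10);    /* window to compare /proc/mounts from the parent */
                _exit(0);
        }
        waitpid(pid, NULL, 0);
        return 0;
}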

Would appreciate feedback.

Thanks.

-Janak

--------------------------------------------------------------------------
diff -Naurp 2.6.14-rc2-mm2/fs/namespace.c 2.6.14-rc2-mm2+unshare/fs/namespace.c
--- 2.6.14-rc2-mm2/fs/namespace.c 2005-10-12 02:56:08.000000000 +0000
+++ 2.6.14-rc2-mm2+unshare/fs/namespace.c 2005-10-12 11:46:52.000000000 +0000
@@ -1069,9 +1069,6 @@ int copy_namespace(int flags, struct tas
{
struct namespace *namespace = tsk->namespace;
struct namespace *new_ns;
- struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
- struct fs_struct *fs = tsk->fs;
- struct vfsmount *p, *q;

if (!namespace)
return 0;
@@ -1086,6 +1083,28 @@ int copy_namespace(int flags, struct tas
return -EPERM;
}

+ new_ns = dup_namespace_struct(tsk);
+ if (!new_ns)
+ goto out;
+
+ tsk->namespace = new_ns;
+
+ put_namespace(namespace);
+ return 0;
+
+out:
+ put_namespace(namespace);
+ return -ENOMEM;
+}
+
+struct namespace *dup_namespace_struct(struct task_struct *tsk)
+{
+ struct namespace *namespace = tsk->namespace;
+ struct namespace *new_ns;
+ struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
+ struct fs_struct *fs = tsk->fs;
+ struct vfsmount *p, *q;
+
new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
if (!new_ns)
goto out;
@@ -1134,8 +1153,6 @@ int copy_namespace(int flags, struct tas
}
up_write(&tsk->namespace->sem);

- tsk->namespace = new_ns;
-
if (rootmnt)
mntput(rootmnt);
if (pwdmnt)
@@ -1143,12 +1160,10 @@ int copy_namespace(int flags, struct tas
if (altrootmnt)
mntput(altrootmnt);

- put_namespace(namespace);
- return 0;
+ return new_ns;

out:
- put_namespace(namespace);
- return -ENOMEM;
+ return NULL;
}

asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
diff -Naurp 2.6.14-rc2-mm2/include/linux/namespace.h 2.6.14-rc2-mm2+unshare/include/linux/namespace.h
--- 2.6.14-rc2-mm2/include/linux/namespace.h 2005-10-12 02:56:35.000000000 +0000
+++ 2.6.14-rc2-mm2+unshare/include/linux/namespace.h 2005-10-12 11:45:46.000000000 +0000
@@ -14,6 +14,7 @@ struct namespace {

extern int copy_namespace(int, struct task_struct *);
extern void __put_namespace(struct namespace *namespace);
+extern struct namespace *dup_namespace_struct(struct task_struct *);

static inline void put_namespace(struct namespace *namespace)
{
diff -Naurp 2.6.14-rc2-mm2/kernel/fork.c 2.6.14-rc2-mm2+unshare/kernel/fork.c
--- 2.6.14-rc2-mm2/kernel/fork.c 2005-10-12 02:56:39.000000000 +0000
+++ 2.6.14-rc2-mm2+unshare/kernel/fork.c 2005-10-12 13:26:05.000000000 +0000
@@ -448,6 +448,55 @@ void mm_release(struct task_struct *tsk,
}
}

+/*
+ * Allocate a new mm structure and copy contents from the
+ * mm structure of the passed in task structure.
+ */
+static struct mm_struct *dup_mm_struct(struct task_struct *tsk)
+{
+ struct mm_struct *mm, *oldmm = current->mm;
+ int err;
+
+ if (!oldmm)
+ return NULL;
+
+ mm = allocate_mm();
+ if (!mm)
+ goto fail_nomem;
+
+ memcpy(mm, oldmm, sizeof(*mm));
+
+ if (!mm_init(mm))
+ goto fail_nomem;
+
+ if (init_new_context(tsk, mm))
+ goto fail_nocontext;
+
+ err = dup_mmap(mm, oldmm);
+ if (err)
+ goto free_pt;
+
+ mm->hiwater_rss = get_mm_rss(mm);
+ mm->hiwater_vm = mm->total_vm;
+
+ return mm;
+
+free_pt:
+ mmput(mm);
+
+fail_nomem:
+ return NULL;
+
+fail_nocontext:
+ /*
+ * If init_new_context() failed, we cannot use mmput() to free the mm
+ * because it calls destroy_context()
+ */
+ mm_free_pgd(mm);
+ free_mm(mm);
+ return NULL;
+}
+
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
struct mm_struct * mm, *oldmm;
@@ -482,43 +531,17 @@ static int copy_mm(unsigned long clone_f
}

retval = -ENOMEM;
- mm = allocate_mm();
+ mm = dup_mm_struct(tsk);
if (!mm)
goto fail_nomem;

- /* Copy the current MM stuff.. */
- memcpy(mm, oldmm, sizeof(*mm));
- if (!mm_init(mm))
- goto fail_nomem;
-
- if (init_new_context(tsk,mm))
- goto fail_nocontext;
-
- retval = dup_mmap(mm, oldmm);
- if (retval)
- goto free_pt;
-
- mm->hiwater_rss = get_mm_rss(mm);
- mm->hiwater_vm = mm->total_vm;
-
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
return 0;

-free_pt:
- mmput(mm);
fail_nomem:
return retval;
-
-fail_nocontext:
- /*
- * If init_new_context() failed, we cannot use mmput() to free the mm
- * because it calls destroy_context()
- */
- mm_free_pgd(mm);
- free_mm(mm);
- return retval;
}

static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
@@ -753,6 +776,23 @@ int unshare_files(void)

EXPORT_SYMBOL(unshare_files);

+/*
+ * Allocate a new sighand structure and copy contents from the
+ * sighand of the passed in task structure
+ */
+static struct sighand_struct *dup_sh_struct(struct task_struct *tsk)
+{
+ struct sighand_struct *sh;
+
+ sh = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+ if (!sh)
+ return NULL;
+ spin_lock_init(&sh->siglock);
+ atomic_set(&sh->count, 1);
+ memcpy(sh->action, current->sighand->action, sizeof(sh->action));
+ return sh;
+}
+
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
struct sighand_struct *sig;
@@ -761,13 +801,10 @@ static inline int copy_sighand(unsigned
atomic_inc(&current->sighand->count);
return 0;
}
- sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+ sig = dup_sh_struct(tsk);
tsk->sighand = sig;
if (!sig)
return -ENOMEM;
- spin_lock_init(&sig->siglock);
- atomic_set(&sig->count, 1);
- memcpy(sig->action, current->sighand->action, sizeof(sig->action));
return 0;
}

@@ -1321,3 +1358,164 @@ void __init proc_caches_init(void)
sizeof(struct mm_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
}
+
+/*
+ * Performs sanity checks on the flags passed to the unshare system
+ * call.
+ */
+static inline int check_unshare_flags(unsigned long unshare_flags)
+{
+ int err = -EINVAL;
+
+ if (unshare_flags & ~(CLONE_NEWNS | CLONE_VM | CLONE_SIGHAND))
+ goto errout;
+
+ /*
+ * Shared signal handlers imply shared VM, so if CLONE_SIGHAND is
+ * set, CLONE_VM must also be set in the system call argument.
+ */
+ if ((unshare_flags & CLONE_SIGHAND) && !(unshare_flags & CLONE_VM))
+ goto errout;
+
+ /*
+ * Cannot unshare namespace if the fs structure is being shared
+ * through a previous call to clone()
+ */
+ if ((unshare_flags & CLONE_NEWNS) &&
+ (atomic_read(&current->fs->count) > 1))
+ goto errout;
+
+ if (unshare_flags & CLONE_VM) {
+ /*
+ * Cannot unshare vm if the signal structure is being shared
+ * through a previous call to clone() with CLONE_THREAD.
+ */
+ if (atomic_read(&current->signal->count) > 1)
+ goto errout;
+
+ /*
+ * Cannot unshare vm only if the sighand structure is being
+ * shared through a previous call to clone()
+ */
+ if (!(unshare_flags & CLONE_SIGHAND) &&
+ (atomic_read(&current->sighand->count) > 1))
+ goto errout;
+
+ /*
+ * If the task is performing async io, unsharing of vm will
+ * only allow the parent task, with which the vm was being
+ * shared, to continue async io operations. The async io
+ * context is not copied to the new, unshared, copy of vm.
+ * So don't allow unsharing of vm if the process is setup
+ * to perform async io.
+ */
+ if (current->mm->ioctx_list)
+ goto errout;
+ }
+ return 0;
+
+errout:
+ return err;
+
+}
+
+/*
+ * unshare allows a process to 'unshare' part of the process
+ * context which was originally shared using clone. copy_*
+ * functions used by do_fork() cannot be used here directly
+ * because they modify an inactive task_struct that is being
+ * constructed. Here we are modifying the current, active,
+ * task_struct.
+ */
+asmlinkage long sys_unshare(unsigned long unshare_flags)
+{
+ int err = 0, unshare_ns = 0, unshare_mm = 0, unshare_sh = 0;
+ struct namespace *new_ns = NULL, *ns = current->namespace;
+ struct mm_struct *new_mm = NULL, *active_mm = NULL, *mm = current->mm;
+ struct sighand_struct *new_sh = NULL, *sh = current->sighand;
+
+ err = check_unshare_flags(unshare_flags);
+ if (err)
+ goto unshare_out;
+
+ if (unshare_flags & CLONE_NEWNS) {
+ if (ns && atomic_read(&ns->count) > 1) {
+ err = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto unshare_out;
+
+ err = -ENOMEM;
+ new_ns = dup_namespace_struct(current);
+ if (!new_ns)
+ goto unshare_out;
+ else
+ unshare_ns = 1;
+ }
+ }
+
+ if (unshare_flags & CLONE_VM) {
+ if (atomic_read(&mm->mm_users) > 1) {
+ err = -ENOMEM;
+ new_mm = dup_mm_struct(current);
+ if (!new_mm)
+ goto unshare_cleanup_ns;
+ else {
+ unshare_mm = 1;
+ if (unshare_flags & CLONE_SIGHAND) {
+ if (atomic_read(&sh->count) > 1) {
+ new_sh = dup_sh_struct(current);
+ if (!new_sh)
+ goto unshare_cleanup_mm;
+ else
+ unshare_sh = 1;
+ }
+ }
+ }
+ }
+ }
+
+ if (unshare_ns) {
+ task_lock(current);
+ current->namespace = new_ns;
+ task_unlock(current);
+ put_namespace(ns);
+ }
+
+ if (unshare_mm) {
+ task_lock(current);
+ current->min_flt = current->maj_flt = 0;
+ current->nvcsw = current->nivcsw = 0;
+ active_mm = current->active_mm;
+ current->mm = new_mm;
+ current->active_mm = new_mm;
+ activate_mm(active_mm, new_mm);
+ task_unlock(current);
+ mmput(mm);
+ }
+
+ if (unshare_sh) {
+ write_lock_irq(&tasklist_lock);
+ spin_lock(&sh->siglock);
+ spin_lock(&new_sh->siglock);
+ current->sighand = new_sh;
+ spin_unlock(&new_sh->siglock);
+ spin_unlock(&sh->siglock);
+ write_unlock_irq(&tasklist_lock);
+
+ if (atomic_dec_and_test(&sh->count))
+ kmem_cache_free(sighand_cachep, sh);
+ }
+
+ return 0;
+
+unshare_cleanup_mm:
+ if (unshare_mm)
+ mmput(new_mm);
+
+unshare_cleanup_ns:
+ if (unshare_ns)
+ put_namespace(new_ns);
+
+unshare_out:
+ return err;
+}


2005-10-12 17:19:27

by Jamie Lokier

Subject: Re: [PATCH] New System call unshare (try 2)

Janak Desai wrote:
> Don't allow namespace unsharing, if sharing fs (CLONE_FS)

Makes sense. clone() has the same test at the start. (I think
namespace should be a property of fs, not task, anyway. Or completely
eliminated because it's implied by the task's root dentry+vfsmnt).
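
(From memory, the corresponding check near the top of copy_process()
in kernel/fork.c reads roughly:)

        /* namespace unsharing only makes sense with a private fs_struct */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);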

> Don't allow sighand unsharing if not unsharing vm

Why not? It's permitted to clone with unshared sighand and shared vm,
and it's useful too.

It's the combination shared sighand + unshared vm which is not
allowed by clone - so I think that's what you should refuse.

> Don't allow vm unsharing if task cloned with CLONE_THREAD

It would be better to do what clone does, and say "don't allow sighand
unsharing if task cloned with CLONE_THREAD". This is because
CLONE_THREAD tasks must have shared signals.

In combination with the rule above for sighand (my rule, not yours),
that implies "don't allow vm unsharing.." as a consequence.
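
(Again from memory, the two related checks in copy_process() are
roughly:)

        /* thread groups must share signal handlers */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
                return ERR_PTR(-EINVAL);

        /* shared signal handlers imply shared VM */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                return ERR_PTR(-EINVAL);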

> Don't allow vm unsharing if the task is performing async io

Why not?

Async ios are tied to an mm (see lookup_ioctx in fs/aio.c), which may
be shared among tasks. I see no reason why the async ios can't
continue and be waited on in other tasks that may be using the old mm.
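
(Paraphrasing lookup_ioctx() from memory, the context id is resolved
strictly against current->mm, roughly:)

        struct mm_struct *mm = current->mm;
        struct kioctx *ioctx;

        read_lock(&mm->ioctx_list_lock);
        for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next)
                if (ioctx->user_id == ctx_id) {    /* plus a liveness check */
                        get_ioctx(ioctx);
                        break;
                }
        read_unlock(&mm->ioctx_list_lock);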

The new mm, if vm is unshared, would simply not see the outstanding
aios - in the same way as if a vm was unshared by fork().

-- Jamie

2005-10-12 18:17:46

by Janak Desai

Subject: Re: [PATCH] New System call unshare (try 2)

Jamie Lokier wrote:
> Janak Desai wrote:
>
>> Don't allow namespace unsharing, if sharing fs (CLONE_FS)
>
>
> Makes sense. clone() has the same test at the start. (I think
> namespace should be a property of fs, not task, anyway. Or completely
> eliminated because it's implied by the task's root dentry+vfsmnt).
>
>
>> Don't allow sighand unsharing if not unsharing vm
>
>
> Why not? It's permitted to clone with unshared sighand and shared vm,
> and it's useful too.
>
> It's the combination shared sighand + unshared vm which is not
> allowed by clone - so I think that's what you should refuse.
>

Yes, thanks. I had that backwards. I am checking for the illegal
combination of "shared sighand + unshared vm" later but I will
allow (and test) unsharing of sighand while keeping shared vm.

>
>> Don't allow vm unsharing if task cloned with CLONE_THREAD
>
>
> It would be better to do what clone does, and say "don't allow sighand
> unsharing if task cloned with CLONE_THREAD". This is because
> CLONE_THREAD tasks must have shared signals.
>
> In combination with the rule above for sighand (my rule, not yours),
> that implies "don't allow vm unsharing.." as a consequence.
>

Ok. Basically, with my current logic you are forced to unshare
both vm and sighand. I will update so you can unshare just sighand
and tie the CLONE_THREAD restriction to sighand.

>
>> Don't allow vm unsharing if the task is performing async io
>
>
> Why not?
>
> Async ios are tied to an mm (see lookup_ioctx in fs/aio.c), which may
> be shared among tasks. I see no reason why the async ios can't
> continue and be waited on in other tasks that may be using the old mm.
>
> The new mm, if vm is unshared, would simply not see the outstanding
> aios - in the same way as if a vm was unshared by fork().
>

Yes, I did see that async ios are tied to an mm and that aio context is
cleared when a new mm is created by clone/exec. It just appeared odd to
me that a task that setup aio is unable to continue performing aio
after unsharing mm, but the task that it was sharing mm with, can. I
don't have any problem making this change. It's just that I wasn't sure
about aio usage and thought it was better to not allow an mm unshare if
aio is being performed.

-Janak

2005-10-12 23:54:15

by Jamie Lokier

Subject: Re: [PATCH] New System call unshare (try 2)

Janak Desai wrote:
> >> Don't allow vm unsharing if the task is performing async io
> >
> >Why not?
> >
> >Async ios are tied to an mm (see lookup_ioctx in fs/aio.c), which may
> >be shared among tasks. I see no reason why the async ios can't
> >continue and be waited on in other tasks that may be using the old mm.
> >
> >The new mm, if vm is unshared, would simply not see the outstanding
> >aios - in the same way as if a vm was unshared by fork().
>
> Yes, I did see that async ios are tied to an mm and that aio context is
> cleared when a new mm is created by clone/exec. It just appeared odd to
> me that a task that setup aio is unable to continue performing aio
> after unsharing mm, but the task that it was sharing mm with, can.

The unshared task can do new aios, it just can't wait for ones
submitted prior to unsharing the mm. But that makes sense, because it
may not necessarily see the data written by the ones outstanding prior
to unsharing either.

It may seem odd, but think of it like this: fork() has the identical
oddness, if you see it like that. If fork() does that, then it's ok
for unshare() to do the same.
Also this: aio is peculiar in not working with select/poll/epoll too,
forcing a program which does non-aio things to use threads and extra
context switches for the sole purpose of waiting on aios (whose reason
for existing is to reduce context switches - ironically). Thus nobody
would blame anyone for describing the way one waits for aio events
as slightly dubious :)
</rant>

> I don't have any problem making this change. It's just that I wasn't
> sure about aio usage and thought it was better to not allow an mm
> unshare if aio is being performed.

As long as it's well enough defined behaviour (and it is, because
unsharing during clone/fork defines that behaviour, whatever it is),
then I say don't put in the arbitrary restriction. Someone will find
a use for it one day - not least, they may want to emulate clone() by
doing a clone that shares everything followed by unshare() (presumably
with something in between, as the reason for doing this). Ideally
that should work whatever the combination of sharing flags.
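
(Purely hypothetical sketch of that emulation, assuming unshare()
eventually takes the full set of flags -- this patch only accepts
CLONE_NEWNS, CLONE_VM and CLONE_SIGHAND, and __NR_unshare below is a
placeholder:)

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef __NR_unshare
#define __NR_unshare -1         /* placeholder */
#endif

static char child_stack[65536];

static int child_fn(void *arg)
{
        /* ... do some work on the fully shared state ... */

        /* then drop back to private copies of the same resources */
        return syscall(__NR_unshare,
                       CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND) ? 1 : 0;
}

int main(void)
{
        pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
                          CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD,
                          NULL);

        if (pid > 0)
                waitpid(pid, NULL, 0);
        return 0;
}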

-- Jamie

2005-10-13 09:58:27

by Chris Wright

Subject: Re: [PATCH] New System call unshare (try 2)

* Janak Desai ([email protected]) wrote:
> diff -Naurp 2.6.14-rc2-mm2/fs/namespace.c 2.6.14-rc2-mm2+unshare/fs/namespace.c
> --- 2.6.14-rc2-mm2/fs/namespace.c 2005-10-12 02:56:08.000000000 +0000
> +++ 2.6.14-rc2-mm2+unshare/fs/namespace.c 2005-10-12 11:46:52.000000000 +0000
> @@ -1069,9 +1069,6 @@ int copy_namespace(int flags, struct tas
> {
> struct namespace *namespace = tsk->namespace;
> struct namespace *new_ns;
> - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
> - struct fs_struct *fs = tsk->fs;
> - struct vfsmount *p, *q;
>
> if (!namespace)
> return 0;
> @@ -1086,6 +1083,28 @@ int copy_namespace(int flags, struct tas
> return -EPERM;
> }
>
> + new_ns = dup_namespace_struct(tsk);

How about just dup_namespace()?

> + if (!new_ns)
> + goto out;
> +
> + tsk->namespace = new_ns;
> +
> + put_namespace(namespace);
> + return 0;
> +
> +out:
> + put_namespace(namespace);
> + return -ENOMEM;

If you 'return err', then you could use single return there.
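
i.e. roughly (untested):

	err = -ENOMEM;
	new_ns = dup_namespace(tsk);
	if (new_ns) {
		tsk->namespace = new_ns;
		err = 0;
	}
	put_namespace(namespace);
	return err;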

> +}
> +
> +struct namespace *dup_namespace_struct(struct task_struct *tsk)
> +{
> + struct namespace *namespace = tsk->namespace;
> + struct namespace *new_ns;
> + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
> + struct fs_struct *fs = tsk->fs;
> + struct vfsmount *p, *q;
> +
> new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
> if (!new_ns)
> goto out;
> @@ -1134,8 +1153,6 @@ int copy_namespace(int flags, struct tas
> }
> up_write(&tsk->namespace->sem);
>
> - tsk->namespace = new_ns;
> -
> if (rootmnt)
> mntput(rootmnt);
> if (pwdmnt)
> @@ -1143,12 +1160,10 @@ int copy_namespace(int flags, struct tas
> if (altrootmnt)
> mntput(altrootmnt);
>
> - put_namespace(namespace);
> - return 0;
> + return new_ns;
>
> out:
> - put_namespace(namespace);
> - return -ENOMEM;
> + return NULL;

Same here, just return new_ns.

> }
>
> asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
> diff -Naurp 2.6.14-rc2-mm2/include/linux/namespace.h 2.6.14-rc2-mm2+unshare/include/linux/namespace.h
> --- 2.6.14-rc2-mm2/include/linux/namespace.h 2005-10-12 02:56:35.000000000 +0000
> +++ 2.6.14-rc2-mm2+unshare/include/linux/namespace.h 2005-10-12 11:45:46.000000000 +0000
> @@ -14,6 +14,7 @@ struct namespace {
>
> extern int copy_namespace(int, struct task_struct *);
> extern void __put_namespace(struct namespace *namespace);
> +extern struct namespace *dup_namespace_struct(struct task_struct *);
>
> static inline void put_namespace(struct namespace *namespace)
> {
> diff -Naurp 2.6.14-rc2-mm2/kernel/fork.c 2.6.14-rc2-mm2+unshare/kernel/fork.c
> --- 2.6.14-rc2-mm2/kernel/fork.c 2005-10-12 02:56:39.000000000 +0000
> +++ 2.6.14-rc2-mm2+unshare/kernel/fork.c 2005-10-12 13:26:05.000000000 +0000
> @@ -448,6 +448,55 @@ void mm_release(struct task_struct *tsk,
> }
> }
>
> +/*
> + * Allocate a new mm structure and copy contents from the
> + * mm structure of the passed in task structure.
> + */
> +static struct mm_struct *dup_mm_struct(struct task_struct *tsk)

How about dup_mm()?

> +{
> + struct mm_struct *mm, *oldmm = current->mm;
> + int err;
> +
> + if (!oldmm)
> + return NULL;
> +
> + mm = allocate_mm();
> + if (!mm)
> + goto fail_nomem;
> +
> + memcpy(mm, oldmm, sizeof(*mm));
> +
> + if (!mm_init(mm))
> + goto fail_nomem;
> +
> + if (init_new_context(tsk, mm))
> + goto fail_nocontext;
> +
> + err = dup_mmap(mm, oldmm);
> + if (err)
> + goto free_pt;
> +
> + mm->hiwater_rss = get_mm_rss(mm);
> + mm->hiwater_vm = mm->total_vm;
> +
> + return mm;
> +
> +free_pt:
> + mmput(mm);
> +
> +fail_nomem:
> + return NULL;
> +
> +fail_nocontext:
> + /*
> + * If init_new_context() failed, we cannot use mmput() to free the mm
> + * because it calls destroy_context()
> + */
> + mm_free_pgd(mm);
> + free_mm(mm);
> + return NULL;
> +}
> +
> static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
> {
> struct mm_struct * mm, *oldmm;
> @@ -482,43 +531,17 @@ static int copy_mm(unsigned long clone_f
> }
>
> retval = -ENOMEM;
> - mm = allocate_mm();
> + mm = dup_mm_struct(tsk);
> if (!mm)
> goto fail_nomem;
>
> - /* Copy the current MM stuff.. */
> - memcpy(mm, oldmm, sizeof(*mm));
> - if (!mm_init(mm))
> - goto fail_nomem;
> -
> - if (init_new_context(tsk,mm))
> - goto fail_nocontext;
> -
> - retval = dup_mmap(mm, oldmm);
> - if (retval)
> - goto free_pt;
> -
> - mm->hiwater_rss = get_mm_rss(mm);
> - mm->hiwater_vm = mm->total_vm;
> -
> good_mm:
> tsk->mm = mm;
> tsk->active_mm = mm;
> return 0;
>
> -free_pt:
> - mmput(mm);
> fail_nomem:
> return retval;
> -
> -fail_nocontext:
> - /*
> - * If init_new_context() failed, we cannot use mmput() to free the mm
> - * because it calls destroy_context()
> - */
> - mm_free_pgd(mm);
> - free_mm(mm);
> - return retval;
> }
>
> static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
> @@ -753,6 +776,23 @@ int unshare_files(void)
>
> EXPORT_SYMBOL(unshare_files);
>
> +/*
> + * Allocate a new sighand structure and copy contents from the
> + * sighand of the passed in task structure
> + */
> +static struct sighand_struct *dup_sh_struct(struct task_struct *tsk)

dup_sighand is a more expressive name.

> +{
> + struct sighand_struct *sh;
> +
> + sh = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
> + if (!sh)
> + return NULL;
> + spin_lock_init(&sh->siglock);
> + atomic_set(&sh->count, 1);
> + memcpy(sh->action, current->sighand->action, sizeof(sh->action));
> + return sh;
> +}
> +
> static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
> {
> struct sighand_struct *sig;
> @@ -761,13 +801,10 @@ static inline int copy_sighand(unsigned
> atomic_inc(&current->sighand->count);
> return 0;
> }
> - sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
> + sig = dup_sh_struct(tsk);
> tsk->sighand = sig;
> if (!sig)
> return -ENOMEM;
> - spin_lock_init(&sig->siglock);
> - atomic_set(&sig->count, 1);
> - memcpy(sig->action, current->sighand->action, sizeof(sig->action));
> return 0;
> }
>
> @@ -1321,3 +1358,164 @@ void __init proc_caches_init(void)
> sizeof(struct mm_struct), 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
> }
> +
> +/*
> + * Performs sanity checks on the flags passed to the unshare system
> + * call.
> + */
> +static inline int check_unshare_flags(unsigned long unshare_flags)
> +{
> + int err = -EINVAL;
> +
> + if (unshare_flags & ~(CLONE_NEWNS | CLONE_VM | CLONE_SIGHAND))
> + goto errout;
> +
> + /*
> + * Shared signal handlers imply shared VM, so if CLONE_SIGHAND is
> + * set, CLONE_VM must also be set in the system call argument.
> + */
> + if ((unshare_flags & CLONE_SIGHAND) && !(unshare_flags & CLONE_VM))
> + goto errout;

Oops, this test is backwards.

> + /*
> + * Cannot unshare namespace if the fs structure is being shared
> + * through a previous call to clone()
> + */
> + if ((unshare_flags & CLONE_NEWNS) &&
> + (atomic_read(&current->fs->count) > 1))
> + goto errout;
> +
> + if (unshare_flags & CLONE_VM) {
> + /*
> + * Cannot unshare vm if the signal structure is being shared
> + * through a previous call to clone() with CLONE_THREAD.
> + */
> + if (atomic_read(&current->signal->count) > 1)
> + goto errout;
> +
> + /*
> + * Cannot unshare vm only if the sighand structure is being
> + * shared through a previous call to clone()
> + */
> + if (!(unshare_flags & CLONE_SIGHAND) &&
> + (atomic_read(&current->sighand->count) > 1))

And it's really just this case that's the issue.

> + goto errout;
> +
> + /*
> + * If the task is performing async io, unsharing of vm will
> + * only allow the parent task, with which the vm was being
> + * shared, to continue async io operations. The async io
> + * context is not copied to the new, unshared, copy of vm.
> + * So don't allow unsharing of vm if the process is setup
> + * to perform async io.
> + */
> + if (current->mm->ioctx_list)
> + goto errout;

This is likely overkill. The scenario I was worried about was a single
task that manages to unshare itself (easy with a concurrent read of a /proc
entry). I was concerned it had the potential to cause some trouble
when mmdrop happened while the ctx_id was still a valid mapping in the new mm.
But the dup will zero the ioctx_list, exit_aio will have been called,
so I think it's no different than setting up aio and then calling exit(2).
Not very useful, but looks harmless.

> + }
> + return 0;
> +
> +errout:
> + return err;
> +
> +}
> +
> +/*
> + * unshare allows a process to 'unshare' part of the process
> + * context which was originally shared using clone. copy_*
> + * functions used by do_fork() cannot be used here directly
> + * because they modify an inactive task_struct that is being
> + * constructed. Here we are modifying the current, active,
> + * task_struct.
> + */
> +asmlinkage long sys_unshare(unsigned long unshare_flags)
> +{
> + int err = 0, unshare_ns = 0, unshare_mm = 0, unshare_sh = 0;
> + struct namespace *new_ns = NULL, *ns = current->namespace;
> + struct mm_struct *new_mm = NULL, *active_mm = NULL, *mm = current->mm;
> + struct sighand_struct *new_sh = NULL, *sh = current->sighand;

Aren't these flags unnecessary?

!unshare_ns == !new_ns, !unshare_mm == !new_mm, !unshare_sh == !new_sh

> +
> + err = check_unshare_flags(unshare_flags);
> + if (err)
> + goto unshare_out;
> +
> + if (unshare_flags & CLONE_NEWNS) {
> + if (ns && atomic_read(&ns->count) > 1) {
> + err = -EPERM;
> + if (!capable(CAP_SYS_ADMIN))
> + goto unshare_out;
> +
> + err = -ENOMEM;
> + new_ns = dup_namespace_struct(current);
> + if (!new_ns)
> + goto unshare_out;
> + else
> + unshare_ns = 1;

drop else

> + }
> + }
> +
> + if (unshare_flags & CLONE_VM) {
> + if (atomic_read(&mm->mm_users) > 1) {

Save some tabs...

if ((unshare_flags & CLONE_VM) && atomic_read(&mm->mm_users) > 1)

> + err = -ENOMEM;
> + new_mm = dup_mm_struct(current);
> + if (!new_mm)
> + goto unshare_cleanup_ns;
> + else {

drop else

> + unshare_mm = 1;
> + if (unshare_flags & CLONE_SIGHAND) {
> + if (atomic_read(&sh->count) > 1) {

Save some more tabs. Another compound conditional. Actually you can't
do unshare VM w/out unsharing sighand, so only checking for sighand->count
matters.
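
i.e. roughly (untested):

		/* per the flags rules, unsharing vm implies unsharing sighand */
		if (atomic_read(&current->sighand->count) > 1) {
			new_sh = dup_sighand(current);
			if (!new_sh)
				goto unshare_cleanup_mm;
		}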

> + new_sh = dup_sh_struct(current);
> + if (!new_sh)
> + goto unshare_cleanup_mm;
> + else
> + unshare_sh = 1;

drop else

> + }
> + }
> + }
> + }
> + }
> +
> + if (unshare_ns) {
> + task_lock(current);
> + current->namespace = new_ns;
> + task_unlock(current);
> + put_namespace(ns);
> + }

Typically it makes locking simpler to take and release the lock once.
Something like:

	task_lock(current);
	if (new_ns)
		current->namespace = new_ns;
	if (new_mm) {
		current->mm = new_mm;
		...
	}
	task_unlock(current);

But the put_namespace and mmput make it ugly either way.

> +
> + if (unshare_mm) {
> + task_lock(current);
> + current->min_flt = current->maj_flt = 0;
> + current->nvcsw = current->nivcsw = 0;

I don't think we need to reset these statistics; it's not a new task.

> + active_mm = current->active_mm;
> + current->mm = new_mm;
> + current->active_mm = new_mm;
> + activate_mm(active_mm, new_mm);
> + task_unlock(current);
> + mmput(mm);
> + }
> +
> + if (unshare_sh) {
> + write_lock_irq(&tasklist_lock);
> + spin_lock(&sh->siglock);
> + spin_lock(&new_sh->siglock);
> + current->sighand = new_sh;
> + spin_unlock(&new_sh->siglock);
> + spin_unlock(&sh->siglock);
> + write_unlock_irq(&tasklist_lock);
> +
> + if (atomic_dec_and_test(&sh->count))
> + kmem_cache_free(sighand_cachep, sh);

Should have a helper. Also, this looks like trouble. I believe this
could cause an oops because sighand count is used more like thread count
than refcount.
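
The helper could be as small as this (sketch, name made up; the
lifetime concern above still applies):

static void put_sighand(struct sighand_struct *sh)
{
	/* free only when the last user drops it */
	if (atomic_dec_and_test(&sh->count))
		kmem_cache_free(sighand_cachep, sh);
}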

> + }
> +
> + return 0;
> +
> +unshare_cleanup_mm:
> + if (unshare_mm)
> + mmput(new_mm);
> +
> +unshare_cleanup_ns:
> + if (unshare_ns)
> + put_namespace(new_ns);
> +
> +unshare_out:
> + return err;
> +}

2005-10-13 10:11:45

by Jamie Lokier

Subject: Re: [PATCH] New System call unshare (try 2)

Janak Desai wrote:
> +
> +/*
> + * Performs sanity checks on the flags passed to the unshare system
> + * call.
> + */
> +static inline int check_unshare_flags(unsigned long unshare_flags)

After making the changes we talked about to the above function - it
would make sense for clone() to call it too. Have the tests in one
place, so both calls are consistent with each other, and will remain
so. The atomic_read() parts are dependent on the call being
unshare(), but the bit tests should be identical I think.

-- Jamie

2005-10-13 10:13:54

by Chris Wright

Subject: Re: [PATCH] New System call unshare (try 2)

* Jamie Lokier ([email protected]) wrote:
> Janak Desai wrote:
> > Don't allow sighand unsharing if not unsharing vm
>
> Why not? It's permitted to clone with unshared sighand and shared vm,
> and it's useful too.

I think that one's just backwards. Although I do question how useful it
is to unshare sighand. Sharing vm is pretty intimate ;-)

> It's the combination shared sighand + unshared vm which is not
> allowed by clone - so I think that's what you should refuse.
>
> > Don't allow vm unsharing if task cloned with CLONE_THREAD
>
> It would be better to do what clone does, and say "don't allow sighand
> unsharing if task cloned with CLONE_THREAD". This is because
> CLONE_THREAD tasks must have shared signals.

Yes, I agree.

> In combination with the rule above for sighand (my rule, not yours),
> that implies "don't allow vm unsharing.." as a consequence.
>
> > Don't allow vm unsharing if the task is performing async io
>
> Why not?
>
> Async ios are tied to an mm (see lookup_ioctx in fs/aio.c), which may
> be shared among tasks. I see no reason why the async ios can't
> continue and be waited on in other tasks that may be using the old mm.

My concern was the case where there are no other tasks. But I don't
think that's an issue, other than having the same effect as setting up
aio and then exiting.

thanks,
-chris

2005-10-13 13:27:32

by Janak Desai

Subject: Re: [PATCH] New System call unshare (try 2)

Chris Wright wrote:
> * Jamie Lokier ([email protected]) wrote:
>
>>Janak Desai wrote:
>>
>>> Don't allow sighand unsharing if not unsharing vm
>>
>>Why not? It's permitted to clone with unshared sighand and shared vm,
>>and it's useful too.
>
>
> I think that one's just backwards. Although I do question how useful it
> is to unshare sighand. Sharing vm is pretty intimate ;-)
>
>
>>It's the combination shared sighand + unshared vm which is not
>>allowed by clone - so I think that's what you should refuse.
>>
>>
>>> Don't allow vm unsharing if task cloned with CLONE_THREAD
>>
>>It would be better to do what clone does, and say "don't allow sighand
>>unsharing if task cloned with CLONE_THREAD". This is because
>>CLONE_THREAD tasks must have shared signals.
>
>
> Yes, I agree.
>
>
>>In combination with the rule above for sighand (my rule, not yours),
>>that implies "don't allow vm unsharing.." as a consequence.
>>
>>
>>> Don't allow vm unsharing if the task is performing async io
>>
>>Why not?
>>
>>Async ios are tied to an mm (see lookup_ioctx in fs/aio.c), which may
>>be shared among tasks. I see no reason why the async ios can't
>>continue and be waited on in other tasks that may be using the old mm.
>
>
> My concern was the case where there are no other tasks. But I don't
> think that's an issue, other than having the same effect as setting up
> aio and then exiting.
>
> thanks,
> -chris

Thanks Chris and Jamie. I understand the issues that you raised and
agree with your change recommendations. I will make the necessary
changes, test aio and unsharing of signal handlers (while keeping
shared vm) and post the updated patch by tomorrow.

-Janak