2006-05-01 19:54:23

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Implement a CLONE_NEWUTS flag, and use it at clone and sys_unshare.

Signed-off-by: Serge E. Hallyn <[email protected]>

---

include/linux/sched.h | 1 +
include/linux/utsname.h | 7 ++++++
kernel/fork.c | 13 ++++++++++--
kernel/utsname.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 72 insertions(+), 2 deletions(-)

f785920e68ae2482ff76df39d6be5335bbfcc511
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3d74d77..7f4af71 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -62,6 +62,7 @@ #define CLONE_DETACHED 0x00400000 /* Un
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
+#define CLONE_NEWUTS 0x04000000 /* New utsname group? */

/*
* List of flags we want to share for kernel threads,
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index d58a406..caecec7 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -40,6 +40,8 @@ struct uts_namespace {
extern struct uts_namespace init_uts_ns;

#ifdef CONFIG_UTS_NS
+extern int unshare_utsname(unsigned long unshare_flags,
+ struct uts_namespace **new_uts);
extern int copy_utsname(int flags, struct task_struct *tsk);
extern void free_uts_ns(struct kref *kref);

@@ -70,6 +72,11 @@ static inline void put_uts_ns(struct uts
static inline void exit_utsname(struct task_struct *p)
{
}
+static inline int unshare_utsname(unsigned long unshare_flags,
+ struct uts_namespace **new_uts)
+{
+ return -EINVAL;
+}
#endif

static inline struct new_utsname *utsname(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index cedfd86..c52c274 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1566,13 +1566,14 @@ asmlinkage long sys_unshare(unsigned lon
struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
+ struct uts_namespace *new_uts = NULL;

check_unshare_flags(&unshare_flags);

/* Return -EINVAL for all unsupported flags */
err = -EINVAL;
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
- CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|CLONE_NEWUTS))
goto bad_unshare_out;

if ((err = unshare_thread(unshare_flags)))
@@ -1589,8 +1590,11 @@ asmlinkage long sys_unshare(unsigned lon
goto bad_unshare_cleanup_vm;
if ((err = unshare_semundo(unshare_flags, &new_ulist)))
goto bad_unshare_cleanup_fd;
+ if ((err = unshare_utsname(unshare_flags, &new_uts)))
+ goto bad_unshare_cleanup_fd;

- if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
+ if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+ new_uts) {

task_lock(current);

@@ -1627,6 +1631,11 @@ asmlinkage long sys_unshare(unsigned lon
new_fd = fd;
}

+ if (new_uts) {
+ put_uts_ns(current->uts_ns);
+ current->uts_ns = new_uts;
+ }
+
task_unlock(current);
}

diff --git a/kernel/utsname.c b/kernel/utsname.c
index f9e8f86..5a97a21 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -21,6 +21,41 @@ static inline void get_uts_ns(struct uts
}

/*
+ * Clone a new ns copying an original utsname, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return NULL on error (failure to kmalloc), new ns otherwise
+ */
+struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+{
+ struct uts_namespace *ns;
+
+ ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ if (ns) {
+ memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+ kref_init(&ns->kref);
+ }
+ return ns;
+}
+
+/*
+ * unshare the current process' utsname namespace.
+ * called only in sys_unshare()
+ */
+int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
+{
+ if (unshare_flags & CLONE_NEWUTS) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *new_uts = clone_uts_ns(current->uts_ns);
+ if (!*new_uts)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
* Copy task tsk's utsname namespace, or clone it if flags
* specifies CLONE_NEWUTS. In latter case, changes to the
* utsname of this process won't be seen by parent, and vice
@@ -29,6 +64,7 @@ static inline void get_uts_ns(struct uts
int copy_utsname(int flags, struct task_struct *tsk)
{
struct uts_namespace *old_ns = tsk->uts_ns;
+ struct uts_namespace *new_ns;
int err = 0;

if (!old_ns)
@@ -36,6 +72,23 @@ int copy_utsname(int flags, struct task_

get_uts_ns(old_ns);

+ if (!(flags & CLONE_NEWUTS))
+ return 0;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ new_ns = clone_uts_ns(old_ns);
+ if (!new_ns) {
+ err = -ENOMEM;
+ goto out;
+ }
+ tsk->uts_ns = new_ns;
+
+out:
+ put_uts_ns(old_ns);
return err;
}

--
1.3.0



2006-05-01 20:29:27

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

On Mon, 2006-05-01 at 14:53 -0500, Serge E. Hallyn wrote:
> +struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
> +{
> + struct uts_namespace *ns;
> +
> + ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
> + if (ns) {
> + memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
> + kref_init(&ns->kref);
> + }
> + return ns;
> +}

Very small nit...

Would this memcpy be more appropriate as a strncpy()?

> +int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
> +{
> + if (unshare_flags & CLONE_NEWUTS) {
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + *new_uts = clone_uts_ns(current->uts_ns);
> + if (!*new_uts)
> + return -ENOMEM;
> + }
> +
> + return 0;
> +}

Would it be a bit nicer to use the ERR_PTR() mechanism here instead of
the double-pointer bit?

I've always liked those a bit better because there's no hiding the fact
of what is actually a return value from a function.

-- Dave

2006-05-01 21:59:54

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

On Mon, 2006-05-01 at 16:11 -0500, Serge E. Hallyn wrote:
> Might be worth a separate patch to change over all those helpers in
> fork.c? (I think they were all brought in along with the sys_unshare
> syscall)

I'd be a little scared to touch good, working code, but it couldn't hurt
to see the patch.

-- Dave

2006-05-02 00:34:36

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Quoting Dave Hansen ([email protected]):
> On Mon, 2006-05-01 at 14:53 -0500, Serge E. Hallyn wrote:
> > +struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
> > +{
> > + struct uts_namespace *ns;
> > +
> > + ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
> > + if (ns) {
> > + memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
> > + kref_init(&ns->kref);
> > + }
> > + return ns;
> > +}
>
> Very small nit...
>
> Would this memcpy be more appropriate as a strncpy()?
>
> > +int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
> > +{
> > + if (unshare_flags & CLONE_NEWUTS) {
> > + if (!capable(CAP_SYS_ADMIN))
> > + return -EPERM;
> > +
> > + *new_uts = clone_uts_ns(current->uts_ns);
> > + if (!*new_uts)
> > + return -ENOMEM;
> > + }
> > +
> > + return 0;
> > +}
>
> Would it be a bit nicer to use the ERR_PTR() mechanism here instead of
> the double-pointer bit?
>
> I've always liked those a bit better because there's no hiding the fact
> of what is actually a return value from a function.

I agree. I was (grudgingly) copying the style from the other helpers
in fs/fork.c. Then I had to pull it out so it could cleanly return
-ENOMEM if !CONFIG_UTS, but I expect CONFIG_UTS to be yanked, and
this fn to be returned to fs/fork.c...

Might be worth a separate patch to change over all those helpers in
fork.c? (I think they were all brought in along with the sys_unshare
syscall)

Agreed on all your other points, thanks.

-serge

2006-05-02 06:55:40

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

"Serge E. Hallyn" <[email protected]> writes:

> Implement a CLONE_NEWUTS flag, and use it at clone and sys_unshare.

I still think it's a design mistake to add zillions of pointers to task_struct
for every possible kernel object. Going through a proxy datastructure to
merge common cases would be much better.

-Andi

2006-05-02 08:04:52

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Andi Kleen <[email protected]> writes:

> "Serge E. Hallyn" <[email protected]> writes:
>
>> Implement a CLONE_NEWUTS flag, and use it at clone and sys_unshare.
>
> I still think it's a design mistake to add zillions of pointers to task_struct
> for every possible kernel object. Going through a proxy datastructure to
> merge common cases would be much better.

The design point is not every kernel object. The target is one
per namespace. Or more usefully one per logical chunk of the kernel.

The UTS namespace is by far the smallest of these pieces.

The kernel networking stack is probably the largest.

At last count there were about 7 of these, enough so that the few
remaining clone bits would be sufficient.

Do you disagree that to be able to implement this properly we
need to take small steps?

Are there any semantic differences between a clone bit and what you
are proposing?

If it is just an internal implementation detail do you have a clear
suggestion on how to implement this? Possibly illustrated by the
thread flags that are already in this situation, and more likely
to always share everything.

Except for reducing reference counting I really don't see where
we could have a marked improvement. I also don't see us closing
the door to that kind of implementation at this point, but instead
taking the simple path.

Eric

2006-05-02 08:17:30

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

On Tuesday 02 May 2006 10:03, Eric W. Biederman wrote:
> Andi Kleen <[email protected]> writes:
>
> > "Serge E. Hallyn" <[email protected]> writes:
> >
> >> Implement a CLONE_NEWUTS flag, and use it at clone and sys_unshare.
> >
> > I still think it's a design mistake to add zillions of pointers to task_struct
> > for every possible kernel object. Going through a proxy datastructure to
> > merge common cases would be much better.
>
> The design point is not every kernel object. The target is one
> per namespace. Or more usefully one per logical chunk of the kernel.

Which are likely tens or even hundreds if you go through all the source.
Even tens would bloat task_struct far too much.


> The UTS namespace is by far the smallest of these pieces.
>
> The kernel networking stack is probably the largest.
>
> At last count there were about 7 of these, enough so that the few
> remaining clone bits would be sufficient.

7 additional bits will probably not be enough. I still don't
quite understand why you want individual bits for everything.
Why not group them into logical pieces?

If you really want individual bits you'll likely need to think
about a clone2() call with more flags soon.

>
> Do you disagree that to be able to implement this properly we
> need to take small steps?

No, but at least the basic infrastructure for expansion should
be added first.

> Are there any semantic differences between a clone bit and what you
> are proposing?

No just internals.

> If it is just an internal implementation detail do you have a clear
> suggestion on how to implement this? Possibly illustrated by the
> thread flags that are already in this situation, and more likely
> to always share everything.

Have a proxy structure which has pointers to the many name spaces and a bit
mask for "namespace X is different". This structure would be reference
counted. task_struct has a single pointer to it.

On clone check the clone flags against the bit mask. If there is any
difference allocate a new structure and clone the name spaces into it.
If no difference just use the old data structure after increasing its
reference count.

Possible name "nsproxy".

> Except for reducing reference counting I really don't see where
> we could have a marked improvement. I also don't see us closing
> the door to that kind of implementation at this point, but instead
> taking the simple path.

With many name spaces you would have smaller task_struct, less cache
foot print, better cache use of task_struct because slab cache colouring
will still work etc.

-Andi

2006-05-02 08:49:29

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Andi Kleen <[email protected]> writes:

> On Tuesday 02 May 2006 10:03, Eric W. Biederman wrote:
>> Andi Kleen <[email protected]> writes:
>>
>> > "Serge E. Hallyn" <[email protected]> writes:
>> >
>> >> Implement a CLONE_NEWUTS flag, and use it at clone and sys_unshare.
>> >
>> > I still think it's a design mistake to add zillions of pointers to
> task_struct
>> > for every possible kernel object. Going through a proxy datastructure to
>> > merge common cases would be much better.
>>
>> The design point is not every kernel object. The target is one
>> per namespace. Or more usefully one per logical chunk of the kernel.
>
> Which are likely tens or even hundreds if you go through all the source.
> Even tens would bloat task_struct far too much.

We don't have that many places where we put global names in
the linux api. Yes there are several and it is painful,
but we are nowhere near as high as you expect.

>> The UTS namespace is by far the smallest of these pieces.
>>
>> The kernel networking stack is probably the largest.
>>
>> At last count there were about 7 of these, enough so that the few
>> remaining clone bits would be sufficient.
>
> 7 additional bits will probably not be enough. I still don't
> quite understand why you want individual bits for everything.
> Why not group them into logical pieces?

Each bit currently maps to one logical piece, that is why I expect
to have enough bits.

> If you really want individual bits you'll likely need to think
> about a clone2() call with more flags soon.

Being short on bits should keep us thinking about keeping things
in logical chunks.

>> Do you disagree that to be able to implement this properly we
>> need to take small steps?
>
> No, but at least the basic infrastructure for expansion should
> be added first.
>
>> Are there any semantic differences between a clone bit and what you
>> are proposing?
>
> No just internals.

Good that means we can still optimize this.

> > If it is just an internal implementation detail do you have a clear
>> suggestion on how to implement this? Possibly illustrated by the
>> thread flags that are already in this situation, and more likely
>> to always share everything.
>
> Have a proxy structure which has pointers to the many name spaces and a bit
> mask for "namespace X is different". This structure would be reference
> counted. task_struct has a single pointer to it.
>
> On clone check the clone flags against the bit mask. If there is any
> difference allocate a new structure and clone the name spaces into it.
> If no difference just use the old data structure after increasing its
> reference count.
>
> Possible name "nsproxy".

Sounds reasonable, and I have nothing against it.
I simply think at this point where we are still struggling to
get the simplest namespace merged and working correctly that is a premature
optimization.

>> Except for reducing reference counting I really don't see where
>> we could have a marked improvement. I also don't see us closing
>> the door to that kind of implementation at this point, but instead
>> taking the simple path.
>
> With many name spaces you would have smaller task_struct, less cache
> foot print, better cache use of task_struct because slab cache colouring
> will still work etc.

Makes sense. I have no problem against that as an optimization,
and it is firmly on the todo list as something to try. Right now with
only 2 non-thread clone flags I believe it will obfuscate the code more
than help performance.

Eric

2006-05-02 08:56:50

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Dave Hansen <[email protected]> writes:

> On Mon, 2006-05-01 at 14:53 -0500, Serge E. Hallyn wrote:
>> +struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
>> +{
>> + struct uts_namespace *ns;
>> +
>> + ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
>> + if (ns) {
>> + memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
>> + kref_init(&ns->kref);
>> + }
>> + return ns;
>> +}
>
> Very small nit...
>
> Would this memcpy be more appropriate as a strncpy()?

Nope. It is several strings. Although a different field name
for the old utsname structure might be called for to keep people
from getting confused.

Eric

2006-05-02 17:21:01

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Quoting Andi Kleen ([email protected]):
> On Tuesday 02 May 2006 10:03, Eric W. Biederman wrote:
> 7 additional bits will probably not be enough. I still don't
> quite understand why you want individual bits for everything.
> Why not group them into logical pieces?

I wouldn't be surprised if it makes sense to combine some of them. For
instance, perhaps utsname and networking?

> Have a proxy structure which has pointers to the many name spaces and a bit
> mask for "namespace X is different".

different from what?

Oh, you mean in case we want to allow cloning a namespace outside of
fork *without* cloning the nsproxy struct?

> This structure would be reference
> counted. task_struct has a single pointer to it.

If it is reference counted, that implies it is shared between some
processes. But namespace pointers themselves are shared between some of
these nsproxy's. The lifetime mgmt here is one reason I haven't tried a
patch to do this.

Still, like Eric, I'm fine with trying that approach. It just doesn't
seem like we can draw any meaningful conclusions with just one namespace
pointer, and adding a separate reference counted object around the
utsname pointer would seem to just make the code harder to review.

> With many name spaces you would have smaller task_struct, less cache
> foot print, better cache use of task_struct because slab cache colouring
> will still work etc.

I suppose we could run some performance tests with some dummy namespace
pointers? 9 void *'s directly in the task struct, and the same inside a
refcounted container struct. The results might add some urgency to
implementing the struct nsproxy.

-serge

2006-05-02 17:31:00

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

On Tuesday 02 May 2006 19:20, Serge E. Hallyn wrote:
> Quoting Andi Kleen ([email protected]):
> > On Tuesday 02 May 2006 10:03, Eric W. Biederman wrote:
> > 7 additional bits will probably not be enough. I still don't
> > quite understand why you want individual bits for everything.
> > Why not group them into logical pieces?
>
> I wouldn't be surprised if it makes sense to combine some of them. For
> instance, perhaps utsname and networking?

I frankly would just combine everything new.

>
> > Have a proxy structure which has pointers to the many name spaces and a bit
> > mask for "namespace X is different".
>
> different from what?

>From the parent.

>
> Oh, you mean in case we want to allow cloning a namespace outside of
> fork *without* cloning the nsproxy struct?

Basically every time any name space changes you need a new nsproxy.

>
> > This structure would be reference
> > counted. task_struct has a single pointer to it.
>
> If it is reference counted, that implies it is shared between some
> processes. But namespace pointers themselves are shared between some of
> these nsproxy's. The lifetime mgmt here is one reason I haven't tried a
> patch to do this.

The livetime management is no different from having individual pointers.

> > With many name spaces you would have smaller task_struct, less cache
> > foot print, better cache use of task_struct because slab cache colouring
> > will still work etc.
>
> I suppose we could run some performance tests with some dummy namespace
> pointers? 9 void *'s directly in the task struct, and the same inside a
> refcounted container struct. The results might add some urgency to
> implementing the struct nsproxy.

Not sure you'll notice too much difference on the beginning. I am just
the opinion memory/cache bloat needs to be attacked at the root, not
when it's too late.

-Andi

2006-05-02 21:42:47

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Quoting Dave Hansen ([email protected]):
> On Mon, 2006-05-01 at 16:11 -0500, Serge E. Hallyn wrote:
> > Might be worth a separate patch to change over all those helpers in
> > fork.c? (I think they were all brought in along with the sys_unshare
> > syscall)
>
> I'd be a little scared to touch good, working code, but it couldn't hurt
> to see the patch.

Hmm, well the following untested patch was just to see the end results.
Summary: it ends up quite a bit uglier :-(

I think I like it better as is.

thanks,
-serge

Subject: [PATCH] fs/fork.c: unshare cleanup

Switch some of the unshare patch to more kernel conformant style.

Signed-off-by: Serge E. Hallyn <[email protected]>

---

kernel/fork.c | 102 ++++++++++++++++++++++++++++++++++++---------------------
1 files changed, 64 insertions(+), 38 deletions(-)

c30251d8442cfee2ada7dfd6c46159cf44011213
diff --git a/kernel/fork.c b/kernel/fork.c
index d2fa57d..42753a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1450,86 +1450,92 @@ static int unshare_thread(unsigned long
/*
* Unshare the filesystem structure if it is being shared
*/
-static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+static struct fs_struct * unshare_fs(unsigned long flags)
{
struct fs_struct *fs = current->fs;
+ struct fs_struct *new_fs = NULL;

- if ((unshare_flags & CLONE_FS) &&
+ if ((flags & CLONE_FS) &&
(fs && atomic_read(&fs->count) > 1)) {
- *new_fsp = __copy_fs_struct(current->fs);
- if (!*new_fsp)
- return -ENOMEM;
+ new_fs = __copy_fs_struct(current->fs);
+ if (!new_fs)
+ new_fs = ERR_PTR(-ENOMEM);
}

- return 0;
+ return new_fs;
}

/*
* Unshare the namespace structure if it is being shared
*/
-static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
+static struct namespace *unshare_namespace(unsigned long flags,
+ struct fs_struct *new_fs)
{
struct namespace *ns = current->namespace;
+ struct namespace *new_ns = NULL;

- if ((unshare_flags & CLONE_NEWNS) &&
+ if ((flags & CLONE_NEWNS) &&
(ns && atomic_read(&ns->count) > 1)) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!capable(CAP_SYS_ADMIN)) {
+ return ERR_PTR(-EPERM);

- *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
+ *new_ns = dup_namespace(current, new_fs ? new_fs : current->fs);
if (!*new_nsp)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}

- return 0;
+ return new_ns;
}

/*
* Unsharing of sighand for tasks created with CLONE_SIGHAND is not
* supported yet
*/
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
+static struct sighand_struct *unshare_sighand(unsigned long flags);
{
struct sighand_struct *sigh = current->sighand;
+ struct sighand_struct *new_sigh = NULL;

- if ((unshare_flags & CLONE_SIGHAND) &&
+ if ((flags & CLONE_SIGHAND) &&
(sigh && atomic_read(&sigh->count) > 1))
- return -EINVAL;
- else
- return 0;
+ return ERR_PTR(-EINVAL);
+
+ return new_sigh;
}

/*
* Unshare vm if it is being shared
*/
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
+static mm_struct *unshare_vm(unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct mm_struct *new_mm = NULL;

- if ((unshare_flags & CLONE_VM) &&
+ if ((flags & CLONE_VM) &&
(mm && atomic_read(&mm->mm_users) > 1)) {
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}

- return 0;
+ return new_mm;
}

/*
* Unshare file descriptor table if it is being shared
*/
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+static struct files_struct *unshare_fd(unsigned long flags)
{
struct files_struct *fd = current->files;
+ struct files_struct *new_fd = NULL;
int error = 0;

- if ((unshare_flags & CLONE_FILES) &&
+ if ((flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) {
- *new_fdp = dup_fd(fd, &error);
- if (!*new_fdp)
- return error;
+ new_fd = dup_fd(fd, &error);
+ if (!new_fd)
+ return ERR_PTR(error);
}

- return 0;
+ return new_fd;
}

/*
@@ -1572,16 +1578,36 @@ asmlinkage long sys_unshare(unsigned lon

if ((err = unshare_thread(unshare_flags)))
goto bad_unshare_out;
- if ((err = unshare_fs(unshare_flags, &new_fs)))
+
+ new_fs = unshare_fs(unshare_flags);
+ if (IS_ERR(new_fs)) {
+ err = PTR_ERR(new_fs);
goto bad_unshare_cleanup_thread;
- if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
+ }
+
+ new_ns = unshare_namespace(unshare_flags, &new_ns);
+ if (IS_ERR(new_ns)) {
+ err = PTR_ERR(new_ns);
goto bad_unshare_cleanup_fs;
- if ((err = unshare_sighand(unshare_flags, &new_sigh)))
+ }
+
+ new_sigh = unshare_sighand(unshare_flags);
+ if (IS_ERR(new_sigh)) {
+ err = PTR_ERR(new_sigh);
goto bad_unshare_cleanup_ns;
- if ((err = unshare_vm(unshare_flags, &new_mm)))
+ }
+ new_mm = unshare_vm(unshare_flags);
+ if (IS_ERR(new_mm)) {
+ err = PTR_ERR(new_mm);
goto bad_unshare_cleanup_sigh;
- if ((err = unshare_fd(unshare_flags, &new_fd)))
+ }
+
+ new_fd = unshare_fd(unshare_flags);
+ if (IS_ERR(new_fd)) {
+ err = PTR_ERR(new_fd);
goto bad_unshare_cleanup_vm;
+ }
+
if ((err = unshare_semundo(unshare_flags, &new_ulist)))
goto bad_unshare_cleanup_fd;

@@ -1626,24 +1652,24 @@ asmlinkage long sys_unshare(unsigned lon
}

bad_unshare_cleanup_fd:
- if (new_fd)
+ if (new_fd && !IS_ERR(new_fd))
put_files_struct(new_fd);

bad_unshare_cleanup_vm:
- if (new_mm)
+ if (new_mm && !IS_ERR(new_mm))
mmput(new_mm);

bad_unshare_cleanup_sigh:
- if (new_sigh)
+ if (new_sigh && !IS_ERR(new_sigh))
if (atomic_dec_and_test(&new_sigh->count))
kmem_cache_free(sighand_cachep, new_sigh);

bad_unshare_cleanup_ns:
- if (new_ns)
+ if (new_ns && !IS_ERR(new_ns))
put_namespace(new_ns);

bad_unshare_cleanup_fs:
- if (new_fs)
+ if (new_fs && !IS_ERR(new_fs)))
put_fs_struct(new_fs);

bad_unshare_cleanup_thread:
--
1.3.0

2006-05-03 16:11:48

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Quoting Andi Kleen ([email protected]):
> On Tuesday 02 May 2006 19:20, Serge E. Hallyn wrote:
> > Quoting Andi Kleen ([email protected]):
> > > Have a proxy structure which has pointers to the many name spaces and a bit
> > > mask for "namespace X is different".
> >
> > different from what?
>
> From the parent.

...

> > Oh, you mean in case we want to allow cloning a namespace outside of
> > fork *without* cloning the nsproxy struct?
>
> Basically every time any name space changes you need a new nsproxy.

But, either the nsproxy is shared between tasks and you need to copy
youself a new one as soon as any ns changes, or it is not shared, and
you don't need that info at all (just make the change in the nsproxy
immediately)

What am I missing?

Should we talk about this on irc someplace? Perhaps drag in Eric as
well?

> > > This structure would be reference
> > > counted. task_struct has a single pointer to it.
> >
> > If it is reference counted, that implies it is shared between some
> > processes. But namespace pointers themselves are shared between some of
> > these nsproxy's. The lifetime mgmt here is one reason I haven't tried a
> > patch to do this.
>
> The livetime management is no different from having individual pointers.

That's true if we have one nsproxy per process or thread, which I didn't
think was the case. Are you saying not to share nsproxy's among
processes which share all namespaces?

> > > With many name spaces you would have smaller task_struct, less cache
> > > foot print, better cache use of task_struct because slab cache colouring
> > > will still work etc.
> >
> > I suppose we could run some performance tests with some dummy namespace
> > pointers? 9 void *'s directly in the task struct, and the same inside a
> > refcounted container struct. The results might add some urgency to
> > implementing the struct nsproxy.
>
> Not sure you'll notice too much difference on the beginning. I am just

9 void*'s is probably more than we'll need, though, so it's not "the
beginning". Eric previously mentioned uts, sysvipc, net, pid, and uid,
to which we might add proc, sysctl, and signals, though those are
probably just implied through the others.

What others do you see us needing?

If the number were more likely to be 50, then in the above experiment
use 50 instead - the point was to see the performance implications
without implementing the namespaces first.

Anyway I guess I'll go ahead and queue up some tests.

> the opinion memory/cache bloat needs to be attacked at the root, not
> when it's too late.

-serge

2006-05-03 16:19:42

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Quoting Serge E. Hallyn ([email protected]):
> Quoting Andi Kleen ([email protected]):
> > On Tuesday 02 May 2006 19:20, Serge E. Hallyn wrote:
> > > > With many name spaces you would have smaller task_struct, less cache
> > > > foot print, better cache use of task_struct because slab cache colouring
> > > > will still work etc.
> > >
> > > I suppose we could run some performance tests with some dummy namespace
> > > pointers? 9 void *'s directly in the task struct, and the same inside a
> > > refcounted container struct. The results might add some urgency to
> > > implementing the struct nsproxy.
> >
> > Not sure you'll notice too much difference on the beginning. I am just
>
> 9 void*'s is probably more than we'll need, though, so it's not "the
> beginning". Eric previously mentioned uts, sysvipc, net, pid, and uid,
> to which we might add proc, sysctl, and signals, though those are
> probably just implied through the others.
>
> What others do you see us needing?
>
> If the number were more likely to be 50, then in the above experiment
> use 50 instead - the point was to see the performance implications
> without implementing the namespaces first.
>
> Anyway I guess I'll go ahead and queue up some tests.

Though of course one reason those tests won't be very meaningful is that
the void*'s won't be being dereferenced, so we won't be accounting for
the performance hit of the double dereference and resulting cache
hits...

-serge

2006-05-05 06:44:47

by Herbert Poetzl

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

On Wed, May 03, 2006 at 11:11:43AM -0500, Serge E. Hallyn wrote:
> Quoting Andi Kleen ([email protected]):
> > On Tuesday 02 May 2006 19:20, Serge E. Hallyn wrote:
> > > Quoting Andi Kleen ([email protected]):
> > > > Have a proxy structure which has pointers to the many name spaces and a bit
> > > > mask for "namespace X is different".
> > >
> > > different from what?
> >
> > From the parent.
>
> ...
>
> > > Oh, you mean in case we want to allow cloning a namespace outside of
> > > fork *without* cloning the nsproxy struct?
> >
> > Basically every time any name space changes you need a new nsproxy.
>
> But, either the nsproxy is shared between tasks and you need to copy
> youself a new one as soon as any ns changes, or it is not shared, and
> you don't need that info at all (just make the change in the nsproxy
> immediately)
>
> What am I missing?
>
> Should we talk about this on irc someplace? Perhaps drag in Eric as
> well?

good idea, feel free to use #vserver (irc.oftc.net) for that

> > > > This structure would be reference
> > > > counted. task_struct has a single pointer to it.
> > >
> > > If it is reference counted, that implies it is shared between some
> > > processes. But namespace pointers themselves are shared between some of
> > > these nsproxy's. The lifetime mgmt here is one reason I haven't tried a
> > > patch to do this.
> >
> > The livetime management is no different from having individual pointers.
>
> That's true if we have one nsproxy per process or thread, which I didn't
> think was the case. Are you saying not to share nsproxy's among
> processes which share all namespaces?
>
> > > > With many name spaces you would have smaller task_struct, less cache
> > > > foot print, better cache use of task_struct because slab cache colouring
> > > > will still work etc.
> > >
> > > I suppose we could run some performance tests with some dummy namespace
> > > pointers? 9 void *'s directly in the task struct, and the same inside a
> > > refcounted container struct. The results might add some urgency to
> > > implementing the struct nsproxy.
> >
> > Not sure you'll notice too much difference on the beginning. I am just
>
> 9 void*'s is probably more than we'll need, though, so it's not "the
> beginning". Eric previously mentioned uts, sysvipc, net, pid, and uid,
> to which we might add proc, sysctl, and signals, though those are
> probably just implied through the others.

> What others do you see us needing?

the 'container', as well as accounting and resource limits
but they are not required in the beginning either

> If the number were more likely to be 50, then in the above experiment
> use 50 instead - the point was to see the performance implications
> without implementing the namespaces first.
>
> Anyway I guess I'll go ahead and queue up some tests.

good!

best,
Herbert

> > the opinion memory/cache bloat needs to be attacked at the root, not
> > when it's too late.
>
> -serge

2006-05-05 11:03:13

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag


> But, either the nsproxy is shared between tasks and you need to copy
> youself a new one as soon as any ns changes

That would be the case. But it is only shared between tasks where
all the name spaces are the same.

> , or it is not shared, and
> you don't need that info at all (just make the change in the nsproxy
> immediately)

Don't follow you here.

Basically the goal is to have a minimum number of nsproxies in the system without
having to maintain a global hash table. So instead you assume that name space
changes are infrequent. In the common case of clone without a name space change
you just share the nsproxy of the parent. If there is a name space change of
any kind you get a new one.

This won't get the absolute minimum number of nsproxies, but should be reasonably
good without too much effort.

-Andi

>

2006-05-05 11:43:41

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Quoting Andi Kleen ([email protected]):
>
> > But, either the nsproxy is shared between tasks and you need to copy
> > youself a new one as soon as any ns changes
>
> That would be the case. But it is only shared between tasks where
> all the name spaces are the same.

Ok, that is how I was thinking.

> > , or it is not shared, and
> > you don't need that info at all (just make the change in the nsproxy
> > immediately)
>
> Don't follow you here.
>
> Basically the goal is to have a minimum number of nsproxies in the system without
> having to maintain a global hash table. So instead you assume that name space
> changes are infrequent. In the common case of clone without a name space change
> you just share the nsproxy of the parent. If there is a name space change of
> any kind you get a new one.
>
> This won't get the absolute minimum number of nsproxies, but should be reasonably
> good without too much effort.

Ok. Then I maintain that the bitmap of changed namespaces seems
unnecessary. Since you're likely sharing an nsproxy with your parent
process, when you clone a new namespace you just want to immediately get
a new nsproxy pointing to the new namespaces.

Anyway this seems simple enough to just code up. Simpler than
continuing to talk about it :)

-serge

2006-05-05 12:17:59

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

Quoting Herbert Poetzl ([email protected]):
> On Wed, May 03, 2006 at 11:11:43AM -0500, Serge E. Hallyn wrote:
> > Should we talk about this on irc someplace? Perhaps drag in Eric as
> > well?
>
> good idea, feel free to use #vserver (irc.oftc.net) for that

Moves a bit too fast for me to be able to keep up, but I'll try to hang
out there more than I have been :)

> > Anyway I guess I'll go ahead and queue up some tests.
>
> good!

As I later mentioned, this won't necessary give honest picture of
performance implications, since some namespaces might need to be
frequently referenced, in which case the extra indirection from
task_struct->container->nsstruct may give a big hit.

But perhaps I can use the utsname ns on top of the experimental
container patch to test that aspect.

thanks,
-serge

2006-05-05 14:31:53

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

"Serge E. Hallyn" <[email protected]> writes:
>
> Ok. Then I maintain that the bitmap of changed namespaces seems
> unnecessary.

I didn't spell it out, but it's obviously to optimize cache footprint
of clone(). I expect nsproxy to be eventually more than a cacheline
and with a bitmap test you can avoid checking it all.

-Andi

2006-05-05 15:56:48

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 7/7] uts namespaces: Implement CLONE_NEWUTS flag

"Serge E. Hallyn" <[email protected]> writes:

> Ok. Then I maintain that the bitmap of changed namespaces seems
> unnecessary. Since you're likely sharing an nsproxy with your parent
> process, when you clone a new namespace you just want to immediately get
> a new nsproxy pointing to the new namespaces.
>
> Anyway this seems simple enough to just code up. Simpler than
> continuing to talk about it :)

For testing please include both uts and the filesystem namespace
in nsproxy.

That should give you at least one namespace that is frequently used.

Eric