2006-05-18 15:47:32

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 0/9] namespaces: Introduction

This patchset introduces a per-process utsname namespace. Such namespaces
can be used by openvz, vserver, and application migration to virtualize and
isolate utsname info (i.e. hostname). More resources will follow, until
hopefully most or all vserver and openvz functionality can be implemented
by controlling resource namespaces from userspace.

Previous utsname submissions placed a pointer to the utsname namespace
straight in the task_struct. This patchset (and the last one) moves
it and the filesystem namespace pointer into struct nsproxy, which is
shared by processes sharing all namespaces. The intent is to keep
the task_struct smaller as the number of namespaces grows.

Changes:
- the reference count on fs namespace and uts namespace now
refers to the number of nsproxies pointing to it
- some consolidation of namespace cloning and exit code to
clean up kernel/{fork,exit}.c
- passed ltp and ltpstress on smp power, x86, and x86-64
boxes.


2006-05-18 15:49:20

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 2/9] namespaces: incorporate fs namespace into nsproxy

This moves the mount namespace into the nsproxy. The mount
namespace count now refers to the number of nsproxies pointing
to it, rather than the number of tasks. As a result, the
unshare_namespace() function in kernel/fork.c no longer checks
whether it is being shared.

Signed-off-by: Serge Hallyn <[email protected]>

---

fs/namespace.c | 22 ++++++++--------------
fs/proc/base.c | 5 +++--
include/linux/init_task.h | 1 +
include/linux/namespace.h | 6 ++----
include/linux/nsproxy.h | 3 +++
include/linux/sched.h | 4 +---
kernel/exit.c | 5 -----
kernel/fork.c | 19 +++++++------------
kernel/nsproxy.c | 40 +++++++++++++++++++++++++++++++++++-----
9 files changed, 60 insertions(+), 45 deletions(-)

a06a931bb7f9f82df3267dfce14c8a9784ae68e4
diff --git a/fs/namespace.c b/fs/namespace.c
index 2c5f1f8..33330fe 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -133,7 +133,7 @@ struct vfsmount *lookup_mnt(struct vfsmo

static inline int check_mnt(struct vfsmount *mnt)
{
- return mnt->mnt_namespace == current->namespace;
+ return mnt->mnt_namespace == current->nsproxy->namespace;
}

static void touch_namespace(struct namespace *ns)
@@ -832,7 +832,7 @@ static int attach_recursive_mnt(struct v
if (parent_nd) {
detach_mnt(source_mnt, parent_nd);
attach_mnt(source_mnt, nd);
- touch_namespace(current->namespace);
+ touch_namespace(current->nsproxy->namespace);
} else {
mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
commit_tree(source_mnt);
@@ -1372,7 +1372,7 @@ dput_out:
*/
struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
{
- struct namespace *namespace = tsk->namespace;
+ struct namespace *namespace = tsk->nsproxy->namespace;
struct namespace *new_ns;
struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
struct vfsmount *p, *q;
@@ -1439,7 +1439,7 @@ struct namespace *dup_namespace(struct t

int copy_namespace(int flags, struct task_struct *tsk)
{
- struct namespace *namespace = tsk->namespace;
+ struct namespace *namespace = tsk->nsproxy->namespace;
struct namespace *new_ns;
int err = 0;

@@ -1462,7 +1462,7 @@ int copy_namespace(int flags, struct tas
goto out;
}

- tsk->namespace = new_ns;
+ tsk->nsproxy->namespace = new_ns;

out:
put_namespace(namespace);
@@ -1685,7 +1685,7 @@ asmlinkage long sys_pivot_root(const cha
detach_mnt(user_nd.mnt, &root_parent);
attach_mnt(user_nd.mnt, &old_nd); /* mount old root on put_old */
attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */
- touch_namespace(current->namespace);
+ touch_namespace(current->nsproxy->namespace);
spin_unlock(&vfsmount_lock);
chroot_fs_refs(&user_nd, &new_nd);
security_sb_post_pivotroot(&user_nd, &new_nd);
@@ -1711,7 +1711,6 @@ static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct namespace *namespace;
- struct task_struct *g, *p;

mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
if (IS_ERR(mnt))
@@ -1727,13 +1726,8 @@ static void __init init_mount_tree(void)
namespace->root = mnt;
mnt->mnt_namespace = namespace;

- init_task.namespace = namespace;
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- get_namespace(namespace);
- p->namespace = namespace;
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
+ init_task.nsproxy->namespace = namespace;
+ get_namespace(namespace);

set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root);
set_fs_root(current->fs, namespace->root, namespace->root->mnt_root);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6cc77dc..f74acae 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -72,6 +72,7 @@
#include <linux/cpuset.h>
#include <linux/audit.h>
#include <linux/poll.h>
+#include <linux/nsproxy.h>
#include "internal.h"

/*
@@ -685,7 +686,7 @@ static int mounts_open(struct inode *ino
int ret = -EINVAL;

task_lock(task);
- namespace = task->namespace;
+ namespace = task->nsproxy->namespace;
if (namespace)
get_namespace(namespace);
task_unlock(task);
@@ -752,7 +753,7 @@ static int mountstats_open(struct inode
struct seq_file *m = file->private_data;
struct namespace *namespace;
task_lock(task);
- namespace = task->namespace;
+ namespace = task->nsproxy->namespace;
if (namespace)
get_namespace(namespace);
task_unlock(task);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 79ec4ea..672dc04 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -70,6 +70,7 @@ extern struct nsproxy init_nsproxy;
#define INIT_NSPROXY(nsproxy) { \
.count = ATOMIC_INIT(1), \
.nslock = SPIN_LOCK_UNLOCKED, \
+ .namespace = NULL, \
}

#define INIT_SIGHAND(sighand) { \
diff --git a/include/linux/namespace.h b/include/linux/namespace.h
index 3abc8e3..d137009 100644
--- a/include/linux/namespace.h
+++ b/include/linux/namespace.h
@@ -4,6 +4,7 @@

#include <linux/mount.h>
#include <linux/sched.h>
+#include <linux/nsproxy.h>

struct namespace {
atomic_t count;
@@ -26,11 +27,8 @@ static inline void put_namespace(struct

static inline void exit_namespace(struct task_struct *p)
{
- struct namespace *namespace = p->namespace;
+ struct namespace *namespace = p->nsproxy->namespace;
if (namespace) {
- task_lock(p);
- p->namespace = NULL;
- task_unlock(p);
put_namespace(namespace);
}
}
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 7bdebfa..7ebe666 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -4,6 +4,8 @@
#include <linux/spinlock.h>
#include <linux/sched.h>

+struct namespace;
+
/*
* A structure to contain pointers to all per-process
* namespaces - fs (mount), uts, network, sysvipc, etc.
@@ -19,6 +21,7 @@
struct nsproxy {
atomic_t count;
spinlock_t nslock;
+ struct namespace *namespace;
};
extern struct nsproxy init_nsproxy;

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4c0bbb3..f2c945b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -235,7 +235,6 @@ extern signed long schedule_timeout_inte
extern signed long schedule_timeout_uninterruptible(signed long timeout);
asmlinkage void schedule(void);

-struct namespace;
struct nsproxy;

/* Maximum number of active map areas.. This is a random (large) number */
@@ -807,8 +806,7 @@ struct task_struct {
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
-/* namespace */
- struct namespace *namespace;
+/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
diff --git a/kernel/exit.c b/kernel/exit.c
index da2fc84..240d0df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -36,7 +36,6 @@
#include <linux/compat.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
-#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -415,11 +414,8 @@ void daemonize(const char *name, ...)
current->fs = fs;
atomic_inc(&fs->count);

- exit_namespace(current);
exit_task_namespaces(current);
- current->namespace = init_task.namespace;
current->nsproxy = init_task.nsproxy;
- get_namespace(current->namespace);
get_task_namespaces(current);

exit_files(current);
@@ -924,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long co
exit_sem(tsk);
__exit_files(tsk);
__exit_fs(tsk);
- exit_namespace(tsk);
exit_task_namespaces(tsk);
exit_thread();
cpuset_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 60303c3..ddab7e7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1063,11 +1063,9 @@ static task_t *copy_process(unsigned lon
goto bad_fork_cleanup_mm;
if ((retval = copy_namespaces(clone_flags, p)))
goto bad_fork_cleanup_keys;
- if ((retval = copy_namespace(clone_flags, p)))
- goto bad_fork_cleanup_namespaces;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
- goto bad_fork_cleanup_namespace;
+ goto bad_fork_cleanup_namespaces;

p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
@@ -1154,7 +1152,7 @@ static task_t *copy_process(unsigned lon
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_cleanup_namespace;
+ goto bad_fork_cleanup_namespaces;
}

if (clone_flags & CLONE_THREAD) {
@@ -1167,7 +1165,7 @@ static task_t *copy_process(unsigned lon
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -EAGAIN;
- goto bad_fork_cleanup_namespace;
+ goto bad_fork_cleanup_namespaces;
}

p->group_leader = current->group_leader;
@@ -1219,8 +1217,6 @@ static task_t *copy_process(unsigned lon
proc_fork_connector(p);
return p;

-bad_fork_cleanup_namespace:
- exit_namespace(p);
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_keys:
@@ -1472,10 +1468,9 @@ static int unshare_fs(unsigned long unsh
*/
static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
{
- struct namespace *ns = current->namespace;
+ struct namespace *ns = current->nsproxy->namespace;

- if ((unshare_flags & CLONE_NEWNS) &&
- (ns && atomic_read(&ns->count) > 1)) {
+ if ((unshare_flags & CLONE_NEWNS) && ns) {
if (!capable(CAP_SYS_ADMIN))
return -EPERM;

@@ -1608,8 +1603,8 @@ asmlinkage long sys_unshare(unsigned lon
}

if (new_ns) {
- ns = current->namespace;
- current->namespace = new_ns;
+ ns = current->nsproxy->namespace;
+ current->nsproxy->namespace = new_ns;
new_ns = ns;
}

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 6f53df1..4103f58 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -1,18 +1,19 @@
/*
* Copyright (C) 2006 IBM Corporation
- *
- * Author: Serge Hallyn <[email protected]>
- *
+ *
+ * Author: Serge Hallyn <[email protected]>
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, version 2 of the
* License.
- */
+ */

#include <linux/compile.h>
#include <linux/module.h>
#include <linux/version.h>
#include <linux/nsproxy.h>
+#include <linux/namespace.h>

static inline void get_nsproxy(struct nsproxy *ns)
{
@@ -53,6 +54,11 @@ struct nsproxy *dup_namespaces(struct ns
{
struct nsproxy *ns = clone_namespaces(orig);

+ if (ns) {
+ if (ns->namespace)
+ get_namespace(ns->namespace);
+ }
+
return ns;
}

@@ -63,16 +69,40 @@ struct nsproxy *dup_namespaces(struct ns
int copy_namespaces(int flags, struct task_struct *tsk)
{
struct nsproxy *old_ns = tsk->nsproxy;
+ struct nsproxy *new_ns;
+ int err = 0;

if (!old_ns)
return 0;

get_nsproxy(old_ns);

- return 0;
+ if (!(flags & CLONE_NEWNS))
+ return 0;
+
+ new_ns = clone_namespaces(old_ns);
+ if (!new_ns) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ tsk->nsproxy = new_ns;
+
+ err = copy_namespace(flags, tsk);
+ if (err) {
+ tsk->nsproxy = old_ns;
+ put_nsproxy(new_ns);
+ goto out;
+ }
+
+out:
+ put_nsproxy(old_ns);
+ return err;
}

void free_nsproxy(struct nsproxy *ns)
{
+ if (ns->namespace)
+ put_namespace(ns->namespace);
kfree(ns);
}
--
1.1.6

2006-05-18 15:48:53

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 1/9] namespaces: add nsproxy

This patch adds a nsproxy structure to the task struct. Later
patches will move the fs namespace pointer into this structure,
and introduce a new utsname namespace into the nsproxy.

The vserver and openvz functionality, then, would be implemented
in large part by virtualizing/isolating more and more resources
into namespaces, each contained in the nsproxy.

Signed-off-by: Serge Hallyn <[email protected]>

---

arch/alpha/kernel/init_task.c | 2 +
arch/arm/kernel/init_task.c | 2 +
arch/arm26/kernel/init_task.c | 2 +
arch/frv/kernel/init_task.c | 2 +
arch/h8300/kernel/init_task.c | 2 +
arch/i386/kernel/init_task.c | 2 +
arch/ia64/kernel/init_task.c | 2 +
arch/m32r/kernel/init_task.c | 2 +
arch/m68knommu/kernel/init_task.c | 2 +
arch/mips/kernel/init_task.c | 2 +
arch/parisc/kernel/init_task.c | 2 +
arch/powerpc/kernel/init_task.c | 2 +
arch/s390/kernel/init_task.c | 2 +
arch/sh/kernel/init_task.c | 2 +
arch/sh64/kernel/init_task.c | 2 +
arch/sparc/kernel/init_task.c | 2 +
arch/sparc64/kernel/init_task.c | 2 +
arch/um/kernel/init_task.c | 2 +
arch/v850/kernel/init_task.c | 2 +
arch/x86_64/kernel/init_task.c | 2 +
include/linux/init_task.h | 7 +++
include/linux/nsproxy.h | 45 +++++++++++++++++++++
include/linux/sched.h | 2 +
kernel/Makefile | 2 -
kernel/exit.c | 7 +++
kernel/fork.c | 18 ++++++++-
kernel/nsproxy.c | 78 +++++++++++++++++++++++++++++++++++++
27 files changed, 197 insertions(+), 2 deletions(-)
create mode 100644 include/linux/nsproxy.h
create mode 100644 kernel/nsproxy.c

7c65cea6d1931f03867fad978f33072a4b0a4602
diff --git a/arch/alpha/kernel/init_task.c b/arch/alpha/kernel/init_task.c
index 835d09a..83d0902 100644
--- a/arch/alpha/kernel/init_task.c
+++ b/arch/alpha/kernel/init_task.c
@@ -5,6 +5,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>
#include <asm/uaccess.h>


@@ -13,6 +14,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
struct task_struct init_task = INIT_TASK(init_task);

EXPORT_SYMBOL(init_mm);
diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c
index a00cca0..80f5eeb 100644
--- a/arch/arm/kernel/init_task.c
+++ b/arch/arm/kernel/init_task.c
@@ -8,6 +8,7 @@
#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -17,6 +18,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/arm26/kernel/init_task.c b/arch/arm26/kernel/init_task.c
index 4191565..678c7b5 100644
--- a/arch/arm26/kernel/init_task.c
+++ b/arch/arm26/kernel/init_task.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -20,6 +21,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c
index 2299393..5ec2742 100644
--- a/arch/frv/kernel/init_task.c
+++ b/arch/frv/kernel/init_task.c
@@ -5,6 +5,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -15,6 +16,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c
index 19272c2..ef5755a 100644
--- a/arch/h8300/kernel/init_task.c
+++ b/arch/h8300/kernel/init_task.c
@@ -8,6 +8,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -17,6 +18,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c
index cff95d1..bd97f69 100644
--- a/arch/i386/kernel/init_task.c
+++ b/arch/i386/kernel/init_task.c
@@ -5,6 +5,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -15,6 +16,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c
index b69c397..2d62471 100644
--- a/arch/ia64/kernel/init_task.c
+++ b/arch/ia64/kernel/init_task.c
@@ -12,6 +12,7 @@
#include <linux/sched.h>
#include <linux/init_task.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -21,6 +22,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c
index 9e508fd..0057475 100644
--- a/arch/m32r/kernel/init_task.c
+++ b/arch/m32r/kernel/init_task.c
@@ -7,6 +7,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -16,6 +17,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c
index 3897043..b99fc6d 100644
--- a/arch/m68knommu/kernel/init_task.c
+++ b/arch/m68knommu/kernel/init_task.c
@@ -8,6 +8,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -17,6 +18,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c
index aeda7f5..dfe47e6 100644
--- a/arch/mips/kernel/init_task.c
+++ b/arch/mips/kernel/init_task.c
@@ -4,6 +4,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/thread_info.h>
#include <asm/uaccess.h>
@@ -14,6 +15,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c
index 8384bf9..c0c43e2 100644
--- a/arch/parisc/kernel/init_task.c
+++ b/arch/parisc/kernel/init_task.c
@@ -28,6 +28,7 @@
#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -38,6 +39,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c
index 941043a..e24ace6 100644
--- a/arch/powerpc/kernel/init_task.c
+++ b/arch/powerpc/kernel/init_task.c
@@ -5,6 +5,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>
#include <asm/uaccess.h>

static struct fs_struct init_fs = INIT_FS;
@@ -12,6 +13,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c
index d73a740..0918921 100644
--- a/arch/s390/kernel/init_task.c
+++ b/arch/s390/kernel/init_task.c
@@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <linux/init_task.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -20,6 +21,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c
index 44053ea..81caf0f 100644
--- a/arch/sh/kernel/init_task.c
+++ b/arch/sh/kernel/init_task.c
@@ -3,6 +3,7 @@
#include <linux/sched.h>
#include <linux/init_task.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -12,6 +13,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/sh64/kernel/init_task.c b/arch/sh64/kernel/init_task.c
index de2d07d..0c95f40 100644
--- a/arch/sh64/kernel/init_task.c
+++ b/arch/sh64/kernel/init_task.c
@@ -14,6 +14,7 @@
#include <linux/sched.h>
#include <linux/init_task.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -23,6 +24,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

struct pt_regs fake_swapper_regs;

diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c
index fc31de6..a73926d 100644
--- a/arch/sparc/kernel/init_task.c
+++ b/arch/sparc/kernel/init_task.c
@@ -3,6 +3,7 @@
#include <linux/sched.h>
#include <linux/init_task.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
@@ -12,6 +13,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
struct task_struct init_task = INIT_TASK(init_task);

EXPORT_SYMBOL(init_mm);
diff --git a/arch/sparc64/kernel/init_task.c b/arch/sparc64/kernel/init_task.c
index 329b38f..f1e9a4b 100644
--- a/arch/sparc64/kernel/init_task.c
+++ b/arch/sparc64/kernel/init_task.c
@@ -3,6 +3,7 @@
#include <linux/sched.h>
#include <linux/init_task.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/pgtable.h>
#include <asm/uaccess.h>
@@ -13,6 +14,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c
index 49ed5dd..11188af 100644
--- a/arch/um/kernel/init_task.c
+++ b/arch/um/kernel/init_task.c
@@ -9,6 +9,7 @@
#include "linux/sched.h"
#include "linux/init_task.h"
#include "linux/mqueue.h"
+#include "linux/nsproxy.h"
#include "asm/uaccess.h"
#include "asm/pgtable.h"
#include "user_util.h"
@@ -17,6 +18,7 @@

static struct fs_struct init_fs = INIT_FS;
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
diff --git a/arch/v850/kernel/init_task.c b/arch/v850/kernel/init_task.c
index ed2f93c..9d2de75 100644
--- a/arch/v850/kernel/init_task.c
+++ b/arch/v850/kernel/init_task.c
@@ -16,6 +16,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -25,6 +26,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS (init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM (init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index ce31d90..1c87ea0 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -5,6 +5,7 @@
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/mqueue.h>
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -15,6 +16,7 @@ static struct files_struct init_files =
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
+struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

EXPORT_SYMBOL(init_mm);

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 41ecbb8..79ec4ea 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -66,6 +66,12 @@
.session = 1, \
}

+extern struct nsproxy init_nsproxy;
+#define INIT_NSPROXY(nsproxy) { \
+ .count = ATOMIC_INIT(1), \
+ .nslock = SPIN_LOCK_UNLOCKED, \
+}
+
#define INIT_SIGHAND(sighand) { \
.count = ATOMIC_INIT(1), \
.action = { { { .sa_handler = NULL, } }, }, \
@@ -114,6 +120,7 @@ extern struct group_info init_groups;
.files = &init_files, \
.signal = &init_signals, \
.sighand = &init_sighand, \
+ .nsproxy = &init_nsproxy, \
.pending = { \
.list = LIST_HEAD_INIT(tsk.pending.list), \
.signal = {{0}}}, \
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
new file mode 100644
index 0000000..7bdebfa
--- /dev/null
+++ b/include/linux/nsproxy.h
@@ -0,0 +1,45 @@
+#ifndef _LINUX_NSPROXY_H
+#define _LINUX_NSPROXY_H
+
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+
+/*
+ * A structure to contain pointers to all per-process
+ * namespaces - fs (mount), uts, network, sysvipc, etc.
+ *
+ * 'count' is the number of tasks holding a reference.
+ * The count for each namespace, then, will be the number
+ * of nsproxies pointing to it, not the number of tasks.
+ *
+ * The nsproxy is shared by tasks which share all namespaces.
+ * As soon as a single namespace is cloned or unshared, the
+ * nsproxy is copied.
+ */
+struct nsproxy {
+ atomic_t count;
+ spinlock_t nslock;
+};
+extern struct nsproxy init_nsproxy;
+
+struct nsproxy *dup_namespaces(struct nsproxy *orig);
+int copy_namespaces(int flags, struct task_struct *tsk);
+void get_task_namespaces(struct task_struct *tsk);
+void free_nsproxy(struct nsproxy *ns);
+
+static inline void put_nsproxy(struct nsproxy *ns)
+{
+ if (atomic_dec_and_test(&ns->count)) {
+ free_nsproxy(ns);
+ }
+}
+
+static inline void exit_task_namespaces(struct task_struct *p)
+{
+ struct nsproxy *ns = p->nsproxy;
+ if (ns) {
+ put_nsproxy(ns);
+ p->nsproxy = NULL;
+ }
+}
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29b7d4f..4c0bbb3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -236,6 +236,7 @@ extern signed long schedule_timeout_unin
asmlinkage void schedule(void);

struct namespace;
+struct nsproxy;

/* Maximum number of active map areas.. This is a random (large) number */
#define DEFAULT_MAX_MAP_COUNT 65536
@@ -808,6 +809,7 @@ struct task_struct {
struct files_struct *files;
/* namespace */
struct namespace *namespace;
+ struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9..215fb33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o
+ hrtimer.o nsproxy.o

obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_FUTEX) += futex.o
diff --git a/kernel/exit.c b/kernel/exit.c
index e95b932..da2fc84 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -36,6 +36,7 @@
#include <linux/compat.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
+#include <linux/nsproxy.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -413,9 +414,14 @@ void daemonize(const char *name, ...)
fs = init_task.fs;
current->fs = fs;
atomic_inc(&fs->count);
+
exit_namespace(current);
+ exit_task_namespaces(current);
current->namespace = init_task.namespace;
+ current->nsproxy = init_task.nsproxy;
get_namespace(current->namespace);
+ get_task_namespaces(current);
+
exit_files(current);
current->files = init_task.files;
atomic_inc(&current->files->count);
@@ -919,6 +925,7 @@ fastcall NORET_TYPE void do_exit(long co
__exit_files(tsk);
__exit_fs(tsk);
exit_namespace(tsk);
+ exit_task_namespaces(tsk);
exit_thread();
cpuset_exit(tsk);
exit_keys(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index ac8100e..60303c3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -44,6 +44,7 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/cn_proc.h>
+#include <linux/nsproxy.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1060,8 +1061,10 @@ static task_t *copy_process(unsigned lon
goto bad_fork_cleanup_signal;
if ((retval = copy_keys(clone_flags, p)))
goto bad_fork_cleanup_mm;
- if ((retval = copy_namespace(clone_flags, p)))
+ if ((retval = copy_namespaces(clone_flags, p)))
goto bad_fork_cleanup_keys;
+ if ((retval = copy_namespace(clone_flags, p)))
+ goto bad_fork_cleanup_namespaces;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_namespace;
@@ -1218,6 +1221,8 @@ static task_t *copy_process(unsigned lon

bad_fork_cleanup_namespace:
exit_namespace(p);
+bad_fork_cleanup_namespaces:
+ exit_task_namespaces(p);
bad_fork_cleanup_keys:
exit_keys(p);
bad_fork_cleanup_mm:
@@ -1559,6 +1564,7 @@ asmlinkage long sys_unshare(unsigned lon
struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
+ struct nsproxy *new_nsproxy, *old_nsproxy;

check_unshare_flags(&unshare_flags);

@@ -1585,7 +1591,15 @@ asmlinkage long sys_unshare(unsigned lon

if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {

+ old_nsproxy = current->nsproxy;
+ new_nsproxy = dup_namespaces(old_nsproxy);
+ if (!new_nsproxy) {
+ err = -ENOMEM;
+ goto bad_unshare_cleanup_semundo;
+ }
+
task_lock(current);
+ current->nsproxy = new_nsproxy;

if (new_fs) {
fs = current->fs;
@@ -1621,8 +1635,10 @@ asmlinkage long sys_unshare(unsigned lon
}

task_unlock(current);
+ put_nsproxy(old_nsproxy);
}

+bad_unshare_cleanup_semundo:
bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
new file mode 100644
index 0000000..6f53df1
--- /dev/null
+++ b/kernel/nsproxy.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2006 IBM Corporation
+ *
+ * Author: Serge Hallyn <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/compile.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/nsproxy.h>
+
+static inline void get_nsproxy(struct nsproxy *ns)
+{
+ atomic_inc(&ns->count);
+}
+
+void get_task_namespaces(struct task_struct *tsk)
+{
+ struct nsproxy *ns = tsk->nsproxy;
+ if (ns) {
+ get_nsproxy(ns);
+ }
+}
+
+/*
+ * creates a copy of "orig" with refcount 1.
+ * This does not grab references to the contained namespaces,
+ * so that needs to be done by dup_namespaces.
+ */
+static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
+{
+ struct nsproxy *ns;
+
+ ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL);
+ if (ns) {
+ memcpy(ns, orig, sizeof(struct nsproxy));
+ atomic_set(&ns->count, 1);
+ }
+ return ns;
+}
+
+/*
+ * copies the nsproxy, setting refcount to 1, and grabbing a
+ * reference to all contained namespaces. Called from
+ * sys_unshare()
+ */
+struct nsproxy *dup_namespaces(struct nsproxy *orig)
+{
+ struct nsproxy *ns = clone_namespaces(orig);
+
+ return ns;
+}
+
+/*
+ * called from clone. This now handles copy for nsproxy and all
+ * namespaces therein.
+ */
+int copy_namespaces(int flags, struct task_struct *tsk)
+{
+ struct nsproxy *old_ns = tsk->nsproxy;
+
+ if (!old_ns)
+ return 0;
+
+ get_nsproxy(old_ns);
+
+ return 0;
+}
+
+void free_nsproxy(struct nsproxy *ns)
+{
+ kfree(ns);
+}
--
1.1.6

2006-05-18 15:49:49

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

Replace references to system_utsname with the per-process uts namespace
where appropriate. This includes things like uname.

Changes: Per Eric Biederman's comments, use the per-process uts namespace
for ELF_PLATFORM, sunrpc, and parts of net/ipv4/ipconfig.c

Signed-off-by: Serge E. Hallyn <[email protected]>

---

arch/alpha/kernel/osf_sys.c | 24 ++++++++++++------------
arch/i386/kernel/sys_i386.c | 12 ++++++------
arch/ia64/sn/kernel/sn2/sn_hwperf.c | 2 +-
arch/m32r/kernel/sys_m32r.c | 2 +-
arch/mips/kernel/linux32.c | 2 +-
arch/mips/kernel/syscall.c | 18 +++++++++---------
arch/mips/kernel/sysirix.c | 12 ++++++------
arch/parisc/hpux/sys_hpux.c | 22 +++++++++++-----------
arch/powerpc/kernel/syscalls.c | 14 +++++++-------
arch/sh/kernel/sys_sh.c | 2 +-
arch/sh64/kernel/sys_sh64.c | 2 +-
arch/sparc/kernel/sys_sparc.c | 4 ++--
arch/sparc/kernel/sys_sunos.c | 10 +++++-----
arch/sparc64/kernel/sys_sparc.c | 4 ++--
arch/sparc64/kernel/sys_sunos32.c | 10 +++++-----
arch/sparc64/solaris/misc.c | 6 +++---
arch/um/drivers/mconsole_kern.c | 6 +++---
arch/um/kernel/syscall_kern.c | 12 ++++++------
arch/um/sys-x86_64/syscalls.c | 2 +-
arch/x86_64/ia32/sys_ia32.c | 10 +++++-----
arch/x86_64/kernel/sys_x86_64.c | 2 +-
arch/xtensa/kernel/syscalls.c | 2 +-
drivers/char/random.c | 4 ++--
fs/cifs/connect.c | 28 ++++++++++++++--------------
fs/exec.c | 2 +-
fs/lockd/clntproc.c | 4 ++--
fs/lockd/mon.c | 2 +-
fs/lockd/svclock.c | 2 +-
fs/lockd/xdr.c | 2 +-
fs/nfs/nfsroot.c | 2 +-
include/asm-i386/elf.h | 2 +-
include/linux/lockd/lockd.h | 2 +-
kernel/sys.c | 14 +++++++-------
net/ipv4/ipconfig.c | 14 +++++++-------
net/sunrpc/clnt.c | 4 ++--
35 files changed, 131 insertions(+), 131 deletions(-)

9ee063adf4d2287583dbb0a71d1d5f80d7ae011f
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 31afe3d..b793b96 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -402,15 +402,15 @@ osf_utsname(char __user *name)

down_read(&uts_sem);
error = -EFAULT;
- if (copy_to_user(name + 0, system_utsname.sysname, 32))
+ if (copy_to_user(name + 0, utsname()->sysname, 32))
goto out;
- if (copy_to_user(name + 32, system_utsname.nodename, 32))
+ if (copy_to_user(name + 32, utsname()->nodename, 32))
goto out;
- if (copy_to_user(name + 64, system_utsname.release, 32))
+ if (copy_to_user(name + 64, utsname()->release, 32))
goto out;
- if (copy_to_user(name + 96, system_utsname.version, 32))
+ if (copy_to_user(name + 96, utsname()->version, 32))
goto out;
- if (copy_to_user(name + 128, system_utsname.machine, 32))
+ if (copy_to_user(name + 128, utsname()->machine, 32))
goto out;

error = 0;
@@ -449,8 +449,8 @@ osf_getdomainname(char __user *name, int

down_read(&uts_sem);
for (i = 0; i < len; ++i) {
- __put_user(system_utsname.domainname[i], name + i);
- if (system_utsname.domainname[i] == '\0')
+ __put_user(utsname()->domainname[i], name + i);
+ if (utsname()->domainname[i] == '\0')
break;
}
up_read(&uts_sem);
@@ -608,11 +608,11 @@ asmlinkage long
osf_sysinfo(int command, char __user *buf, long count)
{
static char * sysinfo_table[] = {
- system_utsname.sysname,
- system_utsname.nodename,
- system_utsname.release,
- system_utsname.version,
- system_utsname.machine,
+ utsname()->sysname,
+ utsname()->nodename,
+ utsname()->release,
+ utsname()->version,
+ utsname()->machine,
"alpha", /* instruction set architecture */
"dummy", /* hardware serial number */
"dummy", /* hardware manufacturer */
diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c
index 8fdb1fb..4af731d 100644
--- a/arch/i386/kernel/sys_i386.c
+++ b/arch/i386/kernel/sys_i386.c
@@ -210,7 +210,7 @@ asmlinkage int sys_uname(struct old_utsn
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err=copy_to_user(name, &system_utsname, sizeof (*name));
+ err=copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
return err?-EFAULT:0;
}
@@ -226,15 +226,15 @@ asmlinkage int sys_olduname(struct oldol

down_read(&uts_sem);

- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
+ error = __copy_to_user(&name->sysname,&utsname()->sysname,__OLD_UTS_LEN);
error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
+ error |= __copy_to_user(&name->nodename,&utsname()->nodename,__OLD_UTS_LEN);
error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
+ error |= __copy_to_user(&name->release,&utsname()->release,__OLD_UTS_LEN);
error |= __put_user(0,name->release+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
+ error |= __copy_to_user(&name->version,&utsname()->version,__OLD_UTS_LEN);
error |= __put_user(0,name->version+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
+ error |= __copy_to_user(&name->machine,&utsname()->machine,__OLD_UTS_LEN);
error |= __put_user(0,name->machine+__OLD_UTS_LEN);

up_read(&uts_sem);
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 739c948..a27b223 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -420,7 +420,7 @@ static int sn_topology_show(struct seq_f
"coherency_domain %d, "
"region_size %d\n",

- partid, system_utsname.nodename,
+ partid, utsname()->nodename,
shubtype ? "shub2" : "shub1",
(u64)nasid_mask << nasid_shift, nasid_msb, nasid_shift,
system_size, sharing_size, coher, region_size);
diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c
index 670cb49..11412c0 100644
--- a/arch/m32r/kernel/sys_m32r.c
+++ b/arch/m32r/kernel/sys_m32r.c
@@ -206,7 +206,7 @@ asmlinkage int sys_uname(struct old_utsn
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err=copy_to_user(name, &system_utsname, sizeof (*name));
+ err=copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
return err?-EFAULT:0;
}
diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c
index a7d2bb3..66f999b 100644
--- a/arch/mips/kernel/linux32.c
+++ b/arch/mips/kernel/linux32.c
@@ -1040,7 +1040,7 @@ asmlinkage long sys32_newuname(struct ne
int ret = 0;

down_read(&uts_sem);
- if (copy_to_user(name,&system_utsname,sizeof *name))
+ if (copy_to_user(name,utsname(),sizeof *name))
ret = -EFAULT;
up_read(&uts_sem);

diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
index 2aeaa2f..8b13d57 100644
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -232,7 +232,7 @@ out:
*/
asmlinkage int sys_uname(struct old_utsname __user * name)
{
- if (name && !copy_to_user(name, &system_utsname, sizeof (*name)))
+ if (name && !copy_to_user(name, utsname(), sizeof (*name)))
return 0;
return -EFAULT;
}
@@ -249,15 +249,15 @@ asmlinkage int sys_olduname(struct oldol
if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
return -EFAULT;

- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
+ error = __copy_to_user(&name->sysname,&utsname()->sysname,__OLD_UTS_LEN);
error -= __put_user(0,name->sysname+__OLD_UTS_LEN);
- error -= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
+ error -= __copy_to_user(&name->nodename,&utsname()->nodename,__OLD_UTS_LEN);
error -= __put_user(0,name->nodename+__OLD_UTS_LEN);
- error -= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
+ error -= __copy_to_user(&name->release,&utsname()->release,__OLD_UTS_LEN);
error -= __put_user(0,name->release+__OLD_UTS_LEN);
- error -= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
+ error -= __copy_to_user(&name->version,&utsname()->version,__OLD_UTS_LEN);
error -= __put_user(0,name->version+__OLD_UTS_LEN);
- error -= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
+ error -= __copy_to_user(&name->machine,&utsname()->machine,__OLD_UTS_LEN);
error = __put_user(0,name->machine+__OLD_UTS_LEN);
error = error ? -EFAULT : 0;

@@ -293,10 +293,10 @@ asmlinkage int _sys_sysmips(int cmd, lon
return -EFAULT;

down_write(&uts_sem);
- strncpy(system_utsname.nodename, nodename, len);
+ strncpy(utsname()->nodename, nodename, len);
nodename[__NEW_UTS_LEN] = '\0';
- strlcpy(system_utsname.nodename, nodename,
- sizeof(system_utsname.nodename));
+ strlcpy(utsname()->nodename, nodename,
+ sizeof(utsname()->nodename));
up_write(&uts_sem);
return 0;
}
diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c
index 5407b78..1b4e7e7 100644
--- a/arch/mips/kernel/sysirix.c
+++ b/arch/mips/kernel/sysirix.c
@@ -884,7 +884,7 @@ asmlinkage int irix_getdomainname(char _
down_read(&uts_sem);
if (len > __NEW_UTS_LEN)
len = __NEW_UTS_LEN;
- err = copy_to_user(name, system_utsname.domainname, len) ? -EFAULT : 0;
+ err = copy_to_user(name, utsname()->domainname, len) ? -EFAULT : 0;
up_read(&uts_sem);

return err;
@@ -1127,11 +1127,11 @@ struct iuname {
asmlinkage int irix_uname(struct iuname __user *buf)
{
down_read(&uts_sem);
- if (copy_from_user(system_utsname.sysname, buf->sysname, 65)
- || copy_from_user(system_utsname.nodename, buf->nodename, 65)
- || copy_from_user(system_utsname.release, buf->release, 65)
- || copy_from_user(system_utsname.version, buf->version, 65)
- || copy_from_user(system_utsname.machine, buf->machine, 65)) {
+ if (copy_from_user(utsname()->sysname, buf->sysname, 65)
+ || copy_from_user(utsname()->nodename, buf->nodename, 65)
+ || copy_from_user(utsname()->release, buf->release, 65)
+ || copy_from_user(utsname()->version, buf->version, 65)
+ || copy_from_user(utsname()->machine, buf->machine, 65)) {
return -EFAULT;
}
up_read(&uts_sem);
diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c
index 05273cc..9fc2c08 100644
--- a/arch/parisc/hpux/sys_hpux.c
+++ b/arch/parisc/hpux/sys_hpux.c
@@ -266,15 +266,15 @@ static int hpux_uname(struct hpux_utsnam

down_read(&uts_sem);

- error = __copy_to_user(&name->sysname,&system_utsname.sysname,HPUX_UTSLEN-1);
+ error = __copy_to_user(&name->sysname,&utsname()->sysname,HPUX_UTSLEN-1);
error |= __put_user(0,name->sysname+HPUX_UTSLEN-1);
- error |= __copy_to_user(&name->nodename,&system_utsname.nodename,HPUX_UTSLEN-1);
+ error |= __copy_to_user(&name->nodename,&utsname()->nodename,HPUX_UTSLEN-1);
error |= __put_user(0,name->nodename+HPUX_UTSLEN-1);
- error |= __copy_to_user(&name->release,&system_utsname.release,HPUX_UTSLEN-1);
+ error |= __copy_to_user(&name->release,&utsname()->release,HPUX_UTSLEN-1);
error |= __put_user(0,name->release+HPUX_UTSLEN-1);
- error |= __copy_to_user(&name->version,&system_utsname.version,HPUX_UTSLEN-1);
+ error |= __copy_to_user(&name->version,&utsname()->version,HPUX_UTSLEN-1);
error |= __put_user(0,name->version+HPUX_UTSLEN-1);
- error |= __copy_to_user(&name->machine,&system_utsname.machine,HPUX_UTSLEN-1);
+ error |= __copy_to_user(&name->machine,&utsname()->machine,HPUX_UTSLEN-1);
error |= __put_user(0,name->machine+HPUX_UTSLEN-1);

up_read(&uts_sem);
@@ -373,8 +373,8 @@ int hpux_utssys(char *ubuf, int n, int t
/* TODO: print a warning about using this? */
down_write(&uts_sem);
error = -EFAULT;
- if (!copy_from_user(system_utsname.sysname, ubuf, len)) {
- system_utsname.sysname[len] = 0;
+ if (!copy_from_user(utsname()->sysname, ubuf, len)) {
+ utsname()->sysname[len] = 0;
error = 0;
}
up_write(&uts_sem);
@@ -400,8 +400,8 @@ int hpux_utssys(char *ubuf, int n, int t
/* TODO: print a warning about this? */
down_write(&uts_sem);
error = -EFAULT;
- if (!copy_from_user(system_utsname.release, ubuf, len)) {
- system_utsname.release[len] = 0;
+ if (!copy_from_user(utsname()->release, ubuf, len)) {
+ utsname()->release[len] = 0;
error = 0;
}
up_write(&uts_sem);
@@ -422,13 +422,13 @@ int hpux_getdomainname(char *name, int l

down_read(&uts_sem);

- nlen = strlen(system_utsname.domainname) + 1;
+ nlen = strlen(utsname()->domainname) + 1;

if (nlen < len)
len = nlen;
if(len > __NEW_UTS_LEN)
goto done;
- if(copy_to_user(name, system_utsname.domainname, len))
+ if(copy_to_user(name, utsname()->domainname, len))
goto done;
err = 0;
done:
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index 9b69d99..d358866 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -260,7 +260,7 @@ long ppc_newuname(struct new_utsname __u
int err = 0;

down_read(&uts_sem);
- if (copy_to_user(name, &system_utsname, sizeof(*name)))
+ if (copy_to_user(name, utsname(), sizeof(*name)))
err = -EFAULT;
up_read(&uts_sem);
if (!err)
@@ -273,7 +273,7 @@ int sys_uname(struct old_utsname __user
int err = 0;

down_read(&uts_sem);
- if (copy_to_user(name, &system_utsname, sizeof(*name)))
+ if (copy_to_user(name, utsname(), sizeof(*name)))
err = -EFAULT;
up_read(&uts_sem);
if (!err)
@@ -289,19 +289,19 @@ int sys_olduname(struct oldold_utsname _
return -EFAULT;

down_read(&uts_sem);
- error = __copy_to_user(&name->sysname, &system_utsname.sysname,
+ error = __copy_to_user(&name->sysname, &utsname()->sysname,
__OLD_UTS_LEN);
error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->nodename, &system_utsname.nodename,
+ error |= __copy_to_user(&name->nodename, &utsname()->nodename,
__OLD_UTS_LEN);
error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->release, &system_utsname.release,
+ error |= __copy_to_user(&name->release, &utsname()->release,
__OLD_UTS_LEN);
error |= __put_user(0, name->release + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->version, &system_utsname.version,
+ error |= __copy_to_user(&name->version, &utsname()->version,
__OLD_UTS_LEN);
error |= __put_user(0, name->version + __OLD_UTS_LEN);
- error |= __copy_to_user(&name->machine, &system_utsname.machine,
+ error |= __copy_to_user(&name->machine, &utsname()->machine,
__OLD_UTS_LEN);
error |= override_machine(name->machine);
up_read(&uts_sem);
diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c
index 917b2f3..e4966b2 100644
--- a/arch/sh/kernel/sys_sh.c
+++ b/arch/sh/kernel/sys_sh.c
@@ -267,7 +267,7 @@ asmlinkage int sys_uname(struct old_utsn
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err=copy_to_user(name, &system_utsname, sizeof (*name));
+ err=copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
return err?-EFAULT:0;
}
diff --git a/arch/sh64/kernel/sys_sh64.c b/arch/sh64/kernel/sys_sh64.c
index 58ff7d5..a8dc88c 100644
--- a/arch/sh64/kernel/sys_sh64.c
+++ b/arch/sh64/kernel/sys_sh64.c
@@ -279,7 +279,7 @@ asmlinkage int sys_uname(struct old_utsn
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err=copy_to_user(name, &system_utsname, sizeof (*name));
+ err=copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
return err?-EFAULT:0;
}
diff --git a/arch/sparc/kernel/sys_sparc.c b/arch/sparc/kernel/sys_sparc.c
index 0cdfc9d..c8ad73c 100644
--- a/arch/sparc/kernel/sys_sparc.c
+++ b/arch/sparc/kernel/sys_sparc.c
@@ -470,13 +470,13 @@ asmlinkage int sys_getdomainname(char __

down_read(&uts_sem);

- nlen = strlen(system_utsname.domainname) + 1;
+ nlen = strlen(utsname()->domainname) + 1;

if (nlen < len)
len = nlen;
if (len > __NEW_UTS_LEN)
goto done;
- if (copy_to_user(name, system_utsname.domainname, len))
+ if (copy_to_user(name, utsname()->domainname, len))
goto done;
err = 0;
done:
diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c
index 288de27..9f9206f 100644
--- a/arch/sparc/kernel/sys_sunos.c
+++ b/arch/sparc/kernel/sys_sunos.c
@@ -483,13 +483,13 @@ asmlinkage int sunos_uname(struct sunos_
{
int ret;
down_read(&uts_sem);
- ret = copy_to_user(&name->sname[0], &system_utsname.sysname[0], sizeof(name->sname) - 1);
+ ret = copy_to_user(&name->sname[0], &utsname()->sysname[0], sizeof(name->sname) - 1);
if (!ret) {
- ret |= __copy_to_user(&name->nname[0], &system_utsname.nodename[0], sizeof(name->nname) - 1);
+ ret |= __copy_to_user(&name->nname[0], &utsname()->nodename[0], sizeof(name->nname) - 1);
ret |= __put_user('\0', &name->nname[8]);
- ret |= __copy_to_user(&name->rel[0], &system_utsname.release[0], sizeof(name->rel) - 1);
- ret |= __copy_to_user(&name->ver[0], &system_utsname.version[0], sizeof(name->ver) - 1);
- ret |= __copy_to_user(&name->mach[0], &system_utsname.machine[0], sizeof(name->mach) - 1);
+ ret |= __copy_to_user(&name->rel[0], &utsname()->release[0], sizeof(name->rel) - 1);
+ ret |= __copy_to_user(&name->ver[0], &utsname()->version[0], sizeof(name->ver) - 1);
+ ret |= __copy_to_user(&name->mach[0], &utsname()->machine[0], sizeof(name->mach) - 1);
}
up_read(&uts_sem);
return ret ? -EFAULT : 0;
diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c
index 7a86913..0453bd2 100644
--- a/arch/sparc64/kernel/sys_sparc.c
+++ b/arch/sparc64/kernel/sys_sparc.c
@@ -707,13 +707,13 @@ asmlinkage long sys_getdomainname(char _

down_read(&uts_sem);

- nlen = strlen(system_utsname.domainname) + 1;
+ nlen = strlen(utsname()->domainname) + 1;

if (nlen < len)
len = nlen;
if (len > __NEW_UTS_LEN)
goto done;
- if (copy_to_user(name, system_utsname.domainname, len))
+ if (copy_to_user(name, utsname()->domainname, len))
goto done;
err = 0;
done:
diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c
index ae5b32f..ba98c47 100644
--- a/arch/sparc64/kernel/sys_sunos32.c
+++ b/arch/sparc64/kernel/sys_sunos32.c
@@ -439,16 +439,16 @@ asmlinkage int sunos_uname(struct sunos_
int ret;

down_read(&uts_sem);
- ret = copy_to_user(&name->sname[0], &system_utsname.sysname[0],
+ ret = copy_to_user(&name->sname[0], &utsname()->sysname[0],
sizeof(name->sname) - 1);
- ret |= copy_to_user(&name->nname[0], &system_utsname.nodename[0],
+ ret |= copy_to_user(&name->nname[0], &utsname()->nodename[0],
sizeof(name->nname) - 1);
ret |= put_user('\0', &name->nname[8]);
- ret |= copy_to_user(&name->rel[0], &system_utsname.release[0],
+ ret |= copy_to_user(&name->rel[0], &utsname()->release[0],
sizeof(name->rel) - 1);
- ret |= copy_to_user(&name->ver[0], &system_utsname.version[0],
+ ret |= copy_to_user(&name->ver[0], &utsname()->version[0],
sizeof(name->ver) - 1);
- ret |= copy_to_user(&name->mach[0], &system_utsname.machine[0],
+ ret |= copy_to_user(&name->mach[0], &utsname()->machine[0],
sizeof(name->mach) - 1);
up_read(&uts_sem);
return (ret ? -EFAULT : 0);
diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c
index 5284996..5d0162a 100644
--- a/arch/sparc64/solaris/misc.c
+++ b/arch/sparc64/solaris/misc.c
@@ -239,7 +239,7 @@ asmlinkage int solaris_utssys(u32 buf, u
/* Let's cheat */
err = set_utsfield(v->sysname, "SunOS", 1, 0);
down_read(&uts_sem);
- err |= set_utsfield(v->nodename, system_utsname.nodename,
+ err |= set_utsfield(v->nodename, utsname()->nodename,
1, 1);
up_read(&uts_sem);
err |= set_utsfield(v->release, "2.6", 0, 0);
@@ -263,7 +263,7 @@ asmlinkage int solaris_utsname(u32 buf)
/* Why should we not lie a bit? */
down_read(&uts_sem);
err = set_utsfield(v->sysname, "SunOS", 0, 0);
- err |= set_utsfield(v->nodename, system_utsname.nodename, 1, 1);
+ err |= set_utsfield(v->nodename, utsname()->nodename, 1, 1);
err |= set_utsfield(v->release, "5.6", 0, 0);
err |= set_utsfield(v->version, "Generic", 0, 0);
err |= set_utsfield(v->machine, machine(), 0, 0);
@@ -295,7 +295,7 @@ asmlinkage int solaris_sysinfo(int cmd,
case SI_HOSTNAME:
r = buffer + 256;
down_read(&uts_sem);
- for (p = system_utsname.nodename, q = buffer;
+ for (p = utsname()->nodename, q = buffer;
q < r && *p && *p != '.'; *q++ = *p++);
up_read(&uts_sem);
*q = 0;
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 6d7173f..244244a 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -106,9 +106,9 @@ void mconsole_version(struct mc_request
{
char version[256];

- sprintf(version, "%s %s %s %s %s", system_utsname.sysname,
- system_utsname.nodename, system_utsname.release,
- system_utsname.version, system_utsname.machine);
+ sprintf(version, "%s %s %s %s %s", utsname()->sysname,
+ utsname()->nodename, utsname()->release,
+ utsname()->version, utsname()->machine);
mconsole_reply(req, version, 0, 0);
}

diff --git a/arch/um/kernel/syscall_kern.c b/arch/um/kernel/syscall_kern.c
index 37d3978..d90e9ed 100644
--- a/arch/um/kernel/syscall_kern.c
+++ b/arch/um/kernel/syscall_kern.c
@@ -110,7 +110,7 @@ long sys_uname(struct old_utsname __user
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err=copy_to_user(name, &system_utsname, sizeof (*name));
+ err=copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
return err?-EFAULT:0;
}
@@ -126,19 +126,19 @@ long sys_olduname(struct oldold_utsname

down_read(&uts_sem);

- error = __copy_to_user(&name->sysname,&system_utsname.sysname,
+ error = __copy_to_user(&name->sysname,&utsname()->sysname,
__OLD_UTS_LEN);
error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->nodename,&system_utsname.nodename,
+ error |= __copy_to_user(&name->nodename,&utsname()->nodename,
__OLD_UTS_LEN);
error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->release,&system_utsname.release,
+ error |= __copy_to_user(&name->release,&utsname()->release,
__OLD_UTS_LEN);
error |= __put_user(0,name->release+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->version,&system_utsname.version,
+ error |= __copy_to_user(&name->version,&utsname()->version,
__OLD_UTS_LEN);
error |= __put_user(0,name->version+__OLD_UTS_LEN);
- error |= __copy_to_user(&name->machine,&system_utsname.machine,
+ error |= __copy_to_user(&name->machine,&utsname()->machine,
__OLD_UTS_LEN);
error |= __put_user(0,name->machine+__OLD_UTS_LEN);

diff --git a/arch/um/sys-x86_64/syscalls.c b/arch/um/sys-x86_64/syscalls.c
index 6acee5c..3ad014e 100644
--- a/arch/um/sys-x86_64/syscalls.c
+++ b/arch/um/sys-x86_64/syscalls.c
@@ -21,7 +21,7 @@ asmlinkage long sys_uname64(struct new_u
{
int err;
down_read(&uts_sem);
- err = copy_to_user(name, &system_utsname, sizeof (*name));
+ err = copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
if (personality(current->personality) == PER_LINUX32)
err |= copy_to_user(&name->machine, "i686", 5);
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index f182b20..6e0a19d 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -801,13 +801,13 @@ asmlinkage long sys32_olduname(struct ol

down_read(&uts_sem);

- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
+ error = __copy_to_user(&name->sysname,&utsname()->sysname,__OLD_UTS_LEN);
__put_user(0,name->sysname+__OLD_UTS_LEN);
- __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
+ __copy_to_user(&name->nodename,&utsname()->nodename,__OLD_UTS_LEN);
__put_user(0,name->nodename+__OLD_UTS_LEN);
- __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
+ __copy_to_user(&name->release,&utsname()->release,__OLD_UTS_LEN);
__put_user(0,name->release+__OLD_UTS_LEN);
- __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
+ __copy_to_user(&name->version,&utsname()->version,__OLD_UTS_LEN);
__put_user(0,name->version+__OLD_UTS_LEN);
{
char *arch = "x86_64";
@@ -830,7 +830,7 @@ long sys32_uname(struct old_utsname __us
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err=copy_to_user(name, &system_utsname, sizeof (*name));
+ err=copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
if (personality(current->personality) == PER_LINUX32)
err |= copy_to_user(&name->machine, "i686", 5);
diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c
index 6449ea8..76bf7c2 100644
--- a/arch/x86_64/kernel/sys_x86_64.c
+++ b/arch/x86_64/kernel/sys_x86_64.c
@@ -148,7 +148,7 @@ asmlinkage long sys_uname(struct new_uts
{
int err;
down_read(&uts_sem);
- err = copy_to_user(name, &system_utsname, sizeof (*name));
+ err = copy_to_user(name, utsname(), sizeof (*name));
up_read(&uts_sem);
if (personality(current->personality) == PER_LINUX32)
err |= copy_to_user(&name->machine, "i686", 5);
diff --git a/arch/xtensa/kernel/syscalls.c b/arch/xtensa/kernel/syscalls.c
index f20c649..30060c1 100644
--- a/arch/xtensa/kernel/syscalls.c
+++ b/arch/xtensa/kernel/syscalls.c
@@ -129,7 +129,7 @@ out:

int sys_uname(struct old_utsname * name)
{
- if (name && !copy_to_user(name, &system_utsname, sizeof (*name)))
+ if (name && !copy_to_user(name, utsname(), sizeof (*name)))
return 0;
return -EFAULT;
}
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 58f3512..a891421 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -888,8 +888,8 @@ static void init_std_data(struct entropy

do_gettimeofday(&tv);
add_entropy_words(r, (__u32 *)&tv, sizeof(tv)/4);
- add_entropy_words(r, (__u32 *)&system_utsname,
- sizeof(system_utsname)/4);
+ add_entropy_words(r, (__u32 *)utsname(),
+ sizeof(*(utsname()))/4);
}

static int __init rand_initialize(void)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d2ec806..b6c0886 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -765,12 +765,12 @@ cifs_parse_mount_options(char *options,
separator[1] = 0;

memset(vol->source_rfc1001_name,0x20,15);
- for(i=0;i < strnlen(system_utsname.nodename,15);i++) {
+ for(i=0;i < strnlen(utsname()->nodename,15);i++) {
/* does not have to be a perfect mapping since the field is
informational, only used for servers that do not support
port 445 and it can be overridden at mount time */
vol->source_rfc1001_name[i] =
- toupper(system_utsname.nodename[i]);
+ toupper(utsname()->nodename[i]);
}
vol->source_rfc1001_name[15] = 0;
/* null target name indicates to use *SMBSERVR default called name
@@ -2077,7 +2077,7 @@ CIFSSessSetup(unsigned int xid, struct c
32, nls_codepage);
bcc_ptr += 2 * bytes_returned;
bytes_returned =
- cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release,
+ cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release,
32, nls_codepage);
bcc_ptr += 2 * bytes_returned;
bcc_ptr += 2;
@@ -2104,8 +2104,8 @@ CIFSSessSetup(unsigned int xid, struct c
}
strcpy(bcc_ptr, "Linux version ");
bcc_ptr += strlen("Linux version ");
- strcpy(bcc_ptr, system_utsname.release);
- bcc_ptr += strlen(system_utsname.release) + 1;
+ strcpy(bcc_ptr, utsname()->release);
+ bcc_ptr += strlen(utsname()->release) + 1;
strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
}
@@ -2346,7 +2346,7 @@ CIFSSpnegoSessSetup(unsigned int xid, st
32, nls_codepage);
bcc_ptr += 2 * bytes_returned;
bytes_returned =
- cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
+ cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
nls_codepage);
bcc_ptr += 2 * bytes_returned;
bcc_ptr += 2;
@@ -2371,8 +2371,8 @@ CIFSSpnegoSessSetup(unsigned int xid, st
}
strcpy(bcc_ptr, "Linux version ");
bcc_ptr += strlen("Linux version ");
- strcpy(bcc_ptr, system_utsname.release);
- bcc_ptr += strlen(system_utsname.release) + 1;
+ strcpy(bcc_ptr, utsname()->release);
+ bcc_ptr += strlen(utsname()->release) + 1;
strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
}
@@ -2622,7 +2622,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i
32, nls_codepage);
bcc_ptr += 2 * bytes_returned;
bytes_returned =
- cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
+ cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
nls_codepage);
bcc_ptr += 2 * bytes_returned;
bcc_ptr += 2; /* null terminate Linux version */
@@ -2639,8 +2639,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned i
} else { /* ASCII */
strcpy(bcc_ptr, "Linux version ");
bcc_ptr += strlen("Linux version ");
- strcpy(bcc_ptr, system_utsname.release);
- bcc_ptr += strlen(system_utsname.release) + 1;
+ strcpy(bcc_ptr, utsname()->release);
+ bcc_ptr += strlen(utsname()->release) + 1;
strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
bcc_ptr++; /* empty domain field */
@@ -3001,7 +3001,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi
32, nls_codepage);
bcc_ptr += 2 * bytes_returned;
bytes_returned =
- cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
+ cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
nls_codepage);
bcc_ptr += 2 * bytes_returned;
bcc_ptr += 2; /* null term version string */
@@ -3053,8 +3053,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xi

strcpy(bcc_ptr, "Linux version ");
bcc_ptr += strlen("Linux version ");
- strcpy(bcc_ptr, system_utsname.release);
- bcc_ptr += strlen(system_utsname.release) + 1;
+ strcpy(bcc_ptr, utsname()->release);
+ bcc_ptr += strlen(utsname()->release) + 1;
strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
bcc_ptr++; /* null domain */
diff --git a/fs/exec.c b/fs/exec.c
index 3a79d97..cbb3270 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1337,7 +1337,7 @@ static void format_corename(char *corena
case 'h':
down_read(&uts_sem);
rc = snprintf(out_ptr, out_end - out_ptr,
- "%s", system_utsname.nodename);
+ "%s", utsname()->nodename);
up_read(&uts_sem);
if (rc > out_end - out_ptr)
goto out;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index f96e381..915e596 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -130,11 +130,11 @@ static void nlmclnt_setlockargs(struct n
nlmclnt_next_cookie(&argp->cookie);
argp->state = nsm_local_state;
memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh));
- lock->caller = system_utsname.nodename;
+ lock->caller = utsname()->nodename;
lock->oh.data = req->a_owner;
lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
(unsigned int)fl->fl_u.nfs_fl.owner->pid,
- system_utsname.nodename);
+ utsname()->nodename);
lock->svid = fl->fl_u.nfs_fl.owner->pid;
lock->fl.fl_start = fl->fl_start;
lock->fl.fl_end = fl->fl_end;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3fc683f..547aaa3 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -152,7 +152,7 @@ xdr_encode_common(struct rpc_rqst *rqstp
*/
sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr));
if (!(p = xdr_encode_string(p, buffer))
- || !(p = xdr_encode_string(p, system_utsname.nodename)))
+ || !(p = xdr_encode_string(p, utsname()->nodename)))
return ERR_PTR(-EIO);
*p++ = htonl(argp->prog);
*p++ = htonl(argp->vers);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 3ef7391..ec93c35 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -326,7 +326,7 @@ static int nlmsvc_setgrantargs(struct nl
{
locks_copy_lock(&call->a_args.lock.fl, &lock->fl);
memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh));
- call->a_args.lock.caller = system_utsname.nodename;
+ call->a_args.lock.caller = utsname()->nodename;
call->a_args.lock.oh.len = lock->oh.len;

/* set default data area */
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index f22a376..4eec051 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -516,7 +516,7 @@ nlmclt_decode_res(struct rpc_rqst *req,
*/
#define NLM_void_sz 0
#define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(system_utsname.nodename))
+#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(utsname()->nodename))
#define NLM_netobj_sz 1+XDR_QUADLEN(XDR_MAX_NETOBJ)
/* #define NLM_owner_sz 1+XDR_QUADLEN(NLM_MAXOWNER) */
#define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE)
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c0a754e..1d656a6 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -312,7 +312,7 @@ static int __init root_nfs_name(char *na
/* Override them by options set on kernel command-line */
root_nfs_parse(name, buf);

- cp = system_utsname.nodename;
+ cp = utsname()->nodename;
if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
return -1;
diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h
index 4153d80..1b06c44 100644
--- a/include/asm-i386/elf.h
+++ b/include/asm-i386/elf.h
@@ -108,7 +108,7 @@ typedef struct user_fxsr_struct elf_fpxr
For the moment, we have only optimizations for the Intel generations,
but that could change... */

-#define ELF_PLATFORM (system_utsname.machine)
+#define ELF_PLATFORM (utsname()->machine)

#ifdef __KERNEL__
#define SET_PERSONALITY(ex, ibcs2) do { } while (0)
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 995f89d..ac15b87 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -80,7 +80,7 @@ struct nlm_wait;
/*
* Memory chunk for NLM client RPC request.
*/
-#define NLMCLNT_OHSIZE (sizeof(system_utsname.nodename)+10)
+#define NLMCLNT_OHSIZE (sizeof(utsname()->nodename)+10)
struct nlm_rqst {
unsigned int a_flags; /* initial RPC task flags */
struct nlm_host * a_host; /* host handle */
diff --git a/kernel/sys.c b/kernel/sys.c
index 0b6ec0e..bcaa48e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1671,7 +1671,7 @@ asmlinkage long sys_newuname(struct new_
int errno = 0;

down_read(&uts_sem);
- if (copy_to_user(name,&system_utsname,sizeof *name))
+ if (copy_to_user(name,utsname(),sizeof *name))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
@@ -1689,8 +1689,8 @@ asmlinkage long sys_sethostname(char __u
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
- memcpy(system_utsname.nodename, tmp, len);
- system_utsname.nodename[len] = 0;
+ memcpy(utsname()->nodename, tmp, len);
+ utsname()->nodename[len] = 0;
errno = 0;
}
up_write(&uts_sem);
@@ -1706,11 +1706,11 @@ asmlinkage long sys_gethostname(char __u
if (len < 0)
return -EINVAL;
down_read(&uts_sem);
- i = 1 + strlen(system_utsname.nodename);
+ i = 1 + strlen(utsname()->nodename);
if (i > len)
i = len;
errno = 0;
- if (copy_to_user(name, system_utsname.nodename, i))
+ if (copy_to_user(name, utsname()->nodename, i))
errno = -EFAULT;
up_read(&uts_sem);
return errno;
@@ -1735,8 +1735,8 @@ asmlinkage long sys_setdomainname(char _
down_write(&uts_sem);
errno = -EFAULT;
if (!copy_from_user(tmp, name, len)) {
- memcpy(system_utsname.domainname, tmp, len);
- system_utsname.domainname[len] = 0;
+ memcpy(utsname()->domainname, tmp, len);
+ utsname()->domainname[len] = 0;
errno = 0;
}
up_write(&uts_sem);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index cb8a92f..b9bdf0f 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -806,7 +806,7 @@ static void __init ic_do_bootp_ext(u8 *e
}
break;
case 12: /* Host name */
- ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN);
+ ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN);
ic_host_name_set = 1;
break;
case 15: /* Domain name (DNS) */
@@ -817,7 +817,7 @@ static void __init ic_do_bootp_ext(u8 *e
ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
break;
case 40: /* NIS Domain name (_not_ DNS) */
- ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN);
+ ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN);
break;
}
}
@@ -1369,7 +1369,7 @@ static int __init ip_auto_config(void)
printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask));
printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway));
printk(",\n host=%s, domain=%s, nis-domain=%s",
- system_utsname.nodename, ic_domain, system_utsname.domainname);
+ utsname()->nodename, ic_domain, utsname()->domainname);
printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr));
printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr));
printk(", rootpath=%s", root_server_path);
@@ -1479,11 +1479,11 @@ static int __init ip_auto_config_setup(c
case 4:
if ((dp = strchr(ip, '.'))) {
*dp++ = '\0';
- strlcpy(system_utsname.domainname, dp,
- sizeof(system_utsname.domainname));
+ strlcpy(utsname()->domainname, dp,
+ sizeof(utsname()->domainname));
}
- strlcpy(system_utsname.nodename, ip,
- sizeof(system_utsname.nodename));
+ strlcpy(utsname()->nodename, ip,
+ sizeof(utsname()->nodename));
ic_host_name_set = 1;
break;
case 5:
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index aa8965e..1d00e41 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -176,10 +176,10 @@ rpc_new_client(struct rpc_xprt *xprt, ch
}

/* save the nodename */
- clnt->cl_nodelen = strlen(system_utsname.nodename);
+ clnt->cl_nodelen = strlen(utsname()->nodename);
if (clnt->cl_nodelen > UNX_MAXNODENAME)
clnt->cl_nodelen = UNX_MAXNODENAME;
- memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen);
+ memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen);
return clnt;

out_no_auth:
--
1.1.6

2006-05-18 15:50:29

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 6/9] namespaces: utsname: implement utsname namespaces

This patch defines the uts namespace and some manipulators.
It adds a uts namespace pointer to struct nsproxy (reached through
the task's nsproxy), and initializes a system-wide init namespace.

It leaves a #define for system_utsname so sysctl will compile.
This define will be removed in a separate patch.

Signed-off-by: Serge Hallyn <[email protected]>

---

include/linux/init_task.h | 2 ++
include/linux/nsproxy.h | 2 ++
include/linux/sched.h | 1 +
include/linux/utsname.h | 52 ++++++++++++++++++++++++++++++++++++++++++---
init/Kconfig | 8 +++++++
init/version.c | 22 +++++++++++--------
kernel/Makefile | 1 +
kernel/nsproxy.c | 14 ++++++++++++
kernel/utsname.c | 43 +++++++++++++++++++++++++++++++++++++
9 files changed, 133 insertions(+), 12 deletions(-)
create mode 100644 kernel/utsname.c

d590372d659cf4bb3676f8ee7a509e173685b6ee
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 672dc04..ceb68b7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -3,6 +3,7 @@

#include <linux/file.h>
#include <linux/rcupdate.h>
+#include <linux/utsname.h>

#define INIT_FDTABLE \
{ \
@@ -70,6 +71,7 @@ extern struct nsproxy init_nsproxy;
#define INIT_NSPROXY(nsproxy) { \
.count = ATOMIC_INIT(1), \
.nslock = SPIN_LOCK_UNLOCKED, \
+ .uts_ns = &init_uts_ns, \
.namespace = NULL, \
}

diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 7ebe666..9c2e0ad 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -5,6 +5,7 @@
#include <linux/sched.h>

struct namespace;
+struct uts_namespace;

/*
* A structure to contain pointers to all per-process
@@ -21,6 +22,7 @@ struct namespace;
struct nsproxy {
atomic_t count;
spinlock_t nslock;
+ struct uts_namespace *uts_ns;
struct namespace *namespace;
};
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f2c945b..3332d5e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -685,6 +685,7 @@ static inline void prefetch_stack(struct
struct audit_context; /* See audit.c */
struct mempolicy;
struct pipe_inode_info;
+struct uts_namespace;

enum sleep_type {
SLEEP_NORMAL,
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 77e97a5..e6120e7 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -1,6 +1,11 @@
#ifndef _LINUX_UTSNAME_H
#define _LINUX_UTSNAME_H

+#include <linux/sched.h>
+#include <linux/kref.h>
+#include <linux/nsproxy.h>
+#include <asm/atomic.h>
+
#define __OLD_UTS_LEN 8

struct oldold_utsname {
@@ -30,17 +35,58 @@ struct new_utsname {
char domainname[65];
};

-extern struct new_utsname system_utsname;
+struct uts_namespace {
+ struct kref kref;
+ struct new_utsname name;
+};
+extern struct uts_namespace init_uts_ns;
+
+static inline void get_uts_ns(struct uts_namespace *ns)
+{
+ kref_get(&ns->kref);
+}
+
+#ifdef CONFIG_UTS_NS
+extern int copy_utsname(int flags, struct task_struct *tsk);
+extern void free_uts_ns(struct kref *kref);
+
+static inline void put_uts_ns(struct uts_namespace *ns)
+{
+ kref_put(&ns->kref, free_uts_ns);
+}
+
+static inline void exit_utsname(struct task_struct *p)
+{
+ struct uts_namespace *uts_ns = p->nsproxy->uts_ns;
+ if (uts_ns) {
+ put_uts_ns(uts_ns);
+ }
+}
+
+#else
+static inline int copy_utsname(int flags, struct task_struct *tsk)
+{
+ return 0;
+}
+static inline void put_uts_ns(struct uts_namespace *ns)
+{
+}
+static inline void exit_utsname(struct task_struct *p)
+{
+}
+#endif

static inline struct new_utsname *utsname(void)
{
- return &system_utsname;
+ return &current->nsproxy->uts_ns->name;
}

static inline struct new_utsname *init_utsname(void)
{
- return &system_utsname;
+ return &init_uts_ns.name;
}

+#define system_utsname init_uts_ns.name
+
extern struct rw_semaphore uts_sem;
#endif
diff --git a/init/Kconfig b/init/Kconfig
index 3b36a1d..8460e5a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -166,6 +166,14 @@ config SYSCTL
building a kernel for install/rescue disks or your system is very
limited in memory.

+config UTS_NS
+ bool "UTS Namespaces"
+ default n
+ help
+ Support uts namespaces. This allows containers, i.e.
+ vservers, to use uts namespaces to provide different
+ uts info for different servers. If unsure, say N.
+
config AUDIT
bool "Auditing support"
depends on NET
diff --git a/init/version.c b/init/version.c
index 3ddc3ce..78cef48 100644
--- a/init/version.c
+++ b/init/version.c
@@ -11,23 +11,27 @@
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/version.h>
+#include <linux/sched.h>

#define version(a) Version_ ## a
#define version_string(a) version(a)

int version_string(LINUX_VERSION_CODE);

-struct new_utsname system_utsname = {
- .sysname = UTS_SYSNAME,
- .nodename = UTS_NODENAME,
- .release = UTS_RELEASE,
- .version = UTS_VERSION,
- .machine = UTS_MACHINE,
- .domainname = UTS_DOMAINNAME,
+struct uts_namespace init_uts_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
+ .name = {
+ .sysname = UTS_SYSNAME,
+ .nodename = UTS_NODENAME,
+ .release = UTS_RELEASE,
+ .version = UTS_VERSION,
+ .machine = UTS_MACHINE,
+ .domainname = UTS_DOMAINNAME,
+ },
};

-EXPORT_SYMBOL(system_utsname);
-
const char linux_banner[] =
"Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n";
diff --git a/kernel/Makefile b/kernel/Makefile
index 215fb33..ab7426c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_UTS_NS) += utsname.o

ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <[email protected]>, the -fno-omit-frame-pointer is
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 4103f58..d2c6e94 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,6 +14,7 @@
#include <linux/version.h>
#include <linux/nsproxy.h>
#include <linux/namespace.h>
+#include <linux/utsname.h>

static inline void get_nsproxy(struct nsproxy *ns)
{
@@ -57,6 +58,8 @@ struct nsproxy *dup_namespaces(struct ns
if (ns) {
if (ns->namespace)
get_namespace(ns->namespace);
+ if (ns->uts_ns)
+ get_uts_ns(ns->uts_ns);
}

return ns;
@@ -95,6 +98,15 @@ int copy_namespaces(int flags, struct ta
goto out;
}

+ err = copy_utsname(flags, tsk);
+ if (err) {
+ if (new_ns->namespace)
+ put_namespace(new_ns->namespace);
+ tsk->nsproxy = old_ns;
+ put_nsproxy(new_ns);
+ goto out;
+ }
+
out:
put_nsproxy(old_ns);
return err;
@@ -104,5 +116,7 @@ void free_nsproxy(struct nsproxy *ns)
{
if (ns->namespace)
put_namespace(ns->namespace);
+ if (ns->uts_ns)
+ put_uts_ns(ns->uts_ns);
kfree(ns);
}
diff --git a/kernel/utsname.c b/kernel/utsname.c
new file mode 100644
index 0000000..2818c9b
--- /dev/null
+++ b/kernel/utsname.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Author: Serge Hallyn <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <linux/compile.h>
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+
+/*
+ * Copy task tsk's utsname namespace, or clone it if flags
+ * specifies CLONE_NEWUTS. In latter case, changes to the
+ * utsname of this process won't be seen by parent, and vice
+ * versa.
+ */
+int copy_utsname(int flags, struct task_struct *tsk)
+{
+ struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
+ int err = 0;
+
+ if (!old_ns)
+ return 0;
+
+ get_uts_ns(old_ns);
+
+ return err;
+}
+
+void free_uts_ns(struct kref *kref)
+{
+ struct uts_namespace *ns;
+
+ ns = container_of(kref, struct uts_namespace, kref);
+ kfree(ns);
+}
--
1.1.6

2006-05-18 15:50:16

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 5/9] namespaces: utsname: use init_utsname when appropriate

In some places, particularly drivers and __init code, the init utsns is the
appropriate one to use. This patch converts those users of system_utsname
to the init_utsname() helper.

Changes: Removed several uses of init_utsname(). Hope I picked all the
right ones in net/ipv4/ipconfig.c. Those call sites are now converted to
utsname() (the per-process namespace utsname) by the previous
patch (4/9).

Signed-off-by: Serge E. Hallyn <[email protected]>

---

arch/arm/kernel/setup.c | 2 +-
arch/arm26/kernel/setup.c | 2 +-
arch/cris/kernel/setup.c | 2 +-
arch/i386/kernel/process.c | 6 +++---
arch/i386/kernel/traps.c | 6 +++---
arch/powerpc/kernel/process.c | 2 +-
arch/powerpc/kernel/setup_64.c | 2 +-
arch/powerpc/platforms/pseries/setup.c | 2 +-
arch/sh/kernel/setup.c | 2 +-
arch/um/kernel/um_arch.c | 6 +++---
arch/um/sys-x86_64/sysrq.c | 2 +-
arch/x86_64/kernel/process.c | 6 +++---
drivers/infiniband/hw/ipath/ipath_verbs.c | 2 +-
drivers/parisc/led.c | 2 +-
drivers/scsi/lpfc/lpfc_ct.c | 8 ++++----
drivers/usb/core/hcd.c | 4 ++--
drivers/usb/gadget/ether.c | 2 +-
drivers/usb/gadget/file_storage.c | 2 +-
drivers/usb/gadget/serial.c | 2 +-
drivers/usb/gadget/zero.c | 2 +-
include/asm-i386/bugs.h | 2 +-
include/asm-sh/bugs.h | 2 +-
kernel/power/snapshot.c | 10 +++++-----
net/ipv4/ipconfig.c | 2 +-
sound/core/info_oss.c | 10 +++++-----
25 files changed, 45 insertions(+), 45 deletions(-)

9d66fd897bde2b3180d9646b5f559c72d494115d
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 9fc9af8..b610568 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -319,7 +319,7 @@ static void __init setup_processor(void)
cpu_name, processor_id, (int)processor_id & 15,
proc_arch[cpu_architecture()]);

- sprintf(system_utsname.machine, "%s%c", list->arch_name, ENDIANNESS);
+ sprintf(init_utsname()->machine, "%s%c", list->arch_name, ENDIANNESS);
sprintf(elf_platform, "%s%c", list->elf_name, ENDIANNESS);
elf_hwcap = list->elf_hwcap;
#ifndef CONFIG_ARM_THUMB
diff --git a/arch/arm26/kernel/setup.c b/arch/arm26/kernel/setup.c
index 4eb329e..8e6a441 100644
--- a/arch/arm26/kernel/setup.c
+++ b/arch/arm26/kernel/setup.c
@@ -144,7 +144,7 @@ static void __init setup_processor(void)

dump_cpu_info();

- sprintf(system_utsname.machine, "%s", list->arch_name);
+ sprintf(init_utsname()->machine, "%s", list->arch_name);
sprintf(elf_platform, "%s", list->elf_name);
elf_hwcap = list->elf_hwcap;

diff --git a/arch/cris/kernel/setup.c b/arch/cris/kernel/setup.c
index 619a6ee..1974c01 100644
--- a/arch/cris/kernel/setup.c
+++ b/arch/cris/kernel/setup.c
@@ -161,7 +161,7 @@ setup_arch(char **cmdline_p)
show_etrax_copyright();

/* Setup utsname */
- strcpy(system_utsname.machine, cris_machine_name);
+ strcpy(init_utsname()->machine, cris_machine_name);
}

static void *c_start(struct seq_file *m, loff_t *pos)
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 6259afe..da2e439 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -297,9 +297,9 @@ void show_regs(struct pt_regs * regs)
if (user_mode_vm(regs))
printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
printk(" EFLAGS: %08lx %s (%s %.*s)\n",
- regs->eflags, print_tainted(), system_utsname.release,
- (int)strcspn(system_utsname.version, " "),
- system_utsname.version);
+ regs->eflags, print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->eax,regs->ebx,regs->ecx,regs->edx);
printk("ESI: %08lx EDI: %08lx EBP: %08lx",
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 2d22f57..c029749 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -260,9 +260,9 @@ void show_registers(struct pt_regs *regs
printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
"EFLAGS: %08lx (%s %.*s) \n",
smp_processor_id(), 0xffff & regs->xcs, regs->eip,
- print_tainted(), regs->eflags, system_utsname.release,
- (int)strcspn(system_utsname.version, " "),
- system_utsname.version);
+ print_tainted(), regs->eflags, init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
regs->eax, regs->ebx, regs->ecx, regs->edx);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 2dd47d2..6ce9e10 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -425,7 +425,7 @@ void show_regs(struct pt_regs * regs)
printk("NIP: "REG" LR: "REG" CTR: "REG"\n",
regs->nip, regs->link, regs->ctr);
printk("REGS: %p TRAP: %04lx %s (%s)\n",
- regs, regs->trap, print_tainted(), system_utsname.release);
+ regs, regs->trap, print_tainted(), init_utsname()->release);
printk("MSR: "REG" ", regs->msr);
printbits(regs->msr, msr_bits);
printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer);
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 4467c49..c124e0a 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -435,7 +435,7 @@ void __init setup_system(void)
smp_release_cpus();
#endif

- printk("Starting Linux PPC64 %s\n", system_utsname.version);
+ printk("Starting Linux PPC64 %s\n", init_utsname()->version);

printk("-----------------------------------------------------\n");
printk("ppc64_pft_size = 0x%lx\n", ppc64_pft_size);
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 5eb55ef..58b7a74 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -255,7 +255,7 @@ static int __init pSeries_init_panel(voi
{
/* Manually leave the kernel version on the panel. */
ppc_md.progress("Linux ppc64\n", 0);
- ppc_md.progress(system_utsname.version, 0);
+ ppc_md.progress(init_utsname()->version, 0);

return 0;
}
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index bb229ef..024401e 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -481,7 +481,7 @@ static int show_cpuinfo(struct seq_file
seq_printf(m, "machine\t\t: %s\n", get_system_type());

seq_printf(m, "processor\t: %d\n", cpu);
- seq_printf(m, "cpu family\t: %s\n", system_utsname.machine);
+ seq_printf(m, "cpu family\t: %s\n", init_utsname()->machine);
seq_printf(m, "cpu type\t: %s\n", get_cpu_subtype());

show_cpuflags(m);
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 7d51dd7..b49dd7d 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -167,7 +167,7 @@ static char *usage_string =

static int __init uml_version_setup(char *line, int *add)
{
- printf("%s\n", system_utsname.release);
+ printf("%s\n", init_utsname()->release);
exit(0);

return 0;
@@ -278,7 +278,7 @@ static int __init Usage(char *line, int
{
const char **p;

- printf(usage_string, system_utsname.release);
+ printf(usage_string, init_utsname()->release);
p = &__uml_help_start;
while (p < &__uml_help_end) {
printf("%s", *p);
@@ -400,7 +400,7 @@ int linux_main(int argc, char **argv)
/* Reserve up to 4M after the current brk */
uml_reserved = ROUND_4M(brk_start) + (1 << 22);

- setup_machinename(system_utsname.machine);
+ setup_machinename(init_utsname()->machine);

#ifdef CONFIG_CMDLINE_ON_HOST
argv1_begin = argv[1];
diff --git a/arch/um/sys-x86_64/sysrq.c b/arch/um/sys-x86_64/sysrq.c
index d0a25af..ce3e07f 100644
--- a/arch/um/sys-x86_64/sysrq.c
+++ b/arch/um/sys-x86_64/sysrq.c
@@ -16,7 +16,7 @@ void __show_regs(struct pt_regs * regs)
printk("\n");
print_modules();
printk("Pid: %d, comm: %.20s %s %s\n",
- current->pid, current->comm, print_tainted(), system_utsname.release);
+ current->pid, current->comm, print_tainted(), init_utsname()->release);
printk("RIP: %04lx:[<%016lx>] ", PT_REGS_CS(regs) & 0xffff,
PT_REGS_RIP(regs));
printk("\nRSP: %016lx EFLAGS: %08lx\n", PT_REGS_RSP(regs),
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index fb903e6..113d4ac 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -292,9 +292,9 @@ void __show_regs(struct pt_regs * regs)
print_modules();
printk("Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
- system_utsname.release,
- (int)strcspn(system_utsname.version, " "),
- system_utsname.version);
+ init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
printk_address(regs->rip);
printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index cb9e387..b4ddd70 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -1029,7 +1029,7 @@ static void *ipath_register_ib_device(in
dev->process_mad = ipath_process_mad;

snprintf(dev->node_desc, sizeof(dev->node_desc),
- IPATH_IDSTR " %s kernel_SMA", system_utsname.nodename);
+ IPATH_IDSTR " %s kernel_SMA", init_utsname()->nodename);

ret = ib_register_device(dev);
if (ret)
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index 298f2dd..1d778d2 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -684,7 +684,7 @@ int __init led_init(void)
int ret;

snprintf(lcd_text_default, sizeof(lcd_text_default),
- "Linux %s", system_utsname.release);
+ "Linux %s", init_utsname()->release);

/* Work around the buggy PDC of KittyHawk-machines */
switch (CPU_HVERSION) {
diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c
index b65ee57..83f53fb 100644
--- a/drivers/scsi/lpfc/lpfc_ct.c
+++ b/drivers/scsi/lpfc/lpfc_ct.c
@@ -961,8 +961,8 @@ lpfc_fdmi_cmd(struct lpfc_hba * phba, st
ae = (ATTRIBUTE_ENTRY *) ((uint8_t *) rh + size);
ae->ad.bits.AttrType = be16_to_cpu(OS_NAME_VERSION);
sprintf(ae->un.OsNameVersion, "%s %s %s",
- system_utsname.sysname, system_utsname.release,
- system_utsname.version);
+ init_utsname()->sysname, init_utsname()->release,
+ init_utsname()->version);
len = strlen(ae->un.OsNameVersion);
len += (len & 3) ? (4 - (len & 3)) : 4;
ae->ad.bits.AttrLen = be16_to_cpu(FOURBYTES + len);
@@ -1080,7 +1080,7 @@ lpfc_fdmi_cmd(struct lpfc_hba * phba, st
size);
ae->ad.bits.AttrType = be16_to_cpu(HOST_NAME);
sprintf(ae->un.HostName, "%s",
- system_utsname.nodename);
+ init_utsname()->nodename);
len = strlen(ae->un.HostName);
len += (len & 3) ? (4 - (len & 3)) : 4;
ae->ad.bits.AttrLen =
@@ -1168,7 +1168,7 @@ lpfc_fdmi_tmo_handler(struct lpfc_hba *p

ndlp = lpfc_findnode_did(phba, NLP_SEARCH_ALL, FDMI_DID);
if (ndlp) {
- if (system_utsname.nodename[0] != '\0') {
+ if (init_utsname()->nodename[0] != '\0') {
lpfc_fdmi_cmd(phba, ndlp, SLI_MGMT_DHBA);
} else {
mod_timer(&phba->fc_fdmitmo, jiffies + HZ * 60);
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index e2e00ba..3260688 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -318,8 +318,8 @@ static int rh_string (

// id 3 == vendor description
} else if (id == 3) {
- snprintf (buf, sizeof buf, "%s %s %s", system_utsname.sysname,
- system_utsname.release, hcd->driver->description);
+ snprintf (buf, sizeof buf, "%s %s %s", init_utsname()->sysname,
+ init_utsname()->release, hcd->driver->description);

// unsupported IDs --> "protocol stall"
} else
diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c
index 9c4422a..76ad9b4 100644
--- a/drivers/usb/gadget/ether.c
+++ b/drivers/usb/gadget/ether.c
@@ -2242,7 +2242,7 @@ eth_bind (struct usb_gadget *gadget)
return -ENODEV;
}
snprintf (manufacturer, sizeof manufacturer, "%s %s/%s",
- system_utsname.sysname, system_utsname.release,
+ init_utsname()->sysname, init_utsname()->release,
gadget->name);

/* If there's an RNDIS configuration, that's what Windows wants to
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 6f88747..53d9581 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -3985,7 +3985,7 @@ static int __init fsg_bind(struct usb_ga
usb_gadget_set_selfpowered(gadget);

snprintf(manufacturer, sizeof manufacturer, "%s %s with %s",
- system_utsname.sysname, system_utsname.release,
+ init_utsname()->sysname, init_utsname()->release,
gadget->name);

/* On a real device, serial[] would be loaded from permanent
diff --git a/drivers/usb/gadget/serial.c b/drivers/usb/gadget/serial.c
index b992546..a2f905b 100644
--- a/drivers/usb/gadget/serial.c
+++ b/drivers/usb/gadget/serial.c
@@ -1496,7 +1496,7 @@ static int __init gs_bind(struct usb_gad
return -ENOMEM;

snprintf(manufacturer, sizeof(manufacturer), "%s %s with %s",
- system_utsname.sysname, system_utsname.release,
+ init_utsname()->sysname, init_utsname()->release,
gadget->name);

memset(dev, 0, sizeof(struct gs_dev));
diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c
index 68e3d8f..4c888bc 100644
--- a/drivers/usb/gadget/zero.c
+++ b/drivers/usb/gadget/zero.c
@@ -1243,7 +1243,7 @@ autoconf_fail:
EP_OUT_NAME, EP_IN_NAME);

snprintf (manufacturer, sizeof manufacturer, "%s %s with %s",
- system_utsname.sysname, system_utsname.release,
+ init_utsname()->sysname, init_utsname()->release,
gadget->name);

return 0;
diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h
index 50233e0..6cb79fe 100644
--- a/include/asm-i386/bugs.h
+++ b/include/asm-i386/bugs.h
@@ -190,6 +190,6 @@ static void __init check_bugs(void)
check_fpu();
check_hlt();
check_popad();
- system_utsname.machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
alternative_instructions();
}
diff --git a/include/asm-sh/bugs.h b/include/asm-sh/bugs.h
index a6de3d0..d09933c 100644
--- a/include/asm-sh/bugs.h
+++ b/include/asm-sh/bugs.h
@@ -18,7 +18,7 @@ static void __init check_bugs(void)
{
extern char *get_cpu_subtype(void);
extern unsigned long loops_per_jiffy;
- char *p= &system_utsname.machine[2]; /* "sh" */
+ char *p= &init_utsname()->machine[2]; /* "sh" */

cpu_data->loops_per_jiffy = loops_per_jiffy;

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3eeedbb..1ca6f95 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -524,7 +524,7 @@ static void init_header(struct swsusp_in
memset(info, 0, sizeof(struct swsusp_info));
info->version_code = LINUX_VERSION_CODE;
info->num_physpages = num_physpages;
- memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
+ memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
info->cpus = num_online_cpus();
info->image_pages = nr_copy_pages;
info->pages = nr_copy_pages + nr_meta_pages + 1;
@@ -663,13 +663,13 @@ static int check_header(struct swsusp_in
reason = "kernel version";
if (info->num_physpages != num_physpages)
reason = "memory size";
- if (strcmp(info->uts.sysname,system_utsname.sysname))
+ if (strcmp(info->uts.sysname,init_utsname()->sysname))
reason = "system type";
- if (strcmp(info->uts.release,system_utsname.release))
+ if (strcmp(info->uts.release,init_utsname()->release))
reason = "kernel release";
- if (strcmp(info->uts.version,system_utsname.version))
+ if (strcmp(info->uts.version,init_utsname()->version))
reason = "version";
- if (strcmp(info->uts.machine,system_utsname.machine))
+ if (strcmp(info->uts.machine,init_utsname()->machine))
reason = "machine";
if (reason) {
printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b9bdf0f..4c13acb 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -367,7 +367,7 @@ static int __init ic_defaults(void)
*/

if (!ic_host_name_set)
- sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr));
+ sprintf(init_utsname()->nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr));

if (root_server_addr == INADDR_NONE)
root_server_addr = ic_servaddr;
diff --git a/sound/core/info_oss.c b/sound/core/info_oss.c
index f9ce854..35662bb 100644
--- a/sound/core/info_oss.c
+++ b/sound/core/info_oss.c
@@ -94,11 +94,11 @@ static void snd_sndstat_proc_read(struct
{
snd_iprintf(buffer, "Sound Driver:3.8.1a-980706 (ALSA v" CONFIG_SND_VERSION " emulation code)\n");
snd_iprintf(buffer, "Kernel: %s %s %s %s %s\n",
- system_utsname.sysname,
- system_utsname.nodename,
- system_utsname.release,
- system_utsname.version,
- system_utsname.machine);
+ init_utsname()->sysname,
+ init_utsname()->nodename,
+ init_utsname()->release,
+ init_utsname()->version,
+ init_utsname()->machine);
snd_iprintf(buffer, "Config options: 0\n");
snd_iprintf(buffer, "\nInstalled drivers: \n");
snd_iprintf(buffer, "Type 10: ALSA emulation\n");
--
1.1.6

2006-05-18 15:49:31

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 3/9] namespaces: utsname: introduce temporary helpers

Define utsname() and init_utsname() which return &system_utsname.
Users of system_utsname will be changed to use these helpers, after
which system_utsname will disappear.

Signed-off-by: Serge E. Hallyn <[email protected]>

---

include/linux/utsname.h | 10 ++++++++++
1 files changed, 10 insertions(+), 0 deletions(-)

f409a5279295edf372d670452fd84199fd6ba7e5
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 13e1da0..77e97a5 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -32,5 +32,15 @@ struct new_utsname {

extern struct new_utsname system_utsname;

+static inline struct new_utsname *utsname(void)
+{
+ return &system_utsname;
+}
+
+static inline struct new_utsname *init_utsname(void)
+{
+ return &system_utsname;
+}
+
extern struct rw_semaphore uts_sem;
#endif
--
1.1.6

2006-05-18 15:50:55

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 7/9] namespaces: utsname: sysctl hack

Sysctl uts patch. This clearly will need to be done another way, but
since sysctl itself needs to be container aware, 'the right thing' is
a separate patchset.

Signed-off-by: Serge E. Hallyn <[email protected]>

---

kernel/sysctl.c | 20 ++++++++++----------
1 files changed, 10 insertions(+), 10 deletions(-)

e03808f9c1b803ff67e396e806c062c97b4073aa
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e82726f..ab36b41 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -233,8 +233,8 @@ static ctl_table kern_table[] = {
{
.ctl_name = KERN_OSTYPE,
.procname = "ostype",
- .data = system_utsname.sysname,
- .maxlen = sizeof(system_utsname.sysname),
+ .data = init_uts_ns.name.sysname,
+ .maxlen = sizeof(init_uts_ns.name.sysname),
.mode = 0444,
.proc_handler = &proc_doutsstring,
.strategy = &sysctl_string,
@@ -242,8 +242,8 @@ static ctl_table kern_table[] = {
{
.ctl_name = KERN_OSRELEASE,
.procname = "osrelease",
- .data = system_utsname.release,
- .maxlen = sizeof(system_utsname.release),
+ .data = init_uts_ns.name.release,
+ .maxlen = sizeof(init_uts_ns.name.release),
.mode = 0444,
.proc_handler = &proc_doutsstring,
.strategy = &sysctl_string,
@@ -251,8 +251,8 @@ static ctl_table kern_table[] = {
{
.ctl_name = KERN_VERSION,
.procname = "version",
- .data = system_utsname.version,
- .maxlen = sizeof(system_utsname.version),
+ .data = init_uts_ns.name.version,
+ .maxlen = sizeof(init_uts_ns.name.version),
.mode = 0444,
.proc_handler = &proc_doutsstring,
.strategy = &sysctl_string,
@@ -260,8 +260,8 @@ static ctl_table kern_table[] = {
{
.ctl_name = KERN_NODENAME,
.procname = "hostname",
- .data = system_utsname.nodename,
- .maxlen = sizeof(system_utsname.nodename),
+ .data = init_uts_ns.name.nodename,
+ .maxlen = sizeof(init_uts_ns.name.nodename),
.mode = 0644,
.proc_handler = &proc_doutsstring,
.strategy = &sysctl_string,
@@ -269,8 +269,8 @@ static ctl_table kern_table[] = {
{
.ctl_name = KERN_DOMAINNAME,
.procname = "domainname",
- .data = system_utsname.domainname,
- .maxlen = sizeof(system_utsname.domainname),
+ .data = init_uts_ns.name.domainname,
+ .maxlen = sizeof(init_uts_ns.name.domainname),
.mode = 0644,
.proc_handler = &proc_doutsstring,
.strategy = &sysctl_string,
--
1.1.6

2006-05-18 15:51:35

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 9/9] namespaces: utsname: implement CLONE_NEWUTS flag

Implement a CLONE_NEWUTS flag, and use it at clone and sys_unshare.

Signed-off-by: Serge Hallyn <[email protected]>

---

include/linux/sched.h | 1 +
include/linux/utsname.h | 7 ++++++
kernel/fork.c | 20 +++++++++++++++---
kernel/nsproxy.c | 2 +-
kernel/utsname.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 79 insertions(+), 4 deletions(-)

8d097a939e5b69f068665d99a68d2deb13811d75
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3332d5e..55671b2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -62,6 +62,7 @@ struct exec_domain;
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
+#define CLONE_NEWUTS 0x04000000 /* New utsname group? */

/*
* List of flags we want to share for kernel threads,
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 15dafa9..0d500fe 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -47,6 +47,8 @@ static inline void get_uts_ns(struct uts
}

#ifdef CONFIG_UTS_NS
+extern int unshare_utsname(unsigned long unshare_flags,
+ struct uts_namespace **new_uts);
extern int copy_utsname(int flags, struct task_struct *tsk);
extern void free_uts_ns(struct kref *kref);

@@ -64,6 +66,11 @@ static inline void exit_utsname(struct t
}

#else
+static inline int unshare_utsname(unsigned long unshare_flags,
+ struct uts_namespace **new_uts)
+{
+ return -EINVAL;
+}
static inline int copy_utsname(int flags, struct task_struct *tsk)
{
return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index ddab7e7..cdc549e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1560,13 +1560,14 @@ asmlinkage long sys_unshare(unsigned lon
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
struct nsproxy *new_nsproxy, *old_nsproxy;
+ struct uts_namespace *uts, *new_uts = NULL;

check_unshare_flags(&unshare_flags);

/* Return -EINVAL for all unsupported flags */
err = -EINVAL;
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
- CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|CLONE_NEWUTS))
goto bad_unshare_out;

if ((err = unshare_thread(unshare_flags)))
@@ -1583,14 +1584,17 @@ asmlinkage long sys_unshare(unsigned lon
goto bad_unshare_cleanup_vm;
if ((err = unshare_semundo(unshare_flags, &new_ulist)))
goto bad_unshare_cleanup_fd;
+ if ((err = unshare_utsname(unshare_flags, &new_uts)))
+ goto bad_unshare_cleanup_semundo;

- if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
+ if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+ new_uts) {

old_nsproxy = current->nsproxy;
new_nsproxy = dup_namespaces(old_nsproxy);
if (!new_nsproxy) {
err = -ENOMEM;
- goto bad_unshare_cleanup_semundo;
+ goto bad_unshare_cleanup_uts;
}

task_lock(current);
@@ -1629,10 +1633,20 @@ asmlinkage long sys_unshare(unsigned lon
new_fd = fd;
}

+ if (new_uts) {
+ uts = current->nsproxy->uts_ns;
+ current->nsproxy->uts_ns = new_uts;
+ new_uts = uts;
+ }
+
task_unlock(current);
put_nsproxy(old_nsproxy);
}

+bad_unshare_cleanup_uts:
+ if (new_uts)
+ put_uts_ns(new_uts);
+
bad_unshare_cleanup_semundo:
bad_unshare_cleanup_fd:
if (new_fd)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index d2c6e94..f958551 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,7 +80,7 @@ int copy_namespaces(int flags, struct ta

get_nsproxy(old_ns);

- if (!(flags & CLONE_NEWNS))
+ if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS)))
return 0;

new_ns = clone_namespaces(old_ns);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 2818c9b..2c45490 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -16,6 +16,41 @@
#include <linux/version.h>

/*
+ * Clone a new ns copying an original utsname, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return NULL on error (failure to kmalloc), new ns otherwise
+ */
+struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+{
+ struct uts_namespace *ns;
+
+ ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+ if (ns) {
+ memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+ kref_init(&ns->kref);
+ }
+ return ns;
+}
+
+/*
+ * unshare the current process' utsname namespace.
+ * called only in sys_unshare()
+ */
+int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
+{
+ if (unshare_flags & CLONE_NEWUTS) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ *new_uts = clone_uts_ns(current->nsproxy->uts_ns);
+ if (!*new_uts)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
* Copy task tsk's utsname namespace, or clone it if flags
* specifies CLONE_NEWUTS. In latter case, changes to the
* utsname of this process won't be seen by parent, and vice
@@ -24,6 +59,7 @@
int copy_utsname(int flags, struct task_struct *tsk)
{
struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
+ struct uts_namespace *new_ns;
int err = 0;

if (!old_ns)
@@ -31,6 +67,23 @@ int copy_utsname(int flags, struct task_

get_uts_ns(old_ns);

+ if (!(flags & CLONE_NEWUTS))
+ return 0;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ new_ns = clone_uts_ns(old_ns);
+ if (!new_ns) {
+ err = -ENOMEM;
+ goto out;
+ }
+ tsk->nsproxy->uts_ns = new_ns;
+
+out:
+ put_uts_ns(old_ns);
return err;
}

--
1.1.6

2006-05-18 15:51:06

by Serge E. Hallyn

[permalink] [raw]
Subject: [PATCH 8/9] namespaces: utsname: remove system_utsname

The system_utsname isn't needed now that kernel/sysctl.c is fixed.
Nuke it.

Signed-off-by: Serge E. Hallyn <[email protected]>

---

include/linux/utsname.h | 2 --
1 files changed, 0 insertions(+), 2 deletions(-)

8b2614c2cad35f261afe2fde3fec8a393126e095
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index e6120e7..15dafa9 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -86,7 +86,5 @@ static inline struct new_utsname *init_u
return &init_uts_ns.name;
}

-#define system_utsname init_uts_ns.name
-
extern struct rw_semaphore uts_sem;
#endif
--
1.1.6

2006-05-18 17:35:35

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

"Serge E. Hallyn" <[email protected]> wrote:
>
> This patchset introduces a per-process utsname namespace. These can
> be used by openvz, vserver, and application migration to virtualize and
> isolate utsname info (i.e. hostname). More resources will follow, until
> hopefully most or all vserver and openvz functionality can be implemented
> by controlling resource namespaces from userspace.
>

Generally, I think that the whole approach of virtualising the OS so it can
run multiple independent instances of userspace is a good one. It's an
extension and a strengthening of things which Linux is already doing and
it pushes further along a path we've been taking for many years. If done
right, it's even possible that each of these featurettes could improve the
kernel in its own right - better layering, separation, etc.

The approach which you appear to be taking is to separate the bits of
functionality apart and to present them as separate works each of which is
reviewed-by, acceptable-to and will-be-used-by all of the interested
projects. That's ideal, and is very much appreciated.


All of which begs the question "now what?".

What we do _not_ want to do is to merge up a pile of infrastructural stuff
which never gets used. On the other hand, we don't want to be in a
position where nothing is merged into mainline until the entirety of
vserver &&/|| openvz is ready to be merged.

I see two ways of justifying a mainline merge of things such as this

a) We make an up-front decision that Linux _will_ have OS-virtualisation
capability in the future and just start putting in place the pieces for
that, even if some of them are not immediately useful.

I suspect that'd be acceptable, although I worry that we'd get
partway through and some issues would come up which are irreconcilable
amongst the various groups.

It would help set minds at ease if someone could produce a
bullet-point list of what features the kernel will need to get it to the
stage where "most or all vserver and openvz functionality can be
implemented by controlling resource namespaces from userspace." Then we
can discuss that list, make sure that everyone's pretty much in
agreement.

It would be good if that list were to identify which features are
useful to Linux in their own right, and which ones only make sense within
a whole virtualise-the-OS setup.

b) Only merge into mainline those features which make sense in a
standalone fashion. eg, we don't merge this patchset unless the
"per-process utsname namespace" feature is useful to and usable by a
sufficiently broad group of existing Linux users.

I suspect this will be a difficult approach.

The third way would be to buffer it all up in -mm until everything is
sufficiently in place and then slam it all in. That might not be feasible
for various reasons - please advise..

A fourth way would be for someone over there to run a git tree - you all
happily work away, I redistribute it in -mm for testing and one day it's
all ready to merge. I don't really like this approach. It ends up meaning
that nobody else reviews the new code, nobody else understands what it's
doing, etc. It's generally subversive of the way we do things.

Eric, Kirill, Herbert: let us know your thoughts, please.

2006-05-18 19:25:11

by John Kelly

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Thu, 18 May 2006 10:34:30 -0700, Andrew Morton <[email protected]>
wrote:

>I see two ways of justifying a mainline merge of things such as this

>a) We make an up-front decision that Linux _will_ have OS-virtualisation
> capability in the future

After using OpenVZ for a short time, I wonder how I ever managed
without it. For application development and testing, having a little
sandbox with only a few PIDs running makes it easier to debug things.


> and just start putting in place the pieces for that, even if some
> of them are not immediately useful. I suspect that'd be acceptable,
> although I worry that we'd get partway through and some issues would
> come up which are irreconcilable amongst the various groups.

>From a user's POV, I want it ASAP. As for conflicts, why not cross
that bridge when you come to it?


2006-05-18 23:03:20

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH 8/9] namespaces: utsname: remove system_utsname

Serge E. Hallyn writes:

> The system_utsname isn't needed now that kernel/sysctl.c is fixed.
> Nuke it.

You don't seem to have grepped for existing uses of system_utsname, of
which there are a bunch under arch/powerpc at least...

Paul.

2006-05-18 23:04:53

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH 8/9] namespaces: utsname: remove system_utsname

I wrote:

> > The system_utsname isn't needed now that kernel/sysctl.c is fixed.
> > Nuke it.
>
> You don't seem to have grepped for existing uses of system_utsname, of
> which there are a bunch under arch/powerpc at least...

Ignore me, I saw this patch before the others in your series...

Paul.

2006-05-18 23:29:05

by Sam Vilain

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Andrew Morton wrote:

>Generally, I think that the whole approach of virtualising the OS so it can
>run multiple independent instances of userspace is a good one.
>[...]
>All of which begs the question "now what?".
>[...]
> It would help set minds at ease if someone could produce a
> bullet-point list of what features the kernel will need to get it to the
> stage where "most or all vserver and openvz functionality can be
> implemented by controlling resource namespaces from userspace." Then we
> can discuss that list, make sure that everyone's pretty much in
> agreement.
>
>

This is a heartening position to hear from someone such as yourself; we
seem to be at a near consensus of the way forward.

Here's a list based on the one I came up with when I originally started
my line of development, which got shot down so badly it lost a few
priority points on my workqueue scheduler :-).

0. features that don't need namespaces per se

a. Bind Mount Options (mount --bind -o ro, etc)
b. FS - immutable linkage invert (immulink)

1. core vserver patch - no features (this stuff is succeeded by Serge's
set)
a. struct and ps addition; internal API and refcounting
b. syscall, and switch (to be canned)
c. /proc visibility
d. debugging
e. history

2. isolation features

a. IPC, semaphore, and signal restrictions
b. proc/array filtering
c. IPv4 chbind
d. FS chroot() barrier
e. general /proc filtering
f. ptrace
g. process admin: alloc_uid, find_user, sys_setpriority

3. virtualisation features

a. uts information
b. initpid virtualisation
c. uptime
d. time
e. load average
f. ksyslog
g. vshelper (reboot support)
h. vroot (quota, fs IOCTL, etc)
i. general PID virtualisation (eric)
j. ngnet (network stack virtualisation)

4. resource tracking features

a. scheduler tracking hook
b. FS namespace counting
c. FS namespace tagging
d. ulimits
e. RSS usage
f. IO - async tracking

5. resource sharing features

a. scheduling v1 - TBF and vavavoom
b. disk scheduler integration
c. RSS limits
d. FS - mad cow

6. resource limit features

a. scheduler
b. rlimits
c. disklimits

7. super whizzy features

a. Namespace checkpointing
b. Namespace migration
c. HA Cluster Computing (think Tandem)

Can anyone see any that are missed?

As far as how it is tested etc, I have no particular preferences,
whatever people are happy with. I'll continue to track submissions in
the utsl.gen.nz repository:

http://utsl.gen.nz/gitweb/?p=vserver

I'll import Serge's new submission there now.

Sam.

2006-05-18 23:43:39

by Sam Vilain

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Sam Vilain wrote:

>whatever people are happy with. I'll continue to track submissions in
>the utsl.gen.nz repository:
>
>http://utsl.gen.nz/gitweb/?p=vserver
>
>I'll import Serge's new submission there now.
>
>

Ok, that's done. Applies cleanly against 2.6.17-rc4

Sam.

2006-05-19 00:00:09

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

On Thu, 18 May 2006 10:49:36 -0500 Serge E. Hallyn wrote:

> Replace references to system_utsname to the per-process uts namespace
> where appropriate. This includes things like uname.
>
> Changes: Per Eric Biederman's comments, use the per-process uts namespace
> for ELF_PLATFORM, sunrpc, and parts of net/ipv4/ipconfig.c
>
> Signed-off-by: Serge E. Hallyn <[email protected]>
>
> ---
>
> 9ee063adf4d2287583dbb0a71d1d5f80d7ae011f
> diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c
> index 8fdb1fb..4af731d 100644
> --- a/arch/i386/kernel/sys_i386.c
> +++ b/arch/i386/kernel/sys_i386.c
> @@ -210,7 +210,7 @@ asmlinkage int sys_uname(struct old_utsn
> if (!name)
> return -EFAULT;
> down_read(&uts_sem);
> - err=copy_to_user(name, &system_utsname, sizeof (*name));
> + err=copy_to_user(name, utsname(), sizeof (*name));

It would be really nice if you would fix spacing while you are here,
like a space on each side of '='.

and a space after ',' in the function calls below.

> up_read(&uts_sem);
> return err?-EFAULT:0;
> }
> @@ -226,15 +226,15 @@ asmlinkage int sys_olduname(struct oldol
>
> down_read(&uts_sem);
>
> - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
> + error = __copy_to_user(&name->sysname,&utsname()->sysname,__OLD_UTS_LEN);
> error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
> - error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
> + error |= __copy_to_user(&name->nodename,&utsname()->nodename,__OLD_UTS_LEN);
> error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
> - error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
> + error |= __copy_to_user(&name->release,&utsname()->release,__OLD_UTS_LEN);
> error |= __put_user(0,name->release+__OLD_UTS_LEN);
> - error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
> + error |= __copy_to_user(&name->version,&utsname()->version,__OLD_UTS_LEN);
> error |= __put_user(0,name->version+__OLD_UTS_LEN);
> - error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
> + error |= __copy_to_user(&name->machine,&utsname()->machine,__OLD_UTS_LEN);
> error |= __put_user(0,name->machine+__OLD_UTS_LEN);
>
> up_read(&uts_sem);
> diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c
> index 670cb49..11412c0 100644
> --- a/arch/m32r/kernel/sys_m32r.c
> +++ b/arch/m32r/kernel/sys_m32r.c
> @@ -206,7 +206,7 @@ asmlinkage int sys_uname(struct old_utsn
> if (!name)
> return -EFAULT;
> down_read(&uts_sem);
> - err=copy_to_user(name, &system_utsname, sizeof (*name));
> + err=copy_to_user(name, utsname(), sizeof (*name));

spacing

> up_read(&uts_sem);
> return err?-EFAULT:0;
> }
> diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c
> index a7d2bb3..66f999b 100644
> --- a/arch/mips/kernel/linux32.c
> +++ b/arch/mips/kernel/linux32.c
> @@ -1040,7 +1040,7 @@ asmlinkage long sys32_newuname(struct ne
> int ret = 0;
>
> down_read(&uts_sem);
> - if (copy_to_user(name,&system_utsname,sizeof *name))
> + if (copy_to_user(name,utsname(),sizeof *name))

spacing

> ret = -EFAULT;
> up_read(&uts_sem);
>
> diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
> index 2aeaa2f..8b13d57 100644
> --- a/arch/mips/kernel/syscall.c
> +++ b/arch/mips/kernel/syscall.c
> @@ -232,7 +232,7 @@ out:
> */
> asmlinkage int sys_uname(struct old_utsname __user * name)
> {
> - if (name && !copy_to_user(name, &system_utsname, sizeof (*name)))
> + if (name && !copy_to_user(name, utsname(), sizeof (*name)))


OK, here's my big comment/question. I want to see <nodename> increased to
256 bytes (per current POSIX), so each field of struct <variant>_utsname
needs to be copied individually (I think) instead of doing a single
struct copy.

I've been working on this for the past few weeks (among other
things). Sorry about the timing.
I could send patches for this against mainline in a few days,
but I'll be glad to listen to how it would be easiest for all of us
to handle.

I'm probably a little over half done with my patches.
They will end up adding a lib/utsname.c that has functions for:
put_oldold_uname() // to user
put_old_uname() // to user
put_new_uname() // to user
put_posix_uname() // to user



> return 0;
> return -EFAULT;
> }
> @@ -249,15 +249,15 @@ asmlinkage int sys_olduname(struct oldol
> if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
> return -EFAULT;
>
> - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
> + error = __copy_to_user(&name->sysname,&utsname()->sysname,__OLD_UTS_LEN);
> error -= __put_user(0,name->sysname+__OLD_UTS_LEN);
> - error -= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
> + error -= __copy_to_user(&name->nodename,&utsname()->nodename,__OLD_UTS_LEN);
> error -= __put_user(0,name->nodename+__OLD_UTS_LEN);
> - error -= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
> + error -= __copy_to_user(&name->release,&utsname()->release,__OLD_UTS_LEN);
> error -= __put_user(0,name->release+__OLD_UTS_LEN);
> - error -= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
> + error -= __copy_to_user(&name->version,&utsname()->version,__OLD_UTS_LEN);
> error -= __put_user(0,name->version+__OLD_UTS_LEN);
> - error -= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
> + error -= __copy_to_user(&name->machine,&utsname()->machine,__OLD_UTS_LEN);
> error = __put_user(0,name->machine+__OLD_UTS_LEN);
> error = error ? -EFAULT : 0;

spaces


> diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c
> index 05273cc..9fc2c08 100644
> --- a/arch/parisc/hpux/sys_hpux.c
> +++ b/arch/parisc/hpux/sys_hpux.c
> @@ -266,15 +266,15 @@ static int hpux_uname(struct hpux_utsnam
>
> down_read(&uts_sem);
>
> - error = __copy_to_user(&name->sysname,&system_utsname.sysname,HPUX_UTSLEN-1);
> + error = __copy_to_user(&name->sysname,&utsname()->sysname,HPUX_UTSLEN-1);
> error |= __put_user(0,name->sysname+HPUX_UTSLEN-1);
> - error |= __copy_to_user(&name->nodename,&system_utsname.nodename,HPUX_UTSLEN-1);
> + error |= __copy_to_user(&name->nodename,&utsname()->nodename,HPUX_UTSLEN-1);
> error |= __put_user(0,name->nodename+HPUX_UTSLEN-1);
> - error |= __copy_to_user(&name->release,&system_utsname.release,HPUX_UTSLEN-1);
> + error |= __copy_to_user(&name->release,&utsname()->release,HPUX_UTSLEN-1);
> error |= __put_user(0,name->release+HPUX_UTSLEN-1);
> - error |= __copy_to_user(&name->version,&system_utsname.version,HPUX_UTSLEN-1);
> + error |= __copy_to_user(&name->version,&utsname()->version,HPUX_UTSLEN-1);
> error |= __put_user(0,name->version+HPUX_UTSLEN-1);
> - error |= __copy_to_user(&name->machine,&system_utsname.machine,HPUX_UTSLEN-1);
> + error |= __copy_to_user(&name->machine,&utsname()->machine,HPUX_UTSLEN-1);
> error |= __put_user(0,name->machine+HPUX_UTSLEN-1);

spacing

> up_read(&uts_sem);

> diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c
> index 917b2f3..e4966b2 100644
> --- a/arch/sh/kernel/sys_sh.c
> +++ b/arch/sh/kernel/sys_sh.c
> @@ -267,7 +267,7 @@ asmlinkage int sys_uname(struct old_utsn
> if (!name)
> return -EFAULT;
> down_read(&uts_sem);
> - err=copy_to_user(name, &system_utsname, sizeof (*name));
> + err=copy_to_user(name, utsname(), sizeof (*name));

spacing

> up_read(&uts_sem);
> return err?-EFAULT:0;
> }
> diff --git a/arch/sh64/kernel/sys_sh64.c b/arch/sh64/kernel/sys_sh64.c
> index 58ff7d5..a8dc88c 100644
> --- a/arch/sh64/kernel/sys_sh64.c
> +++ b/arch/sh64/kernel/sys_sh64.c
> @@ -279,7 +279,7 @@ asmlinkage int sys_uname(struct old_utsn
> if (!name)
> return -EFAULT;
> down_read(&uts_sem);
> - err=copy_to_user(name, &system_utsname, sizeof (*name));
> + err=copy_to_user(name, utsname(), sizeof (*name));

spacing

> up_read(&uts_sem);
> return err?-EFAULT:0;
> }
> diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c
> index 288de27..9f9206f 100644
> --- a/arch/sparc/kernel/sys_sunos.c
> +++ b/arch/sparc/kernel/sys_sunos.c
> @@ -483,13 +483,13 @@ asmlinkage int sunos_uname(struct sunos_
> {
> int ret;
> down_read(&uts_sem);
> - ret = copy_to_user(&name->sname[0], &system_utsname.sysname[0], sizeof(name->sname) - 1);
> + ret = copy_to_user(&name->sname[0], &utsname()->sysname[0], sizeof(name->sname) - 1);
> if (!ret) {
> - ret |= __copy_to_user(&name->nname[0], &system_utsname.nodename[0], sizeof(name->nname) - 1);
> + ret |= __copy_to_user(&name->nname[0], &utsname()->nodename[0], sizeof(name->nname) - 1);
> ret |= __put_user('\0', &name->nname[8]);
> - ret |= __copy_to_user(&name->rel[0], &system_utsname.release[0], sizeof(name->rel) - 1);
> - ret |= __copy_to_user(&name->ver[0], &system_utsname.version[0], sizeof(name->ver) - 1);
> - ret |= __copy_to_user(&name->mach[0], &system_utsname.machine[0], sizeof(name->mach) - 1);
> + ret |= __copy_to_user(&name->rel[0], &utsname()->release[0], sizeof(name->rel) - 1);
> + ret |= __copy_to_user(&name->ver[0], &utsname()->version[0], sizeof(name->ver) - 1);
> + ret |= __copy_to_user(&name->mach[0], &utsname()->machine[0], sizeof(name->mach) - 1);

Oh, please limit to 80 column width while you are here (+ other places).

> }
> up_read(&uts_sem);
> return ret ? -EFAULT : 0;

> diff --git a/arch/um/kernel/syscall_kern.c b/arch/um/kernel/syscall_kern.c
> index 37d3978..d90e9ed 100644
> --- a/arch/um/kernel/syscall_kern.c
> +++ b/arch/um/kernel/syscall_kern.c
> @@ -110,7 +110,7 @@ long sys_uname(struct old_utsname __user
> if (!name)
> return -EFAULT;
> down_read(&uts_sem);
> - err=copy_to_user(name, &system_utsname, sizeof (*name));
> + err=copy_to_user(name, utsname(), sizeof (*name));

spacing

> up_read(&uts_sem);
> return err?-EFAULT:0;
> }
> @@ -126,19 +126,19 @@ long sys_olduname(struct oldold_utsname
>
> down_read(&uts_sem);
>
> - error = __copy_to_user(&name->sysname,&system_utsname.sysname,
> + error = __copy_to_user(&name->sysname,&utsname()->sysname,
> __OLD_UTS_LEN);
> error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
> - error |= __copy_to_user(&name->nodename,&system_utsname.nodename,
> + error |= __copy_to_user(&name->nodename,&utsname()->nodename,
> __OLD_UTS_LEN);
> error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
> - error |= __copy_to_user(&name->release,&system_utsname.release,
> + error |= __copy_to_user(&name->release,&utsname()->release,
> __OLD_UTS_LEN);
> error |= __put_user(0,name->release+__OLD_UTS_LEN);
> - error |= __copy_to_user(&name->version,&system_utsname.version,
> + error |= __copy_to_user(&name->version,&utsname()->version,
> __OLD_UTS_LEN);
> error |= __put_user(0,name->version+__OLD_UTS_LEN);
> - error |= __copy_to_user(&name->machine,&system_utsname.machine,
> + error |= __copy_to_user(&name->machine,&utsname()->machine,
> __OLD_UTS_LEN);
> error |= __put_user(0,name->machine+__OLD_UTS_LEN);

spacing

> diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
> index f182b20..6e0a19d 100644
> --- a/arch/x86_64/ia32/sys_ia32.c
> +++ b/arch/x86_64/ia32/sys_ia32.c
> @@ -801,13 +801,13 @@ asmlinkage long sys32_olduname(struct ol
>
> down_read(&uts_sem);
>
> - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
> + error = __copy_to_user(&name->sysname,&utsname()->sysname,__OLD_UTS_LEN);
> __put_user(0,name->sysname+__OLD_UTS_LEN);
> - __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
> + __copy_to_user(&name->nodename,&utsname()->nodename,__OLD_UTS_LEN);
> __put_user(0,name->nodename+__OLD_UTS_LEN);
> - __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
> + __copy_to_user(&name->release,&utsname()->release,__OLD_UTS_LEN);
> __put_user(0,name->release+__OLD_UTS_LEN);
> - __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
> + __copy_to_user(&name->version,&utsname()->version,__OLD_UTS_LEN);
> __put_user(0,name->version+__OLD_UTS_LEN);

spacing

> {
> char *arch = "x86_64";
> @@ -830,7 +830,7 @@ long sys32_uname(struct old_utsname __us
> if (!name)
> return -EFAULT;
> down_read(&uts_sem);
> - err=copy_to_user(name, &system_utsname, sizeof (*name));
> + err=copy_to_user(name, utsname(), sizeof (*name));

ditto

> up_read(&uts_sem);
> if (personality(current->personality) == PER_LINUX32)
> err |= copy_to_user(&name->machine, "i686", 5);

> diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
> index d2ec806..b6c0886 100644
> --- a/fs/cifs/connect.c
> +++ b/fs/cifs/connect.c
> @@ -765,12 +765,12 @@ cifs_parse_mount_options(char *options,
> separator[1] = 0;
>
> memset(vol->source_rfc1001_name,0x20,15);
> - for(i=0;i < strnlen(system_utsname.nodename,15);i++) {
> + for(i=0;i < strnlen(utsname()->nodename,15);i++) {

spacing

> /* does not have to be a perfect mapping since the field is
> informational, only used for servers that do not support
> port 445 and it can be overridden at mount time */
> vol->source_rfc1001_name[i] =
> - toupper(system_utsname.nodename[i]);
> + toupper(utsname()->nodename[i]);
> }
> vol->source_rfc1001_name[15] = 0;
> /* null target name indicates to use *SMBSERVR default called name

> diff --git a/kernel/sys.c b/kernel/sys.c
> index 0b6ec0e..bcaa48e 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1671,7 +1671,7 @@ asmlinkage long sys_newuname(struct new_
> int errno = 0;
>
> down_read(&uts_sem);
> - if (copy_to_user(name,&system_utsname,sizeof *name))
> + if (copy_to_user(name,utsname(),sizeof *name))

spacing

> errno = -EFAULT;
> up_read(&uts_sem);
> return errno;

Thanks,
---
~Randy

2006-05-19 02:21:24

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

Quoting Randy.Dunlap ([email protected]):
> > --- a/arch/i386/kernel/sys_i386.c
> > +++ b/arch/i386/kernel/sys_i386.c
> > @@ -210,7 +210,7 @@ asmlinkage int sys_uname(struct old_utsn
> > if (!name)
> > return -EFAULT;
> > down_read(&uts_sem);
> > - err=copy_to_user(name, &system_utsname, sizeof (*name));
> > + err=copy_to_user(name, utsname(), sizeof (*name));
>
> It would be really nice if you would fix spacing while you are here,
> like a space a each side of '='.
>
> and a space after ',' in the function calls below.

Ok. Then in blocks like the following:

> > - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
> > + error = __copy_to_user(&name->sysname,&utsname()->sysname,__OLD_UTS_LEN);
> > error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
> > - error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
> > + error |= __copy_to_user(&name->nodename,&utsname()->nodename,__OLD_UTS_LEN);
> > error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
> > - error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
> > + error |= __copy_to_user(&name->release,&utsname()->release,__OLD_UTS_LEN);
> > error |= __put_user(0,name->release+__OLD_UTS_LEN);
> > - error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
> > + error |= __copy_to_user(&name->version,&utsname()->version,__OLD_UTS_LEN);
> > error |= __put_user(0,name->version+__OLD_UTS_LEN);
> > - error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
> > + error |= __copy_to_user(&name->machine,&utsname()->machine,__OLD_UTS_LEN);
> > error |= __put_user(0,name->machine+__OLD_UTS_LEN);

Should I leave it as is, to keep the consistent look? Change just the
lines I'm editing, making it inconsistent? Or change the whole block,
making my patch seem a bit larger than it really is, but giving the
nicest end result?

I suppose I could insert a separate patchset fixing up the spacing in
those blocks but making no real changes at all, then apply my patch on
top of that...?

> > --- a/arch/mips/kernel/syscall.c
> > +++ b/arch/mips/kernel/syscall.c
> > @@ -232,7 +232,7 @@ out:
> > */
> > asmlinkage int sys_uname(struct old_utsname __user * name)
> > {
> > - if (name && !copy_to_user(name, &system_utsname, sizeof (*name)))
> > + if (name && !copy_to_user(name, utsname(), sizeof (*name)))
>
>
> OK, here's my big comment/question. I want to see <nodename> increased to
> 256 bytes (per current POSIX), so each field of struct <variant>_utsname
> needs be copied individually (I think) instead of doing a single
> struct copy.
>
> I've been working on this for the past few weeks (among other
> things). Sorry about the timing.
> I could send patches for this against mainline in a few days,
> but I'll be glad to listen to how it would be easiest for all of us
> to handle.
>
> I'm probably a little over half done with my patches.
> They will end up adding a lib/utsname.c that has functions for:
> put_oldold_uname() // to user
> put_old_uname() // to user
> put_new_uname() // to user
> put_posix_uname() // to user

Ok, so long as these functions accept a utsname, we should be able to
just change what we pass in to these functions to being the namespace's
utsname, right? Or am I missing the really nasty part?

thanks,
-serge

2006-05-19 02:42:34

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

On Thu, 18 May 2006 21:21:14 -0500 Serge E. Hallyn wrote:

> Quoting Randy.Dunlap ([email protected]):
> > > --- a/arch/i386/kernel/sys_i386.c
> > > +++ b/arch/i386/kernel/sys_i386.c
> > > @@ -210,7 +210,7 @@ asmlinkage int sys_uname(struct old_utsn
> > > if (!name)
> > > return -EFAULT;
> > > down_read(&uts_sem);
> > > - err=copy_to_user(name, &system_utsname, sizeof (*name));
> > > + err=copy_to_user(name, utsname(), sizeof (*name));
> >
> > It would be really nice if you would fix spacing while you are here,
> > like a space on each side of '='.
> >
> > and a space after ',' in the function calls below.
>
> Ok. Then in blocks like the following:
>
> > > - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
> > > + error = __copy_to_user(&name->sysname,&utsname()->sysname,__OLD_UTS_LEN);
> > > error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
> > > - error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
> > > + error |= __copy_to_user(&name->nodename,&utsname()->nodename,__OLD_UTS_LEN);
> > > error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
> > > - error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
> > > + error |= __copy_to_user(&name->release,&utsname()->release,__OLD_UTS_LEN);
> > > error |= __put_user(0,name->release+__OLD_UTS_LEN);
> > > - error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
> > > + error |= __copy_to_user(&name->version,&utsname()->version,__OLD_UTS_LEN);
> > > error |= __put_user(0,name->version+__OLD_UTS_LEN);
> > > - error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
> > > + error |= __copy_to_user(&name->machine,&utsname()->machine,__OLD_UTS_LEN);
> > > error |= __put_user(0,name->machine+__OLD_UTS_LEN);
>
> Should I leave it as is, to keep the consistent look? Change just the
> lines I'm editing, making it inconsistent? Or change the whole block,
> making my patch seem a bit larger than it really is, but giving the
> nicest end result?

I'd go for the latter, along with my other comment of breaking them
to fit into 80 columns also.

> I suppose I could insert a separate patchset fixing up the spacing in
> those blocks but making no real changes at all, then apply my patch on
> top of that...?
>
> > > --- a/arch/mips/kernel/syscall.c
> > > +++ b/arch/mips/kernel/syscall.c
> > > @@ -232,7 +232,7 @@ out:
> > > */
> > > asmlinkage int sys_uname(struct old_utsname __user * name)
> > > {
> > > - if (name && !copy_to_user(name, &system_utsname, sizeof (*name)))
> > > + if (name && !copy_to_user(name, utsname(), sizeof (*name)))
> >
> >
> > OK, here's my big comment/question. I want to see <nodename> increased to
> > 256 bytes (per current POSIX), so each field of struct <variant>_utsname
> > needs be copied individually (I think) instead of doing a single
> > struct copy.
> >
> > I've been working on this for the past few weeks (among other
> > things). Sorry about the timing.
> > I could send patches for this against mainline in a few days,
> > but I'll be glad to listen to how it would be easiest for all of us
> > to handle.
> >
> > I'm probably a little over half done with my patches.
> > They will end up adding a lib/utsname.c that has functions for:
> > put_oldold_uname() // to user
> > put_old_uname() // to user
> > put_new_uname() // to user
> > put_posix_uname() // to user
>
> Ok, so long as these functions accept a utsname, we should be able to
> just change what we pass in to these functions to being the namespace's
> utsname, right? Or am I missing the really nasty part?

The nodename field changes from 65 chars (struct new_utsname) to 256 chars
(struct posix_utsname), and nodename is not the final field in the
struct, so it's no longer safe to do a simple struct copy. Each
field in the struct needs to be copied individually if the target is
not a struct posix_utsname. It's not rocket science.

---
~Randy

2006-05-19 03:12:58

by Sam Vilain

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

Serge E. Hallyn wrote:

>I suppose I could insert a separate patchset fixing up the spacing in
>those blocks but making no real changes at all, then apply my patch on
>top of that...?
>
>

While you're fixing whitespace, patches 1 and 6 add trailing whitespace.
Use a current version of git to see the warnings.

Sam.

>
>
>>>--- a/arch/mips/kernel/syscall.c
>>>+++ b/arch/mips/kernel/syscall.c
>>>@@ -232,7 +232,7 @@ out:
>>> */
>>> asmlinkage int sys_uname(struct old_utsname __user * name)
>>> {
>>>- if (name && !copy_to_user(name, &system_utsname, sizeof (*name)))
>>>+ if (name && !copy_to_user(name, utsname(), sizeof (*name)))
>>>
>>>
>>OK, here's my big comment/question. I want to see <nodename> increased to
>>256 bytes (per current POSIX), so each field of struct <variant>_utsname
>>needs be copied individually (I think) instead of doing a single
>>struct copy.
>>
>>I've been working on this for the past few weeks (among other
>>things). Sorry about the timing.
>>I could send patches for this against mainline in a few days,
>>but I'll be glad to listen to how it would be easiest for all of us
>>to handle.
>>
>>I'm probably a little over half done with my patches.
>>They will end up adding a lib/utsname.c that has functions for:
>> put_oldold_uname() // to user
>> put_old_uname() // to user
>> put_new_uname() // to user
>> put_posix_uname() // to user
>>
>>
>
>Ok, so long as these functions accept a utsname, we should be able to
>just change what we pass in to these functions to being the namespace's
>utsname, right? Or am I missing the really nasty part?
>
>thanks,
>-serge
>
>

2006-05-19 04:24:42

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

> Can anyone see any that are missed?

I have no idea if this fits, as I am no virtual kernel wizard,
but how about various NUMA stuff, such as what CPUs and Memory
Nodes are online, and the three ways of controlling task and
memory placement on them:
sched_setaffinity/sched_getaffinity
set_mempolicy/get_mempolicy/mbind
/dev/cpuset

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2006-05-19 08:52:15

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

"Serge E. Hallyn" <[email protected]> writes:

> This patchset introduces a per-process utsname namespace. These can
> be used by openvz, vserver, and application migration to virtualize and
> isolate utsname info (i.e. hostname). More resources will follow, until
> hopefully most or all vserver and openvz functionality can be implemented
> by controlling resource namespaces from userspace.
>
> Previous utsname submissions placed a pointer to the utsname namespace
> straight in the task_struct. This patchset (and the last one) moves
> it and the filesystem namespace pointer into struct nsproxy, which is
> shared by processes sharing all namespaces. The intent is to keep
> the taskstruct smaller as the number of namespaces grows.


Previously you mentioned:
> BTW - a first set of comparison results showed nsproxy to have better
> dbench and tbench throughput, and worse kernbench performance. Which
> may make sense given that nsproxy results in lower memory usage but
> likely increased cache misses due to extra pointer dereference.

Is this still true? Or did our final reference counting tweak fix
the kernbench numbers?

I just want to be certain that we don't add an optimization,
that reduces performance.

> Changes:
> - the reference count on fs namespace and uts namespace now
> refers to the number of nsproxies pointing to it
> - some consolidation of namespace cloning and exit code to
> clean up kernel/{fork,exit}.c
> - passed ltp and ltpstress on smp power, x86, and x86-64
> boxes.

Nice.

Eric

2006-05-19 09:06:58

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

"Randy.Dunlap" <[email protected]> writes:

> On Thu, 18 May 2006 10:49:36 -0500 Serge E. Hallyn wrote:
>
>> Replace references to system_utsname to the per-process uts namespace
>> where appropriate. This includes things like uname.
>>
>> Changes: Per Eric Biederman's comments, use the per-process uts namespace
>> for ELF_PLATFORM, sunrpc, and parts of net/ipv4/ipconfig.c
>>
>> Signed-off-by: Serge E. Hallyn <[email protected]>

>
> OK, here's my big comment/question. I want to see <nodename> increased to
> 256 bytes (per current POSIX), so each field of struct <variant>_utsname
> needs be copied individually (I think) instead of doing a single
> struct copy.

Where is it specified? Looking at the spec (SUSv3), I don't see a size
specified for nodename.

> I've been working on this for the past few weeks (among other
> things). Sorry about the timing.
> I could send patches for this against mainline in a few days,
> but I'll be glad to listen to how it would be easiest for all of us
> to handle.
>
> I'm probably a little over half done with my patches.
> They will end up adding a lib/utsname.c that has functions for:
> put_oldold_uname() // to user
> put_old_uname() // to user
> put_new_uname() // to user
> put_posix_uname() // to user

Sounds reasonable, if we really need a 256 byte nodename.

As long as they take a pointer to the appropriate utsname
structure these patches should not fundamentally conflict.

Eric

2006-05-19 09:24:43

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Paul Jackson <[email protected]> writes:

>> Can anyone see any that are missed?
>
> I have no idea if this fits, as I am no virtual kernel wizard,
> but how about various NUMA stuff, such as what CPUs and Memory
> Nodes are online, and the three ways of controlling task and
> memory placement on them:
> sched_setaffinity/sched_getaffinity
> set_mempolicy/get_mempolicy/mbind
> /dev/cpuset

I expect especially on very large machines for some of this to be done
in conjunction with setting up the isolated instances of user space.
But anything actually touching the hardware is an independent dimension.

Eric

2006-05-19 11:43:47

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Andrew Morton <[email protected]> writes:

> All of which begs the question "now what?".

I think we are at the point where it is time to start merging patches
into -mm, and having the discussion on what the merge plans are
for the rest of this code.

> What we do _not_ want to do is to merge up a pile of infrastructural stuff
> which never gets used. On the other hand, we don't want to be in a
> position where nothing is merged into mainline until the entirety of
> vserver &&/|| openvz is ready to be merged.


The namespaces I see needed for a useable result are:
- fs namespace (already merged)
- uts namespace
- sysvipc namespace
- time namespace
- uid/gid (keys?) namespace
- network namespace
- pid namespace

> I see two ways of justifying a mainline merge of things such as this
>
> a) We make an up-front decision that Linux _will_ have OS-virtualisation
> capability in the future and just start putting in place the pieces for
> that, even if some of them are not immediately useful.
>
> I suspect that'd be acceptable, although I worry that we'd get
> partway through and some issues would come up which are irreconcilable
> amongst the various groups.

I think I see a third way of justifying a mainline merge. We make an
up-front decision that we will improve the existing chroot jail
functionality in Linux and start making improvements. Even if some of
the improvements are quite small.

Except for partial steps while the code is being refactored, we should
never have steps that are not immediately useful.

This reduces the danger of irreconcilable differences, because being
part way through is still useful.

The only namespace that I see as really contentious is the pid
namespace, and even there I don't think we have reached an impasse.
There remains a bunch of patches left to write that replace raw pid_t
values with struct pid references, but once that happens the patches
to implement the pid namespace will be small, and I don't see any
previous problems that we can't resolve when the conversation happens.

> It would help set minds at ease if someone could produce a
> bullet-point list of what features the kernel will need to get it to the
> stage where "most or all vserver and openvz functionality can be
> implemented by controlling resource namespaces from userspace." Then we
> can discuss that list, make sure that everyone's pretty much in
> agreement.

So this is slightly the wrong question. If you look at Sam's list you
will see that there are several independent dimensions to the complete
solution. Most of them dealing with the increase in the number of users
and the amount of work that is happening on a single kernel in this
context.

Basically we need to expect a lot of kernel tuning after we get the
basics working.

The proper question is: What needs to happen before we can run separate
user space instances?

The namespaces I have previously listed. There is also a lot of
cleanup work with sysctl, proc, sysfs, netlink and some other
fundamental interfaces that needs to happen as well. Until each
namespace gets merged we are in a race with other people looking
at enhancing those namespaces. So a complete list of what needs to be
fixed is impossible.


> b) Only merge into mainline those features which make sense in a
> standalone fashion. eg, we don't merge this patchset unless the
> "per-process utsname namespace" feature is useful to and usable by a
> sufficiently broad group of existing Linux users.
>
> I suspect this will be a difficult approach.

I agree if the feature must be useful and usable by a sufficiently
broad group of existing Linux users. Of course I suspect the current
fs namespace fails this test.

I would rather the criterion be that the functionality is well
defined and not detrimental to the rest of the users.

> The third way would be to buffer it all up in -mm until everything is
> sufficiently in place and then slam it all in. That might not be feasible
> for various reasons - please advise..

Fundamentally I don't think there are problems buffering things up in -mm,
but I worry that we would start having -mm too different from the
stable kernel at some point.

For some of the pieces like the networking stack we need to go through
the respective maintainers, and their development trees to avoid
conflicts. For the sysvipc and utsname namespaces we have avoided that
because they are absolutely trivial namespaces and they don't have
active maintainers.

> A fourth way would be for someone over there to run a git tree - you all
> happily work away, I redistribute it in -mm for testing and one day it's
> all ready to merge. I don't really like this approach. It ends up meaning
> that nobody else reviews the new code, nobody else understands what it's
> doing, etc. It's generally subversive of the way we do things.

The only part of this picture that might make sense is if we have a
process by which we can decide if patches are good and acceptable to
the various projects independent of deciding if they are good for
the kernel proper, which might take some of the burden off of the
rest of the kernel maintainers.

If we were working in an area of the kernel where we didn't affect
anyone else it would be business as usual and not really subversive.
But since we can't implement things this way I agree that this
code needs to be reviewed as much as possible.

> Eric, Kirill, Herbert: let us know your thoughts, please.

Eric

2006-05-19 12:00:13

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

"Randy.Dunlap" <[email protected]> writes:

> OK, here's my big comment/question. I want to see <nodename> increased to
> 256 bytes (per current POSIX), so each field of struct <variant>_utsname
> needs be copied individually (I think) instead of doing a single
> struct copy.
>
> I've been working on this for the past few weeks (among other
> things). Sorry about the timing.
> I could send patches for this against mainline in a few days,
> but I'll be glad to listen to how it would be easiest for all of us
> to handle.
>
> I'm probably a little over half done with my patches.
> They will end up adding a lib/utsname.c that has functions for:
> put_oldold_uname() // to user
> put_old_uname() // to user
> put_new_uname() // to user
> put_posix_uname() // to user

Looking at it, 256 at least makes sense to hold a DNS fully qualified domain
name. So even if it isn't specified by POSIX, it makes sense.

Can we please make the structure we return to user space look something
like:

struct long_utsname {
char *sysname;
char *nodename;
char *release;
char *version;
char *machine;
char *domainname;
char buf[0];
}

int sys_long_uname(char *buf, size_t bufsz);

So we don't hard code the maximum length of these strings into the user
interface, and can just return more by increasing our buffer size.

Eric

2006-05-19 12:42:37

by Herbert Poetzl

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Thu, May 18, 2006 at 10:34:30AM -0700, Andrew Morton wrote:
> "Serge E. Hallyn" <[email protected]> wrote:
> >
> > This patchset introduces a per-process utsname namespace. These can
> > be used by openvz, vserver, and application migration to virtualize and
> > isolate utsname info (i.e. hostname). More resources will follow, until
> > hopefully most or all vserver and openvz functionality can be implemented
> > by controlling resource namespaces from userspace.
>
> Generally, I think that the whole approach of virtualising the OS
> so it can run multiple independent instances of userspace is a good
> one. It's an extension and a strengthening of things which Linux is
> already doing and it pushes further along a path we've been taking
> for many years.

yes, I too think that Linux has been moving in that
direction for a long time now, maybe even too long, so
that other OSes (like BSD or Solaris) already managed
to get a virtualization layer up and running (although
maybe at a much simpler level than we plan to do)

> If done right, it's even possible that each of these featurettes could
> improve the kernel in its own right - better layering, separation,
> etc.

agreed, most 'features' will require a cleanup of some
otherwise completely untouched areas, which (hopefully)
will improve those areas ...

> The approach which you appear to be taking is to separate the bits
> of functionality apart and to present them as separate works each of
> which is reviewed-by, acceptable-to and will-be-used-by all of the
> interested projects. That's ideal, and is very much appreciated.

IMHO many things will make perfect sense on their own
even without the 'other' virtualizations or isolations.
With Linux-VServer it's an everyday occurrence that
folks just 'cherry pick' the isolation features and build
their own level of virtual/isolated environment.

at this point, many thanks to Sam, Eric and Serge
who do a really good job in massaging patches :)

> All of which begs the question "now what?".
>
> What we do _not_ want to do is to merge up a pile of infrastructural
> stuff which never gets used. On the other hand, we don't want to be in
> a position where nothing is merged into mainline until the entirety of
> vserver &&/|| openvz is ready to be merged.

yes, I agree here, and I'm pretty sure that we are
still missing many 'stakeholders' here just because
we do not see all possible areas of use ... let me
give a simple example here:

"pid virtualization"

- Linux-VServer doesn't really need that right now.
we are perfectly fine with "pid isolation" here, we
only "virtualize" the init pid to make pstree happy

- Snapshot/Restart and Migration will require "full"
pid virtualization (that's where Eric and OpenVZ
are heading towards)

- OpenSSI and *Mosix require system wide pid spaces
which probably could be implemented with virtual
pid spaces as well

- many security addons provide something called pid
randomization, and I think they could probably
benefit from a virtual pid space, too

now does that mean that e.g. Linux-VServer is against
"pid virtualization"? well, we are mainly against all
_unnecessary_ overhead and strictly against losing the
ability to keep it simple for the user, i.e. somebody
who does not require all that stuff should be able to
pick the features (or spaces) she really needs ...

> I see two ways of justifying a mainline merge of things such as this
>
> a) We make an up-front decision that Linux _will_ have
> OS-virtualisation capability in the future and just start putting
> in place the pieces for that, even if some of them are not
> immediately useful.

as long as this doesn't automatically mean bloat, I'm
more than happy with such a decision ...

> I suspect that'd be acceptable, although I worry that we'd get
> partway through and some issues would come up which are
> irreconcilable amongst the various groups.

I'm pretty sure that we _will_ hit some issues, but
I'm also sure that we will be able to work out those
issues, after all the 'end user' has to decide what
should be in mainline and what not ...

> It would help set minds at ease if someone could produce a
> bullet-point list of what features the kernel will need to get
> it to the stage where "most or all vserver and openvz functionality
> can be implemented by controlling resource namespaces from
> userspace." Then we can discuss that list, make sure that
> everyone's pretty much in agreement.

excellent idea, will start preparing such a list
from our PoV, so that we can merge that with the
other lists ...

> It would be good if that list were to identify which features are
> useful to Linux in their own right, and which ones only make
> sense within a whole virtualise-the-OS setup.

that's probably the hardest part ...

> b) Only merge into mainline those features which make sense in a
> standalone fashion. eg, we don't merge this patchset unless the
> "per-process utsname namespace" feature is useful to and usable
> by a sufficiently broad group of existing Linux users.

the question here is, who are the users and _how_
will they get the feature? I see the following cases
here, which might overlap ...

- the feature makes perfect sense in mainline as
standalone feature (maybe even adds no overhead
and/or simplifies/generalizes design) but has no
direct 'user' per se (think private namespaces
or linux capabilities)

- the feature is used by a number of projects in
very different ways to improve or even realize
certain 'other' features (think ext2/3 xattrs)

- the feature (although it adds pretty much overhead
and/or complicates the design) is really useful
for everyday use, and most folks who discover it
do not understand how they could live without it
(think various attributes on vfs mounts :)

> I suspect this will be a difficult approach.

yes, but should 'we' decide to take this approach, we
can at least guarantee that we (Linux-VServer) will
try to make use of those new features as soon as they
appear in mainline (as we did until now)

> The third way would be to buffer it all up in -mm until everything
> is sufficiently in place and then slam it all in.

for me, that sounds like a pretty bad idea, at least
for the first steps -- though we might consider this
approach for the last 10 or 20 percent, when we just
have to put the pieces together ...

> That might not be feasible for various reasons - please advise..
>
> A fourth way would be for someone over there to run a git tree - you
> all happily work away, I redistribute it in -mm for testing and one
> day it's all ready to merge. I don't really like this approach. It
> ends up meaning that nobody else reviews the new code, nobody else
> understands what it's doing, etc. It's generally subversive of the way
> we do things.

let me say that I'm strictly against such an approach
as it would be very similar to merging any of the
existing projects without further mainline consideration

> Eric, Kirill, Herbert: let us know your thoughts, please.

thanks for your work and time, we appreciate it

best,
Herbert

2006-05-19 13:30:47

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Quoting Eric W. Biederman ([email protected]):
> "Serge E. Hallyn" <[email protected]> writes:
>
> > This patchset introduces a per-process utsname namespace. These can
> > be used by openvz, vserver, and application migration to virtualize and
> > isolate utsname info (i.e. hostname). More resources will follow, until
> > hopefully most or all vserver and openvz functionality can be implemented
> > by controlling resource namespaces from userspace.
> >
> > Previous utsname submissions placed a pointer to the utsname namespace
> > straight in the task_struct. This patchset (and the last one) moves
> > it and the filesystem namespace pointer into struct nsproxy, which is
> > shared by processes sharing all namespaces. The intent is to keep
> > the taskstruct smaller as the number of namespaces grows.
>
>
> Previously you mentioned:
> > BTW - a first set of comparison results showed nsproxy to have better
> > dbench and tbench throughput, and worse kernbench performance. Which
> > may make sense given that nsproxy results in lower memory usage but
> > likely increased cache misses due to extra pointer dereference.
>
> Is this still true? Or did our final reference counting tweak fix
> the kernbench numbers?

I haven't checked that. I'll start a new set of runs later this
morning, should get the results out saturday.

-serge

2006-05-19 13:48:00

by Andrey Savochkin

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Hi Andrew,

What you are saying indeed makes a lot of sense.
We want to start merging virtualization code some way or another.
Yet, if we merge code step-by-step, we do not want a pile of unused
infrastructure for the beginning, which may happen to be not entirely useful
in the future, or even create obstacles for development.
And in the course of merging, we would like to overcome differences in
opinions of various groups, and live happily ever after.

I have a practical proposal.
We can start with presenting and merging the most interesting part, network
containers. We discuss details, possible approaches, and related subsystems,
until networking is finished to its utmost detail.
This will create an example of virtualization of a non-trivial subsystem,
and we will have to agree on basic principles of virtualization of related
subsystems like proc.

Virtualization of networking presents a lot of challenges and decision-making
points with respect to user-visible interfaces: proc, sysctl, netlink events
(and netlink sockets themselves), and so on. This code will also become
immediately useful as an improvement over chroot.
I am sure that when we come to a mutually acceptable solution with respect to
networking, virtualization of all other subsystems can be implemented and
merged without many questions.

What do people think about this plan?

Best regards,

Andrey

On Thu, May 18, 2006 at 10:34:30AM -0700, Andrew Morton wrote:
[snip]
>
> All of which begs the question "now what?".
>
> What we do _not_ want to do is to merge up a pile of infrastructural stuff
> which never gets used. On the other hand, we don't want to be in a
> position where nothing is merged into mainline until the entirety of
> vserver &&/|| openvs is ready to be merged.
>
> I see two ways of justifying a mainline merge of things such as this
>
> a) We make an up-front decision that Linux _will_ have OS-virtualisation
> capability in the future and just start putting in place the pieces for
> that, even if some of them are not immediately useful.
>
> I suspect that'd be acceptable, although I worry that we'd get
> partway through and some issues would come up which are irreconcilable
> amongst the various groups.
>
> It would help set minds at ease if someone could produce a
> bullet-point list of what features the kernel will need to get it to the
> stage where "most or all vserver and openvz functionality can be
> implemented by controlling resource namespaces from userspace." Then we
> can discuss that list, make sure that everyone's pretty much in
> agreement.
>
> It would be good if that list were to identify which features are
> useful to Linux in their own right, and which ones only make sense within
> a whole virtualise-the-OS setup.
>
> b) Only merge into mainline those feature which make sense in a
> standalone fashion. eg, we don't merge this patchset unless the
> "per-process utsname namespace" feature is useful to and usable by a
> sufficiently broad group of existing Linux users.
>
> I suspect this will be a difficult approach.
>
> The third way would be to buffer it all up in -mm until everything is
> sufficiently in place and then slam it all in. That might not be feasible
> for various reasons - please advise..
>
> A fourth way would be for someone over there to run a git tree - you all
> happily work away, I redistribute it in -mm for testing and one day it's
> all ready to merge. I don't really like this approach. It ends up meaning
> that nobody else reviews the new code, nobody else understands what it's
> doing, etc. It's generally subversive of the way we do things.
>
> Eric, Kirill, Herbert: let us know your thoughts, please.

2006-05-19 15:14:35

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Herbert Poetzl <[email protected]> wrote:
>
> let me
> give a simple example here:

Examples are useful.

> "pid virtualization"
>
> - Linux-VServer doesn't really need that right now.
> we are perfectly fine with "pid isolation" here, we
> only "virtualize" the init pid to make pstree happy
>
> - Snapshot/Restart and Migration will require "full"
> pid virtualization (that's where Eric and OpenVZ
> are heading towards)

snapshot/restart/migration worry me. If they require complete
serialisation of complex kernel data structures then we have a problem,
because it means that any time anyone changes such a structure they need to
update (and test) the serialisation.

This may be a show-stopper, in which case maybe we only need to virtualise
pid #1.

> - OpenSSI and *Mosix require system wide pid spaces
> which probably could be implemented with virtual
> pid spaces as well
>
> - many security addons provide something called pid
> randomization, and I think they could probably
> benefit from a virtual pid space, too

ok.

Anyway. Thanks, guys. It sounds like most of this work will be nicely
separable so we can think about each bit as it comes along.

2006-05-19 15:26:18

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Andrey Savochkin <[email protected]> wrote:
>
> I have a practical proposal.
> We can start with presenting and merging the most interesting part, network
> containers. We discuss details, possible approaches, and related subsystems,
> until networking is finished to its utmost detail.
> This will create an example of virtualization of a non-trivial subsystem,
> and we will have to agree on basic principles of virtualization of related
> subsystems like proc.
>
> Virtualization of networking presents a lot of challenges and decision-making
> points with respect to user-visible interfaces: proc, sysctl, netlink events
> (and netlink sockets themselves), and so on. This code will also become
> immediately useful as an improvement over chroot.
> I am sure that when we come to a mutually acceptable solution with respect to
> networking, virtualization of all other subsystems can be implemented and
> merged without many questions.
>
> What do people think about this plan?

It sounds like that feature might be the
most-likely-to-cause-maintainer-revolt one, in which case yes, it is
absolutely definitely the one to start with.

Because if it ends up that an acceptable approach cannot be found, and if
this feature is compulsory for any sane virtualisation implementation then
that's it - game over. We want to discover such blockers as early in the
process as possible.

2006-05-19 16:29:25

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Andrew Morton <[email protected]> writes:

> Herbert Poetzl <[email protected]> wrote:
>>
>> let me
>> give a simple example here:
>
> Examples are useful.
>
>> "pid virtualization"
>>
>> - Linux-VServer doesn't really need that right now.
>> we are perfectly fine with "pid isolation" here, we
>> only "virtualize" the init pid to make pstree happy
>>
>> - Snapshot/Restart and Migration will require "full"
>> pid virtualization (that's where Eric and OpenVZ
>> are heading towards)
>
> snapshot/restart/migration worry me. If they require complete
> serialisation of complex kernel data structures then we have a problem,
> because it means that any time anyone changes such a structure they need to
> update (and test) the serialisation.

There is a strict limit to what is user visible, and if it isn't user visible
we will never need it in a checkpoint. So internal implementation details
should not matter.

> This may be a show-stopper, in which case maybe we only need to virtualise
> pid #1.

Except we do need something for pid isolation, and a pid namespace is
quite possibly the light weight solution. If you can't see the pid it is
clearly isolated from you.

> Anyway. Thanks, guys. It sound like most of this work will be nicely
> separable so we can think about each bit as it comes along.

Yes, and there are enough issues it is significant.

Eric

2006-05-19 16:41:44

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

[email protected] (Eric W. Biederman) wrote:
>
> > Herbert Poetzl <[email protected]> wrote:
> >>
> >> let me
> >> give a simple example here:
> >
> > Examples are useful.
> >
> >> "pid virtualization"
> >>
> >> - Linux-VServer doesn't really need that right now.
> >> we are perfectly fine with "pid isolation" here, we
> >> only "virtualize" the init pid to make pstree happy
> >>
> >> - Snapshot/Restart and Migration will require "full"
> >> pid virtualization (that's where Eric and OpenVZ
> >> are heading towards)
> >
> > snapshot/restart/migration worry me. If they require complete
> > serialisation of complex kernel data structures then we have a problem,
> > because it means that any time anyone changes such a structure they need to
> > update (and test) the serialisation.
>
> There is a strict limit to what is user visible, and if it isn't user visible
> we will never need it in a checkpoint. So internal implementation details
> should not matter.

Migration of currently-open sockets (for example) would require storing of
a lot of state, wouldn't it?

2006-05-19 17:16:46

by Stephen Hemminger

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Fri, 19 May 2006 09:40:47 -0700
Andrew Morton <[email protected]> wrote:

> [email protected] (Eric W. Biederman) wrote:
> >
> > > Herbert Poetzl <[email protected]> wrote:
> > >>
> > >> let me
> > >> give a simple example here:
> > >
> > > Examples are useful.
> > >
> > >> "pid virtualization"
> > >>
> > >> - Linux-VServer doesn't really need that right now.
> > >> we are perfectly fine with "pid isolation" here, we
> > >> only "virtualize" the init pid to make pstree happy
> > >>
> > >> - Snapshot/Restart and Migration will require "full"
> > >> pid virtualization (that's where Eric and OpenVZ
> > >> are heading towards)
> > >
> > > snapshot/restart/migration worry me. If they require complete
> > > serialisation of complex kernel data structures then we have a problem,
> > > because it means that any time anyone changes such a structure they need to
> > > update (and test) the serialisation.
> >
> > There is a strict limit to what is user visible, and if it isn't user visible
> > we will never need it in a checkpoint. So internal implementation details
> > should not matter.
>
> Migration of currently-open sockets (for example) would require storing of
> a lot of state, wouldn't it?

Werner has demonstrated TCP connection passing
http://tcpcp.sourceforge.net/

2006-05-19 17:19:51

by Al Boldi

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Andrey Savochkin wrote:
> I have a practical proposal.
> We can start with presenting and merging the most interesting part,
> network containers. We discuss details, possible approaches, and related
> subsystems, until networking is finished to its utmost detail.
> This will create an example of virtualization of a non-trivial subsystem,
> and we will have to agree on basic principles of virtualization of related
> subsystems like proc.
>
> Virtualization of networking presents a lot of challenges and
> decision-making points with respect to user-visible interfaces: proc,
> sysctl, netlink events (and netlink sockets themselves), and so on. This
> code will also become immediately useful as an improvement over chroot.
> I am sure that when we come to a mutually acceptable solution with respect
> to networking, virtualization of all other subsystems can be implemented
> and merged without many questions.
>
> What do people think about this plan?

Exactly what I thought too, and in general always the best way to move
forward, i.e: "slowly but surely" instead of "big bang".

This would of course imply, that even this subsystem should be kept as
minimalistic as possible, to avoid any side-effects and to just concentrate
on the crux of the problem.

Thanks!

--
Al

2006-05-19 17:37:23

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

On Fri, 19 May 2006 03:05:23 -0600 Eric W. Biederman wrote:

> "Randy.Dunlap" <[email protected]> writes:
>
> > On Thu, 18 May 2006 10:49:36 -0500 Serge E. Hallyn wrote:
> >
> >> Replace references to system_utsname to the per-process uts namespace
> >> where appropriate. This includes things like uname.
> >>
> >> Changes: Per Eric Biederman's comments, use the per-process uts namespace
> >> for ELF_PLATFORM, sunrpc, and parts of net/ipv4/ipconfig.c
> >>
> >> Signed-off-by: Serge E. Hallyn <[email protected]>
>
> >
> > OK, here's my big comment/question. I want to see <nodename> increased to
> > 256 bytes (per current POSIX), so each field of struct <variant>_utsname
> > needs be copied individually (I think) instead of doing a single
> > struct copy.
>
> Where is it specified? Looking at the spec as SUSV3 I don't see a size
> specified for nodename.

It's actually for hostname. It looks to me like they are used
interchangeably. yes/no?

gethostname:
http://www.opengroup.org/onlinepubs/009695399/functions/gethostname.html
sysconf:
http://www.opengroup.org/onlinepubs/009695399/functions/sysconf.html
unistd.h:
http://www.opengroup.org/onlinepubs/009695399/basedefs/unistd.h.html
limits.h:
http://www.opengroup.org/onlinepubs/009695399/basedefs/limits.h.html

From the latter:
{HOST_NAME_MAX}
Maximum length of a host name (not including the terminating null) as returned from the gethostname() function.
Minimum Acceptable Value: {_POSIX_HOST_NAME_MAX}
(and)
{_POSIX_HOST_NAME_MAX}
Maximum length of a host name (not including the terminating null) as returned from the gethostname() function.
Value: 255



> > I've been working on this for the past few weeks (among other
> > things). Sorry about the timing.
> > I could send patches for this against mainline in a few days,
> > but I'll be glad to listen to how it would be easiest for all of us
> > to handle.
> >
> > I'm probably a little over half done with my patches.
> > They will end up adding a lib/utsname.c that has functions for:
> > put_oldold_uname() // to user
> > put_old_uname() // to user
> > put_new_uname() // to user
> > put_posix_uname() // to user
>
> Sounds reasonable, if we really need a 256 byte nodename.
>
> As long as they take a pointer to the appropriate utsname
> structure these patches should not fundamentally conflict.


---
~Randy

2006-05-19 17:52:49

by Jeff Dike

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Fri, May 19, 2006 at 05:41:45AM -0600, Eric W. Biederman wrote:
> I think I see a third way of justifying a mainline merge. We make an
> up-front decision that we will improve the existing chroot jail
> functionality in Linux and start making improvements. Even if some of
> the improvements are quite small.

FWIW, UML can use this stuff incrementally. So, from my point of view,
it's not an all-or-nothing thing.

Jeff

2006-05-19 18:28:12

by Hua Zhong

[permalink] [raw]
Subject: RE: [PATCH 0/9] namespaces: Introduction

> snapshot/restart/migration worry me. If they require
> complete serialisation of complex kernel data structures then
> we have a problem, because it means that any time anyone
> changes such a structure they need to update (and test) the
> serialisation.

Checkpoint/Restart/Migration could be very complicated if done at OS level (per process/process group/or any subset of an OS). But
it is much simpler if done on virtual machine level (VMWare/Xen) because there is a natural and clear boundary, and doesn't get
affected if the OS kernel internal changes.

It's good to see some progress in supporting virtualization in Linux, but as Andrew put it, some big decisions need to be made
up-front. One big question is actually how many virtualization technologies Linux should support? Particularly, does it need to
support both OS-level virtualization and VM-level virtualization? And why? And to what degree?

My gut feeling is that the VM approach seems much cleaner and modular, without touching too many areas (except some low-level ones)
inside the kernel and in general very well separated. There are two reasons:

1. In a VM system, the architecture is very simple. Hypervisor and guest OS kernel have clear boundaries and interfaces to
communicate, and OS in general pretty much doesn't need to care if it's running on native hardware or inside a VM. So it adds very
little maintenance burden to the kernel developers (and if there is, it's relatively well understood).

2. Hardware support. With more virtualization built into CPU, VM is only going to get simpler.

It seems at least the VM approach is much less risky. It might be helpful if someone could explain why we need both.

Hua

2006-05-19 19:38:19

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Quoting Hua Zhong ([email protected]):
> > snapshot/restart/migration worry me. If they require
> > complete serialisation of complex kernel data structures then
> > we have a problem, because it means that any time anyone
> > changes such a structure they need to update (and test) the
> > serialisation.
>
> Checkpoint/Restart/Migration could be very complicated if done at OS level (per process/process group/or any subset of an OS). But
> it is much simpler if done on virtual machine level (VMWare/Xen) because there is a natural and clear boundary, and doesn't get
> affected if the OS kernel internal changes.
>
> It's good to see some progress in supporting virtualization in Linux, but as Andrew put it, some big decisions need to be made
> up-front. One big question is actually how many virtualization technologies Linux should support? Particularly, does it need to
> support both OS-level virtualization and VM-level virtualization? And why? And to what degree?

Because migration can be used for more than one purpose. One such
purpose is load-balancing large numbers of jobs. If you have large
numbers of jobs, you do not want the resource overhead of a full OS for
each migrateable job.

The reason it is deemed simpler at the vm level, as you point out, is
that resources are naturally isolated. The same work which will prepare
the kernel for vserver/openvz functionality will isolate kernel resources
between vserver/containers, making c/r and migration at that level
much simpler.

thanks,
-serge

2006-05-19 19:46:54

by John Kelly

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Fri, 19 May 2006 11:28:08 -0700, "Hua Zhong" <[email protected]>
wrote:

> how many virtualization technologies Linux should support?

> Particularly, does it need to support both OS-level virtualization

If users want it. I do.


> It seems at least the VM approach is much less risky. It might be helpful
> if someone could explain why we need both.

A better question is, why can't we have both?

I don't have unlimited memory and disk. I need to conserve my
resources as much as possible.

The one-kernel approach saves memory, leaving more for applications.
That's important to me. I don't need to run multiple kernels, and I
don't want to. I only want multiple secure operating environments.

The one-kernel approach also makes it easy to have all VPS in one disk
partition, without the performance penalty of file backed I/O.

If the VM approach is truly less risky, seems to me the Xen/VMware
developers should be able to succeed independently, despite changes
made for in-kernel virtualization.

I'm glad someone asked a question I could answer. :-)


2006-05-19 20:06:24

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Fri, 2006-05-19 at 08:13 -0700, Andrew Morton wrote:
> snapshot/restart/migration worry me. If they require complete
> serialisation of complex kernel data structures then we have a problem,
> because it means that any time anyone changes such a structure they need to
> update (and test) the serialisation.

The idea of actually serializing kernel data structures keeps me up at
night. This is especially true when we already have some method of
exporting these structures to userspace.

Take VMAs, for example. Should we have a set of interfaces for saving
and restoring VMAs, or should we just make any program which is doing
checkpoint/restart use /proc/<pid>/maps on checkpoint and mmap() on
restore?

It, of course, isn't that simple. Any interface focused on VMAs inside
the kernel will have the serialization issues you describe. I think
this is such an approach:

http://git.openvz.org/?p=linux-2.6-openvz;a=blob;f=kernel/cpt/cpt_mm.c
http://git.openvz.org/?p=linux-2.6-openvz;a=blob;f=kernel/cpt/rst_mm.c

However, the proc-maps/mmap approach would require new interfaces to be
implemented. There are plenty of attributes not currently readily
visible to userspace like VM_NONLINEAR, or resources which are normally
inaccessible to userspace like deleted files. Those would need extended
user/kernel interfaces.

I know of at least one in-kernel commercial checkpoint/restart product
which was relatively well tested with "a certain large DB that uses
remap_file_pages()" that never even noticed that it completely missed
VM_NONLINEAR support until vm-savvy people saw the code. Scary.

-- Dave

2006-05-19 20:18:41

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Fri, 2006-05-19 at 09:40 -0700, Andrew Morton wrote:
> Migration of currently-open sockets (for example) would require storing of
> a lot of state, wouldn't it?

In a word, yes. :)

I don't think the networking guys from either the OpenVZ project or IBM
were cc'd on this. Alexey, Daniel, can you elaborate, or point us to
any existing code?

-- Dave

2006-05-19 20:24:35

by John Kelly

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Fri, 19 May 2006 15:45:43 -0400, John Kelly <[email protected]>
wrote:

>If the VM approach is truly less risky, seems to me the Xen/VMware
>developers should be able to succeed independently, despite changes
>made for in-kernel virtualization.

OTOH, maybe the Xen and VMware developers will kill each other off,
and then we won't have to worry about Xen, or VMware.

;-)


2006-05-19 20:53:37

by Alexey Kuznetsov

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Hello!

> > Migration of currently-open sockets (for example) would require storing of
> > a lot of state, wouldn't it?
>
> In a word, yes. :)

Yes. But, actually, it is not "for example". Socket state is really a far more
complicated thing than all the rest. I would say, migration of other
objects is a mostly trivial thing.

Actually, what Andrew worried about:

> snapshot/restart/migration worry me. If they require complete
> serialisation of complex kernel data structures then we have a problem,
> because it means that any time anyone changes such a structure they need to
> update (and test) the serialisation.

The answer is: after user space processes referring to objects are suspended,
_surprisingly_, not many places that have trouble with serialization
remain. Actually, no serialization additional to the existing one is required.
Sockets are the most complicated, to suspend networking state, after
processes are frozen, we have to:

1. Block access from network.
2. Stop socket timers.

Only after this we can make a coherent snapshot. But it is an exception,
most of objects are in coherent state (all the VM, files etc. etc),
when processes are frozen.


> I don't think the networking guys from either the OpenVZ project or IBM
> were cc'd on this. Alexey, Daniel, can you elaborate, or point us to
> any existing code?

http://git.openvz.org

linux-2.6-openvz/kernel/cpt/. Particularly, kernel/cpt/cpt_socket*.c.
Hairy, but straightforward.

Alexey

2006-05-20 00:59:20

by Sam Vilain

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Fri, 2006-05-19 at 17:47 +0400, Andrey Savochkin wrote:
> We can start with presenting and merging the most interesting part, network
> containers. We discuss details, possible approaches, and related subsystems,
> until networking is finished to its utmost detail.
> This will create an example of virtualization of a non-trivial subsystem,
> and we will have to agree on basic principles of virtualization of related
> subsystems like proc.
[...]
> What do people think about this plan?

Network is an interesting one because you have multiple solutions - the
very simple approach of network binding (as used by Jacques Gelinas's
original IP vhost work from December 1997), and network virtualisation.
That virtualisation itself can be broken down into providing merely
virtual interfaces (so that, for instance, you can have independent lo
interfaces in the virtual servers) as in Vserver 2.1.x, or providing a
completely virtualised network stack, as in Vserver ngnet (and possibly
OpenVZ?).

Each solution performs the virtualisation at a different level, and has
incrementally higher orders of inefficiency and maintenance
requirements. Yet none of them are essentially better or worse than the
others.

So, we might end up with all three eventually - but binding alone is the
simplest and still extremely useful.

Sam.

2006-05-20 00:59:21

by Sam Vilain

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Fri, 2006-05-19 at 05:41 -0600, Eric W. Biederman wrote:
> > It would help set minds at ease if someone could produce a
> > bullet-point list of what features the kernel will need to get it to the
> > stage where "most or all vserver and openvz functionality can be
> > implemented by controlling resource namespaces from userspace." Then we
> > can discuss that list, make sure that everyone's pretty much in
> > agreement.
> So this is slightly the wrong question. If you look at Sam's list you

Yes - the wrong question because it's too top down and encourages
hacks :) It's wrong for the purposes of planning an implementation, but
ok for easing minds about what will be covered, I think.

> will see that there are several independent dimensions to the complete
> solution. Most of them dealing with the increase in the number of users
> and the amount of work that is happening on a single kernel in this
> context.
>
> Basically we need to expect a lot of kernel tuning after we get the
> basics working.
>
> The proper question is: What needs to happen before we can run separate
> user space instances?

My guess would be most of the points under "isolation". The others are
really just fine tuning / resource partitioning and fixing various
things that break under virtualisation because of their design (eg,
quota).

Sam.

2006-05-20 21:24:57

by Herbert Poetzl

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Fri, May 19, 2006 at 08:25:16AM -0700, Andrew Morton wrote:
> Andrey Savochkin <[email protected]> wrote:
>>
>> I have a practical proposal. We can start with presenting and
>> merging the most interesting part, network containers. We discuss
>> details, possible approaches, and related subsystems, until
>> networking is finished to its utmost detail. This will create an
>> example of virtualization of a non-trivial subsystem, and we will
>> have to agree on basic principles of virtualization of related
>> subsystems like proc.
>>
>> Virtualization of networking presents a lot of challenges and
>> decision-making points with respect to user-visible interfaces:
>> proc, sysctl, netlink events (and netlink sockets themselves),
>> and so on. This code will also become immediately useful as an
>> improvement over chroot. I am sure that when we come to a mutually
>> acceptable solution with respect to networking, virtualization of
>> all other subsystems can be implemented and merged without many
>> questions.
>>
>> What do people think about this plan?

well, I think it is interesting ...

> It sounds like that feature might be the
> most-likely-to-cause-maintainer-revolt one, in which case yes,
> it is absolutely definitely the one to start with.

yes, I absolutely agree here, this will be one
of the tougher nuts to crack, and therefore it
might be an excellent candidate to prove that
the different virtualization camps can find an
acceptable solution .. together.

> Because if it ends up that an acceptable approach cannot be found,
> and if this feature is compulsory for any sane virtualisation
> implementation then that's it - game over.

this, OTOH is something I'm not convinced of,
because looking at BSD jails, I see a very simple
approach (only one IP, limiting binds) which seems
to be sufficient for all the BSD jails out there

this is probably something which does not meet the
requirements of fully blown distro virtualizations
but actually it might be more than sufficient for
'mainline' linux jails

> We want to discover such blockers as early in the process as
> possible.

yes, I would also appreciate if we could get some
support from the network folks, as I think, most
of them are already working into that direction
(think Van Jacobson's net channels, routing tables)

especially as the network virtualization brings up
a number of questions, which are not easily answered
like the following:

- what policy will be applied inside guests?
+ allow arbitrary packets/rules/routes
+ have some generic limits/basic rules
+ put policy into userspace

- how to 'connect' the virtual interfaces to
the real network?
+ via routing and bridging?
(means duplicate stack traversal and
therefore twice the overhead)
+ via split personality interfaces?
(less overhead, more complicated cases)
+ directly (only by isolation)

- at what level should the virtualization happen?
+ ethernet level (all protocols)
+ ip level (all ip based and control protocols)
+ udp/tcp level

best,
Herbert

2006-05-21 00:50:20

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Dave Hansen <[email protected]> writes:

> On Fri, 2006-05-19 at 08:13 -0700, Andrew Morton wrote:
>> snapshot/restart/migration worry me. If they require complete
>> serialisation of complex kernel data structures then we have a problem,
>> because it means that any time anyone changes such a structure they need to
>> update (and test) the serialisation.
>
> The idea of actually serializing kernel data structures keeps me up at
> night. This is especially true when we already have some method of
> exporting these structures to userspace.

Serialization of kernel data structures is a thorny issue, but we are
far enough away from it that we don't need to tackle it just yet.
I do consider it a failure to export/import things properly if you
need to use the same kernel version.

For the short term all that is interesting from a checkpoint/restart/migration
perspective is that you can have multiple instances of global identifiers,
pids, sysvipc ids, etc.

> However, the proc-maps/mmap approach would require new interfaces to be
> implemented. There are plenty of attributes not currently readily
> visible to userspace like VM_NONLINEAR, or resources which are normally
> inaccessible to userspace like deleted files. Those would need extended
> user/kernel interfaces.

Deleted files are accessible through /proc/<pid>/fd.

Eric

2006-05-21 16:28:04

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Quoting Eric W. Biederman ([email protected]):
> "Serge E. Hallyn" <[email protected]> writes:
>
> > This patchset introduces a per-process utsname namespace. These can
> > be used by openvz, vserver, and application migration to virtualize and
> > isolate utsname info (i.e. hostname). More resources will follow, until
> > hopefully most or all vserver and openvz functionality can be implemented
> > by controlling resource namespaces from userspace.
> >
> > Previous utsname submissions placed a pointer to the utsname namespace
> > straight in the task_struct. This patchset (and the last one) moves
> > it and the filesystem namespace pointer into struct nsproxy, which is
> > shared by processes sharing all namespaces. The intent is to keep
> > the taskstruct smaller as the number of namespaces grows.
>
>
> Previously you mentioned:
> > BTW - a first set of comparison results showed nsproxy to have better
> > dbench and tbench throughput, and worse kernbench performance. Which
> > may make sense given that nsproxy results in lower memory usage but
> > likely increased cache misses due to extra pointer dereference.
>
> Is this still true? Or did our final reference counting tweak fix
> the kernbench numbers?
>
> I just want to be certain that we don't add an optimization,
> that reduces performance.

Here are the numbers with the basic patchsets. But I guess I should
do another round with adding 7 more void*'s to represent additional
namespaces.

(intervals are for 95% CI, tests were each run 15 times)

| with nsproxy | without nsproxy |
kernbench | 68.90 +/- 0.21 | 69.06 +/- 0.22 |
dbench | 386.0 +/- 26.6 | 388.4 +/- 21.0 |
tbench | 391.6 +/- 8.00 | 389.4 +/- 10.95 |

reaim with nsproxy
1 115600.000000 5512.441557
3 246985.712000 9375.780582
5 272309.092000 8029.833742
7 290020.000000 7288.367116
9 298591.580000 5557.531915
11 nan nan
13 nan nan
15 nan nan

reaim without nsproxy
1 110160.000000 5728.697311
3 246985.712000 9375.780582
5 262204.197333 11138.510652
7 288660.000000 6880.898412
9 300631.580000 4351.926692
11 nan nan
13 nan nan
15 nan nan

2006-05-21 18:11:10

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

"Serge E. Hallyn" <[email protected]> writes:

>
> Here are the numbers with the basic patchsets. But I guess I should
> do another round with adding 7 more void*'s to represent additional
> namespaces.

I'm a little slow coming up to speed on these benchmarks.
dbench and tbench are measured in megabytes per second correct?
kernbench is the number of seconds it takes to compile a kernel?
reaim is measured in jobs per minute?

So if I read this right the differences are currently in
the noise levels, from your testing.

> (intervals are for 95% CI, tests were each run 15 times)
>
> | with nsproxy | without nsproxy |
> kernbench | 68.90 +/- 0.21 | 69.06 +/- 0.22 |
> dbench | 386.0 +/- 26.6 | 388.4 +/- 21.0 |
> tbench | 391.6 +/- 8.00 | 389.4 +/- 10.95 |
>
> reaim with nsproxy
> 1 115600.000000 5512.441557
> 3 246985.712000 9375.780582
> 5 272309.092000 8029.833742
> 7 290020.000000 7288.367116
> 9 298591.580000 5557.531915
> 11 nan nan
> 13 nan nan
> 15 nan nan
>
> reaim without nsproxy
> 1 110160.000000 5728.697311
> 3 246985.712000 9375.780582
> 5 262204.197333 11138.510652
> 7 288660.000000 6880.898412
> 9 300631.580000 4351.926692
> 11 nan nan
> 13 nan nan
> 15 nan nan

2006-05-21 22:58:22

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Pá 19-05-06 08:13:34, Andrew Morton wrote:
> Herbert Poetzl <[email protected]> wrote:
> >
> > let me
> > give a simple example here:
>
> Examples are useful.
>
> > "pid virtualization"
> >
> > - Linux-VServer doesn't really need that right now.
> > we are perfectly fine with "pid isolation" here, we
> > only "virtualize" the init pid to make pstree happy
> >
> > - Snapshot/Restart and Migration will require "full"
> > pid virtualization (that's where Eric and OpenVZ
> > are heading towards)
>
> snapshot/restart/migration worry me. If they require complete
> serialisation of complex kernel data structures then we have a problem,
> because it means that any time anyone changes such a structure they need to
> update (and test) the serialisation.
>
> This may be a show-stopper, in which case maybe we only need to virtualise
> pid #1.

Well, if pid #1 virtualization is only needed for pstree, we may want
to fix pstree instead :-).
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

2006-05-21 23:20:47

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Pavel Machek <[email protected]> writes:

> Well, if pid #1 virtualization is only needed for pstree, we may want
> to fix pstree instead :-).

One thing that is not clear is if isolation by permission checks is any
easier to implement than isolation with a namespace.

Isolation at permission checks may actually be more expensive in terms
of execution time, and maintenance.

Eric

2006-05-21 23:30:23

by Sam Vilain

[permalink] [raw]
Subject: Re: [PATCH 1/9] namespaces: add nsproxy

Serge E. Hallyn wrote:

>@@ -1585,7 +1591,15 @@ asmlinkage long sys_unshare(unsigned lon
>
> if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
>
>+ old_nsproxy = current->nsproxy;
>+ new_nsproxy = dup_namespaces(old_nsproxy);
>+ if (!new_nsproxy) {
>+ err = -ENOMEM;
>+ goto bad_unshare_cleanup_semundo;
>+ }
>+
> task_lock(current);
>
>

We'll get lots of duplicate nsproxy structures before we move all of the
pointers for those subsystems into it. Do we need to dup namespaces on
all of those conditions?

Sam.

2006-05-21 23:32:17

by Herbert Poetzl

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

On Sun, May 21, 2006 at 05:18:50PM -0600, Eric W. Biederman wrote:
> Pavel Machek <[email protected]> writes:
>
> > Well, if pid #1 virtualization is only needed for pstree, we may want
> > to fix pstree instead :-).

yes, actually this and init itself (which uses the
pid to switch between init and telinit behaviour)
are the only two applications we found so far ...

and as far as I know, those work with non pid=1
values on other operating systems (inside containers)

a fix there would definitely be appreciated and
I think it would not hurt normal behaviour ...

> One thing that is not clear is if isolation by permission checks is
> any easier to implement than isolation with a namespace.

for the pid space, I'm not really sure if isolation
is really cheaper than virtualization, but for the
network space for example, a virtualization solution
which is as lightweight as the isolation is probably
more challenging, although not impossible ...

> Isolation at permission checks may actually be more expensive in terms
> of execution time, and maintenance.

again, for the pid space, maintenance is quite low ..

best,
Herbert

> Eric

2006-05-21 23:40:09

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 1/9] namespaces: add nsproxy

Sam Vilain <[email protected]> writes:

> Serge E. Hallyn wrote:
>
>>@@ -1585,7 +1591,15 @@ asmlinkage long sys_unshare(unsigned lon
>>
>> if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
>>
>>+ old_nsproxy = current->nsproxy;
>>+ new_nsproxy = dup_namespaces(old_nsproxy);
>>+ if (!new_nsproxy) {
>>+ err = -ENOMEM;
>>+ goto bad_unshare_cleanup_semundo;
>>+ }
>>+
>> task_lock(current);
>>
>>
>
> We'll get lots of duplicate nsproxy structures before we move all of the
> pointers for those subsystems into it. Do we need to dup namespaces on
> all of those conditions?

Ugh. Good catch. The new nsproxy needs to be just for the fs and the uts
namespace.

I guess that means that test should be moved up a few lines.

Eric

2006-05-22 00:19:37

by Sam Vilain

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

Serge E. Hallyn wrote:

>--- a/arch/alpha/kernel/osf_sys.c
>+++ b/arch/alpha/kernel/osf_sys.c
>@@ -402,15 +402,15 @@ osf_utsname(char __user *name)
>
> down_read(&uts_sem);
> error = -EFAULT;
>- if (copy_to_user(name + 0, system_utsname.sysname, 32))
>+ if (copy_to_user(name + 0, utsname()->sysname, 32))
> goto out;
>diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c
>index 8fdb1fb..4af731d 100644
>--- a/arch/i386/kernel/sys_i386.c
>+++ b/arch/i386/kernel/sys_i386.c
>@@ -210,7 +210,7 @@ asmlinkage int sys_uname(struct old_utsn
> if (!name)
> return -EFAULT;
> down_read(&uts_sem);
>- err=copy_to_user(name, &system_utsname, sizeof (*name));
>+ err=copy_to_user(name, utsname(), sizeof (*name));
> up_read(&uts_sem);
> return err?-EFAULT:0;
> }
>
>

The semaphore (uts_sem) should be moved in the uts_ns structure, no?

It's probably low impact enough to keep it as it is, though. Just a tad
untidy.

Sam.

2006-05-22 12:10:28

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Quoting Eric W. Biederman ([email protected]):
> "Serge E. Hallyn" <[email protected]> writes:
>
> >
> > Here are the numbers with the basic patchsets. But I guess I should
> > do another round with adding 7 more void*'s to represent additional
> > namespaces.
>
> I'm a little slow coming up to speed on these benchmarks.
> dbench and tbench are measured in megabytes per second correct?
> kernbench is the number of seconds it takes to compile a kernel?
> reaim is measured in jobs per minute?
>
> So if I read this right the differences are currently in
> the noise levels, from your testing.

Yup.

Adding 7 extra void*'s seems to affect only dbench,
whose degradation with the nsproxy falls outside the noise.
The odd thing isn't so much the degradation, but the widely
scattered values, compared to without nsproxy.

| with nsproxy | without nsproxy |
kernbench | 70.23 +/- 0.27 | 70.04 +/- 0.22 |
dbench | 367.1 +/- 32.6 | 410.0 +/- 2.96 |
tbench | 399.3 +/- 12.4 | 399.4 +/- 12.5 |


reaim with nsproxy
1 115600.000000 5512.441557
3 243099.998000 10876.225044
5 270002.798667 11800.545221
7 283291.578667 10897.147984
9 294530.528000 7095.760045
11 nan nan
13 nan nan
15 nan nan

reaim without nsproxy
1 114240.000000 5728.697311
3 254271.426000 11767.994417
5 279965.036000 12615.448140
7 281660.000000 9898.028733
9 302905.264000 5165.026561
11 nan nan
13 nan nan
15 nan nan

2006-05-22 12:39:24

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 1/9] namespaces: add nsproxy

Quoting Eric W. Biederman ([email protected]):
> Sam Vilain <[email protected]> writes:
>
> > Serge E. Hallyn wrote:
> >
> >>@@ -1585,7 +1591,15 @@ asmlinkage long sys_unshare(unsigned lon
> >>
> >> if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
> >>
> >>+ old_nsproxy = current->nsproxy;
> >>+ new_nsproxy = dup_namespaces(old_nsproxy);
> >>+ if (!new_nsproxy) {
> >>+ err = -ENOMEM;
> >>+ goto bad_unshare_cleanup_semundo;
> >>+ }
> >>+
> >> task_lock(current);
> >>
> >>
> >
> > We'll get lots of duplicate nsproxy structures before we move all of the
> > pointers for those subsystems into it. Do we need to dup namespaces on
> > all of those conditions?
>
> Ugh. Good catch. The new nsproxy needs to be just for the fs and the uts
> namespace.
>
> I guess that means that test should be moved up a few lines.

Oh. Yeah. It didn't look odd to me bc it's about the number of
namespaces we are *going* to have :)

Fix follows:

Subject: [PATCH] uts: copy nsproxy only when needed.
From: Serge Hallyn <[email protected]>

The nsproxy was being copied in unshare() when anything was being
unshared, even if it was something not referenced from nsproxy.
This should end up in some cases with far more memory usage than
necessary.

Signed-off-by: Serge Hallyn <[email protected]>

---

kernel/fork.c | 20 ++++++++++++++------
1 files changed, 14 insertions(+), 6 deletions(-)

74d1068458c62302ac8ed38e38a57b692580662f
diff --git a/kernel/fork.c b/kernel/fork.c
index cdc549e..9278a68 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1559,7 +1559,7 @@ asmlinkage long sys_unshare(unsigned lon
struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
- struct nsproxy *new_nsproxy, *old_nsproxy;
+ struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
struct uts_namespace *uts, *new_uts = NULL;

check_unshare_flags(&unshare_flags);
@@ -1587,18 +1587,24 @@ asmlinkage long sys_unshare(unsigned lon
if ((err = unshare_utsname(unshare_flags, &new_uts)))
goto bad_unshare_cleanup_semundo;

- if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
- new_uts) {
-
+ if (new_ns || new_uts) {
old_nsproxy = current->nsproxy;
new_nsproxy = dup_namespaces(old_nsproxy);
if (!new_nsproxy) {
err = -ENOMEM;
goto bad_unshare_cleanup_uts;
}
+ }
+
+ if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+ new_uts) {

task_lock(current);
- current->nsproxy = new_nsproxy;
+
+ if (new_nsproxy) {
+ current->nsproxy = new_nsproxy;
+ new_nsproxy = old_nsproxy;
+ }

if (new_fs) {
fs = current->fs;
@@ -1640,9 +1646,11 @@ asmlinkage long sys_unshare(unsigned lon
}

task_unlock(current);
- put_nsproxy(old_nsproxy);
}

+ if (new_nsproxy)
+ put_nsproxy(new_nsproxy);
+
bad_unshare_cleanup_uts:
if (new_uts)
put_uts_ns(new_uts);
--
1.1.6

2006-05-22 16:46:34

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

"Serge E. Hallyn" <[email protected]> writes:

> Quoting Eric W. Biederman ([email protected]):

> Yup.
>
> Adding 7 extra void*'s seems to affect only dbench,
> whose degradation with the nsproxy falls outside the noise.
> The odd thing isn't so much the degradation, but the widely
> scattered values, compared to without nsproxy.

Well dbench is filesystem heavy so it may have some need to
actually be using the filesystem namespace. Although I can't
think of why it would. Do you want to track down that performance
regression or do you want to give up on that space optimization for
now?

Eric

2006-05-22 16:56:08

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction


Yep. I bungled my description pretty badly.

The key points.
- Simply messing with pid == 1 is not enough, you need to filter
which pids are accessible.
- pid isolation by permission checks and pid isolation via
pid visibility are competing implementations.
- pid isolation by permission checks (except for the pid == 1 case)
can currently be implemented with a security module.

Eric

2006-05-22 17:26:22

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/9] namespaces: Introduction

Andrew Morton <[email protected]> writes:

> Andrey Savochkin <[email protected]> wrote:
>>
>> I have a practical proposal.
>> We can start with presenting and merging the most interesting part, network
>> containers. We discuss details, possible approaches, and related subsystems,
>> until networking is finished to its utmost detail.
>> This will create an example of virtualization of a non-trivial subsystem,
>> and we will have to agree on basic principles of virtualization of related
>> subsystems like proc.
>>
>> Virtualization of networking presents a lot of challenges and decision-making
>> points with respect to user-visible interfaces: proc, sysctl, netlink events
>> (and netlink sockets themselves), and so on. This code will also become
>> immediately useful as an improvement over chroot.
>> I am sure that when we come to a mutually acceptable solution with respect to
>> networking, virtualization of all other subsystems can be implemented and
>> merged without many questions.
>>
>> What do people think about this plan?
>
> It sounds like that feature might be the
> most-likely-to-cause-maintainer-revolt one, in which case yes, it is
> absolutely definitely the one to start with.

It sounds like a case of: That first step is a doozy.
We should be able to resolve proc and sysctl issues with just
the uts namespace. So I don't think we necessarily have to take
everything at once.

Beyond that the real sticky issue and what leads to most of the peculiar
cases is the one thing not addressed by doing the network namespace.
How do we keep someone inside a namespace from accessing files in /proc
and other places that they should not be able to access.

It occurred to me that most of the permission checking issues trivially
go away if you make the permission checks test for equality of
the tuple (uid namespace, uid). At which point a lot of the reasons
people have previously put forth for completely reorganizing proc and
sysfs go away, because users not in their uid namespace will only be
able to access world readable and world writable files. Anything else
will simply be inaccessible.

So I think we need to have a serious look at the uid/gid namespace.

This is a bit of a pain because this brings us face to face with the
uid mapping problem we have avoided for years on network filesystems,
and makes it a problem on local filesystems as well.

Getting both the uid/gid namespace and the network namespace should
get the bulk of the infrastructure problems solved.

I am even happy to do the network namespace first on the understanding
that permission checking problems caused by different users with the
same uid should be ignored until we have handled the uid/gid
namespace.

Eric

2006-05-22 19:44:01

by Cédric Le Goater

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

Randy.Dunlap wrote:
> On Thu, 18 May 2006 10:49:36 -0500 Serge E. Hallyn wrote:
>
>> Replace references to system_utsname to the per-process uts namespace
>> where appropriate. This includes things like uname.
>>
>> Changes: Per Eric Biederman's comments, use the per-process uts namespace
>> for ELF_PLATFORM, sunrpc, and parts of net/ipv4/ipconfig.c
>>
>> Signed-off-by: Serge E. Hallyn <[email protected]>
>>
>> ---
>>
>> 9ee063adf4d2287583dbb0a71d1d5f80d7ae011f
>> diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c
>> index 8fdb1fb..4af731d 100644
>> --- a/arch/i386/kernel/sys_i386.c
>> +++ b/arch/i386/kernel/sys_i386.c
>> @@ -210,7 +210,7 @@ asmlinkage int sys_uname(struct old_utsn
>> if (!name)
>> return -EFAULT;
>> down_read(&uts_sem);
>> - err=copy_to_user(name, &system_utsname, sizeof (*name));
>> + err=copy_to_user(name, utsname(), sizeof (*name));
>
> It would be really nice if you would fix spacing while you are here,
> like a space on each side of '='.
>
> and a space after ',' in the function calls below.

Here's a possible cleanup on top of serge's patchset as found in
2.6.17-rc4-mm3.

C.




Attachments:
fix-spacing.patch (13.72 kB)

2006-05-22 20:16:36

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH 4/9] namespaces: utsname: switch to using uts namespaces

On Mon, 22 May 2006 21:43:37 +0200 Cedric Le Goater wrote:

> Randy.Dunlap wrote:
> >>
> >> 9ee063adf4d2287583dbb0a71d1d5f80d7ae011f
> >> diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c
> >> index 8fdb1fb..4af731d 100644
> >> --- a/arch/i386/kernel/sys_i386.c
> >> +++ b/arch/i386/kernel/sys_i386.c
> >> @@ -210,7 +210,7 @@ asmlinkage int sys_uname(struct old_utsn
> >> if (!name)
> >> return -EFAULT;
> >> down_read(&uts_sem);
> >> - err=copy_to_user(name, &system_utsname, sizeof (*name));
> >> + err=copy_to_user(name, utsname(), sizeof (*name));
> >
> > It would be really nice if you would fix spacing while you are here,
> > like a space on each side of '='.
> >
> > and a space after ',' in the function calls below.
>
> Here's a possible cleanup on top of serge's patchset as found in
> 2.6.17-rc4-mm3.

Yes, thanks, looks good.

---
~Randy