Hi Ingo,
Adds tunables in sysfs to modify a user's cpu share.
A directory is created in sysfs for each new user in the system.
/sys/kernel/uids/<uid>/cpu_share
Reading this file returns the cpu shares granted for the user.
Writing into this file modifies the cpu share for the user. Only an
administrator is allowed to modify a user's cpu share.
Ex:
# cd /sys/kernel/uids/
# cat 512/cpu_share
1024
# echo 2048 > 512/cpu_share
# cat 512/cpu_share
2048
#
Signed-off-by: Srivatsa Vaddagiri <[email protected]>
Signed-off-by: Dhaval Giani <[email protected]>
---
include/linux/sched.h | 11 +++++
kernel/ksysfs.c | 4 +
kernel/sched.c | 5 ++
kernel/user.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 129 insertions(+), 1 deletion(-)
Index: linux-2.6.23-rc8-sched-devel/include/linux/sched.h
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/include/linux/sched.h
+++ linux-2.6.23-rc8-sched-devel/include/linux/sched.h
@@ -86,6 +86,7 @@ struct sched_param {
#include <linux/timer.h>
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
+#include <linux/kobject.h>
#include <asm/processor.h>
@@ -598,9 +599,18 @@ struct user_struct {
#ifdef CONFIG_FAIR_USER_SCHED
struct task_group *tg;
+ struct kset kset;
+ struct subsys_attribute user_attr;
+ struct work_struct work;
#endif
};
+#ifdef CONFIG_FAIR_USER_SCHED
+extern int uids_kobject_init(void);
+#else
+static inline int uids_kobject_init(void) { return 0; }
+#endif
+
extern struct user_struct *find_user(uid_t);
extern struct user_struct root_user;
@@ -1846,6 +1856,7 @@ extern struct task_group *sched_create_g
extern void sched_destroy_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern unsigned long sched_group_shares(struct task_group *tg);
#endif
Index: linux-2.6.23-rc8-sched-devel/kernel/ksysfs.c
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/kernel/ksysfs.c
+++ linux-2.6.23-rc8-sched-devel/kernel/ksysfs.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kexec.h>
+#include <linux/sched.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,9 @@ static int __init ksysfs_init(void)
&notes_attr);
}
+ if (!error)
+ error = uids_kobject_init();
+
return error;
}
Index: linux-2.6.23-rc8-sched-devel/kernel/sched.c
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/kernel/sched.c
+++ linux-2.6.23-rc8-sched-devel/kernel/sched.c
@@ -6928,4 +6928,9 @@ int sched_group_set_shares(struct task_g
return 0;
}
+unsigned long sched_group_shares(struct task_group *tg)
+{
+ return tg->shares;
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
Index: linux-2.6.23-rc8-sched-devel/kernel/user.c
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/kernel/user.c
+++ linux-2.6.23-rc8-sched-devel/kernel/user.c
@@ -56,6 +56,62 @@ struct user_struct root_user = {
};
#ifdef CONFIG_FAIR_USER_SCHED
+
+static struct kobject uids_kobject;
+
+ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+{
+ struct user_struct *up = container_of(kset, struct user_struct, kset);
+
+ return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
+}
+
+ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
+{
+ struct user_struct *up = container_of(kset, struct user_struct, kset);
+ unsigned long shares;
+ int rc;
+
+ sscanf(buffer, "%lu", &shares);
+
+ rc = sched_group_set_shares(up->tg, shares);
+
+ return (rc ? rc : size);
+}
+
+static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
+{
+ sa->attr.name = name;
+ sa->attr.mode = mode;
+ sa->show = cpu_shares_show;
+ sa->store = cpu_shares_store;
+}
+
+static int user_kobject_create(struct user_struct *up)
+{
+ struct kset *kset = &up->kset;
+ struct kobject *kobj = &kset->kobj;
+ int error;
+
+ memset(kset, 0, sizeof(struct kset));
+ kobj->parent = &uids_kobject;
+ kobject_set_name(kobj, "%d", up->uid);
+ kset_init(kset);
+ user_attr_init(&up->user_attr, "cpu_share", 0644);
+
+ error = kobject_add(kobj);
+
+ if (error)
+ goto done;
+
+ error = sysfs_create_file(kobj, &up->user_attr.attr);
+ if (error)
+ kobject_del(kobj);
+
+done:
+ return error;
+}
+
static void sched_destroy_user(struct user_struct *up)
{
sched_destroy_group(up->tg);
@@ -77,11 +133,53 @@ static void sched_switch_user(struct tas
sched_move_task(p);
}
+int __init uids_kobject_init(void)
+{
+ int error;
+
+ uids_kobject.parent = &kernel_subsys.kobj;
+ kobject_set_name(&uids_kobject, "uids");
+ kobject_init(&uids_kobject);
+ error = kobject_add(&uids_kobject);
+
+ if (!error)
+ error = user_kobject_create(&root_user);
+
+ return error;
+}
+
+static void remove_user_sysfs_dir(struct work_struct *w)
+{
+ struct user_struct *up = container_of(w, struct user_struct, work);
+ struct kobject *kobj = &up->kset.kobj;
+
+ sysfs_remove_file(kobj, &up->user_attr.attr);
+ kobject_del(kobj);
+ kmem_cache_free(uid_cachep, up);
+}
+
+static inline void free_user(struct user_struct *up)
+{
+ struct kobject *kobj = &up->kset.kobj;
+
+ if (!kobj->sd)
+ return;
+
+ INIT_WORK(&up->work, remove_user_sysfs_dir);
+ schedule_work(&up->work);
+}
+
#else /* CONFIG_FAIR_USER_SCHED */
static void sched_destroy_user(struct user_struct *up) { }
static int sched_create_user(struct user_struct *up) { return 0; }
static void sched_switch_user(struct task_struct *p) { }
+static inline int user_kobject_create(struct user_struct *up) { return 0; }
+
+static inline void free_user(struct user_struct *up)
+{
+ kmem_cache_free(uid_cachep, up);
+}
#endif /* CONFIG_FAIR_USER_SCHED */
@@ -145,7 +243,7 @@ void free_uid(struct user_struct *up)
sched_destroy_user(up);
key_put(up->uid_keyring);
key_put(up->session_keyring);
- kmem_cache_free(uid_cachep, up);
+ free_user(up);
} else {
local_irq_restore(flags);
}
@@ -155,6 +253,7 @@ struct user_struct * alloc_uid(struct us
{
struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up;
+ int create_sysfs_dir = 0;
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
@@ -205,10 +304,19 @@ struct user_struct * alloc_uid(struct us
} else {
uid_hash_insert(new, hashent);
up = new;
+ create_sysfs_dir = 1;
}
spin_unlock_irq(&uidhash_lock);
}
+
+ if (create_sysfs_dir) {
+ if (user_kobject_create(up)) {
+ free_uid(up);
+ up = NULL;
+ }
+ }
+
return up;
}
--
regards,
Dhaval
hi Dhaval,
* Dhaval Giani <[email protected]> wrote:
> Hi Ingo,
>
> Adds tunables in sysfs to modify a user's cpu share.
>
> A directory is created in sysfs for each new user in the system.
>
> /sys/kernel/uids/<uid>/cpu_share
>
> Reading this file returns the cpu shares granted for the user.
> Writing into this file modifies the cpu share for the user. Only an
> administrator is allowed to modify a user's cpu share.
>
> Ex:
> # cd /sys/kernel/uids/
> # cat 512/cpu_share
> 1024
> # echo 2048 > 512/cpu_share
> # cat 512/cpu_share
> 2048
> #
looks good to me! I think this API is pretty straightforward. I've put
this into my tree and have updated the sched-devel git tree:
git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel.git
but there seems to be a bug in it, i get this:
kobject_add failed for 500 with -EEXIST, don't try to register things with the same name in the same directory.
[<c01e1730>] kobject_shadow_add+0x13b/0x169
[<c01e1795>] kobject_set_name+0x28/0x91
[<c01299d5>] user_kobject_create+0x6a/0x90
[<c0129d3c>] alloc_uid+0x141/0x181
[<c012d05c>] set_user+0x1c/0x91
[<c012e4c7>] sys_setresuid+0xd9/0x17d
[<c0103cf6>] sysenter_past_esp+0x5f/0x85
=======================
Ingo
On Mon, Oct 01, 2007 at 04:44:02PM +0200, Ingo Molnar wrote:
> but there seems to be a bug in it, i get this:
>
> kobject_add failed for 500 with -EEXIST, don't try to register things with the same name in the same directory.
Looks like a remove_dir/create_dir race ..
free_user() alloc_uid()
uid_hash_remove(up);
uid_hash_insert(new, hashent);
schedule_work()
user_kobject_create();
kobject_add(); -> Boom!
remove_user_sysfs_dir();
/me looks up sysfs programming examples ..
--
Regards,
vatsa
On Mon, Oct 01, 2007 at 07:34:54PM +0530, Dhaval Giani wrote:
> Hi Ingo,
>
> Adds tunables in sysfs to modify a user's cpu share.
>
> A directory is created in sysfs for each new user in the system.
>
> /sys/kernel/uids/<uid>/cpu_share
>
> Reading this file returns the cpu shares granted for the user.
> Writing into this file modifies the cpu share for the user. Only an
> administrator is allowed to modify a user's cpu share.
>
> Ex:
> # cd /sys/kernel/uids/
> # cat 512/cpu_share
> 1024
> # echo 2048 > 512/cpu_share
> # cat 512/cpu_share
> 2048
> #
Can we start adding stuff to Documentation/ for new files created
in sysfs ? There's so much undocumented stuff in there now that
it's unfunny.
A great start would be 'wtf is a cpu_share and why would I want to
change it' ?
Dave
--
http://www.codemonkey.org.uk
On Mon, Oct 01, 2007 at 12:12:21PM -0400, Dave Jones wrote:
> Can we start adding stuff to Documentation/ for new files created
> in sysfs ? There's so much undocumented stuff in there now that
> it's unfunny.
Hi Dave,
We will in the next version of the patch. At this point, the patch was
sent out to get an idea of whether sysfs is the right place for this interface
or not.
> A great start would be 'wtf is a cpu_share and why would I want to
> change it' ?
Quick answer before we release the next version : This has to do with
the CONFIG_FAIR_USER_SCHED feature which allows dividing CPU bandwidth
equally between all users of a system. Before this patch, CPU bandwidth was
divided equally among all non-root users. After this patch, it will
be possible to control CPU bandwidth allocation to each user separately.
For ex: by setting user "vatsa"'s cpu_share to be twice that of user
"guest", it would be possible for user "vatsa" to get 2x the CPU bandwidth of
user "guest"
--
Regards,
vatsa
On Mon, 2007-10-01 at 16:44 +0200, Ingo Molnar wrote:
> > Adds tunables in sysfs to modify a user's cpu share.
> >
> > A directory is created in sysfs for each new user in the system.
> >
> > /sys/kernel/uids/<uid>/cpu_share
> >
> > Reading this file returns the cpu shares granted for the user.
> > Writing into this file modifies the cpu share for the user. Only an
> > administrator is allowed to modify a user's cpu share.
> >
> > Ex:
> > # cd /sys/kernel/uids/
> > # cat 512/cpu_share
> > 1024
> > # echo 2048 > 512/cpu_share
> > # cat 512/cpu_share
> > 2048
> > #
>
> looks good to me! I think this API is pretty straightforward. I've put
> this into my tree and have updated the sched-devel git tree:
>
While a sysfs interface is OK and somewhat orthogonal to the interface
proposed in the containers patches, I think maybe a new syscall should be
considered.
Since we now have a fair share cpu scheduler, maybe an interface to
specify the cpu share directly (alternatively to priority) makes sense.
For processes, it may become more intuitive (and precise) to set the
processing share directly than setting a priority which is converted to
a share.
Maybe something similar to ioprio_set() and ioprio_get() syscalls:
- per user cpu share
- per user group cpu share
- per process cpu share
- per process group cpu share
Best regards,
- Eric
On Tue, Oct 02, 2007 at 06:12:39PM -0400, Eric St-Laurent wrote:
> While a sysfs interface is OK and somewhat orthogonal to the interface
> proposed the containers patches, I think maybe a new syscall should be
> considered.
We had discussed syscall vs filesystem based interface for resource
management [1] and there was a heavy bias favoring filesystem based interface,
based on which the container (now "cgroup") filesystem evolved.
Where we already have one interface defined, I would be against adding
an equivalent syscall interface.
Note that this "fair-user" scheduling can in theory be accomplished
using the same cgroup based interface, but requires some extra setup in
userspace (either to run a daemon which moves tasks to appropriate
control groups/containers upon their uid change OR to modify initrd to mount
cgroup filesystem at early bootup time). I expect most distros to enable
CONFIG_FAIR_CGROUP_SCHED (control group based fair group scheduler) and not
CONFIG_FAIR_USER_SCHED (user id based fair group scheduler). The only
reason why we are providing CONFIG_FAIR_USER_SCHED and the associated
sysfs interface is to help test group scheduler w/o requiring knowledge
of cgroup filesystem.
Reference:
1. http://marc.info/?l=linux-kernel&m=116231242201300&w=2
--
Regards,
vatsa
Hi Ingo,
Can you please drop commit b1add858a10cece3a68b2d8cb9e7350843700a58 (last
version of this patch) and try this instead?
---
Adds tunables in sysfs to modify a user's cpu share.
A directory is created in sysfs for each new user in the system.
/sys/kernel/uids/<uid>/cpu_share
Reading this file returns the cpu shares granted for the user.
Writing into this file modifies the cpu share for the user. Only an
administrator is allowed to modify a user's cpu share.
Ex:
# cd /sys/kernel/uids/
# cat 512/cpu_share
1024
# echo 2048 > 512/cpu_share
# cat 512/cpu_share
2048
#
Changelog since v1:
1. Added a mutex to serialize directory creation/destruction for a user in
sysfs
2. Added a spinlock in the task_group structure to serialize writes to
tg->shares.
3. Removed /proc/root_user_cpu_shares.
4. Added Documentation about the group scheduler.
Signed-off-by: Srivatsa Vaddagiri <[email protected]>
Signed-off-by: Dhaval Giani <[email protected]>
---
Documentation/sched-design-CFS.txt | 67 ++++++++++
include/linux/sched.h | 11 +
kernel/ksysfs.c | 8 +
kernel/sched.c | 14 +-
kernel/sched_debug.c | 48 -------
kernel/user.c | 242 ++++++++++++++++++++++++++++++++-----
6 files changed, 310 insertions(+), 80 deletions(-)
Index: linux-2.6.23-rc8-sched-devel/Documentation/sched-design-CFS.txt
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/Documentation/sched-design-CFS.txt
+++ linux-2.6.23-rc8-sched-devel/Documentation/sched-design-CFS.txt
@@ -117,3 +117,70 @@ Some implementation details:
iterators of the scheduling modules are used. The balancing code got
quite a bit simpler as a result.
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purpose:
+
+ - Based on user id (CONFIG_FAIR_USER_SCHED)
+ In this option, tasks are grouped according to their user id.
+ - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+ This option lets the administrator create arbitrary groups
+ of tasks, using the "cgroup" pseudo filesystem. See
+ Documentation/cgroups.txt for more information about this
+ filesystem.
+
+Only one of these options to group tasks can be chosen and not both.
+
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
+
+ # cd /sys/kernel/uids
+ # cat 512/cpu_share # Display user 512's CPU share
+ 1024
+ # echo 2048 > 512/cpu_share # Modify user 512's CPU share
+ # cat 512/cpu_share # Display user 512's CPU share
+ 2048
+ #
+
+CPU bandwidth between two users are divided in the ratio of their CPU shares.
+For ex: if you would like user "root" to get twice the bandwidth of user
+"guest", then set the cpu_share for both the users such that "root"'s
+cpu_share is twice "guest"'s cpu_share
+
+
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem. See example steps
+below to create task groups and modify their CPU share using the "cgroups"
+pseudo filesystem
+
+ # mkdir /dev/cpuctl
+ # mount -t cgroup -ocpu none /dev/cpuctl
+ # cd /dev/cpuctl
+
+ # mkdir multimedia # create "multimedia" group of tasks
+ # mkdir browser # create "browser" group of tasks
+
+ # #Configure the multimedia group to receive twice the CPU bandwidth
+ # #that of browser group
+
+ # echo 2048 > multimedia/cpu.shares
+ # echo 1024 > browser/cpu.shares
+
+ # firefox & # Launch firefox and move it to "browser" group
+ # echo <firefox_pid> > browser/tasks
+
+ # #Launch gmplayer (or your favourite movie player)
+ # echo <movie_player_pid> > multimedia/tasks
Index: linux-2.6.23-rc8-sched-devel/include/linux/sched.h
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/include/linux/sched.h
+++ linux-2.6.23-rc8-sched-devel/include/linux/sched.h
@@ -86,6 +86,7 @@ struct sched_param {
#include <linux/timer.h>
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
+#include <linux/kobject.h>
#include <asm/processor.h>
@@ -598,9 +599,18 @@ struct user_struct {
#ifdef CONFIG_FAIR_USER_SCHED
struct task_group *tg;
+ struct kset kset;
+ struct subsys_attribute user_attr;
+ struct work_struct work;
#endif
};
+#ifdef CONFIG_FAIR_USER_SCHED
+extern int uids_kobject_init(void);
+#else
+static inline int uids_kobject_init(void) { return 0; }
+#endif
+
extern struct user_struct *find_user(uid_t);
extern struct user_struct root_user;
@@ -1846,6 +1856,7 @@ extern struct task_group *sched_create_g
extern void sched_destroy_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern unsigned long sched_group_shares(struct task_group *tg);
#endif
Index: linux-2.6.23-rc8-sched-devel/kernel/ksysfs.c
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/kernel/ksysfs.c
+++ linux-2.6.23-rc8-sched-devel/kernel/ksysfs.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kexec.h>
+#include <linux/sched.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
&notes_attr);
}
+ /*
+ * Create "/sys/kernel/uids" directory and corresponding root user's
+ * directory under it.
+ */
+ if (!error)
+ error = uids_kobject_init();
+
return error;
}
Index: linux-2.6.23-rc8-sched-devel/kernel/sched.c
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/kernel/sched.c
+++ linux-2.6.23-rc8-sched-devel/kernel/sched.c
@@ -161,6 +161,8 @@ struct task_group {
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
+ /* spinlock to serialize modification to shares */
+ spinlock_t lock;
};
/* Default task group's sched entity on each cpu */
@@ -6552,6 +6554,7 @@ void __init sched_init(void)
se->parent = NULL;
}
init_task_group.shares = init_task_group_load;
+ spin_lock_init(&init_task_group.lock);
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -6796,6 +6799,7 @@ struct task_group *sched_create_group(vo
}
tg->shares = NICE_0_LOAD;
+ spin_lock_init(&tg->lock);
return tg;
@@ -6916,8 +6920,9 @@ int sched_group_set_shares(struct task_g
{
int i;
+ spin_lock(&tg->lock);
if (tg->shares == shares)
- return 0;
+ goto done;
/* return -EINVAL if the new value is not sane */
@@ -6925,7 +6930,14 @@ int sched_group_set_shares(struct task_g
for_each_possible_cpu(i)
set_se_shares(tg->se[i], shares);
+done:
+ spin_unlock(&tg->lock);
return 0;
}
+unsigned long sched_group_shares(struct task_group *tg)
+{
+ return tg->shares;
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
Index: linux-2.6.23-rc8-sched-devel/kernel/sched_debug.c
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/kernel/sched_debug.c
+++ linux-2.6.23-rc8-sched-devel/kernel/sched_debug.c
@@ -231,45 +231,6 @@ static void sysrq_sched_debug_show(void)
sched_debug_show(NULL, NULL);
}
-#ifdef CONFIG_FAIR_USER_SCHED
-
-static DEFINE_MUTEX(root_user_share_mutex);
-
-static int
-root_user_share_read_proc(char *page, char **start, off_t off, int count,
- int *eof, void *data)
-{
- return sprintf(page, "%d\n", init_task_group_load);
-}
-
-static int
-root_user_share_write_proc(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
-{
- unsigned long shares;
- char kbuf[sizeof(unsigned long)+1];
- int rc = 0;
-
- if (copy_from_user(kbuf, buffer, sizeof(kbuf)))
- return -EFAULT;
-
- shares = simple_strtoul(kbuf, NULL, 0);
-
- if (!shares)
- shares = NICE_0_LOAD;
-
- mutex_lock(&root_user_share_mutex);
-
- init_task_group_load = shares;
- rc = sched_group_set_shares(&init_task_group, shares);
-
- mutex_unlock(&root_user_share_mutex);
-
- return (rc < 0 ? rc : count);
-}
-
-#endif /* CONFIG_FAIR_USER_SCHED */
-
static int sched_debug_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_debug_show, NULL);
@@ -292,15 +253,6 @@ static int __init init_sched_debug_procf
pe->proc_fops = &sched_debug_fops;
-#ifdef CONFIG_FAIR_USER_SCHED
- pe = create_proc_entry("root_user_cpu_share", 0644, NULL);
- if (!pe)
- return -ENOMEM;
-
- pe->read_proc = root_user_share_read_proc;
- pe->write_proc = root_user_share_write_proc;
-#endif
-
return 0;
}
Index: linux-2.6.23-rc8-sched-devel/kernel/user.c
===================================================================
--- linux-2.6.23-rc8-sched-devel.orig/kernel/user.c
+++ linux-2.6.23-rc8-sched-devel/kernel/user.c
@@ -55,7 +55,41 @@ struct user_struct root_user = {
#endif
};
+/*
+ * These routines must be called with the uidhash spinlock held!
+ */
+static inline void uid_hash_insert(struct user_struct *up,
+ struct hlist_head *hashent)
+{
+ hlist_add_head(&up->uidhash_node, hashent);
+}
+
+static inline void uid_hash_remove(struct user_struct *up)
+{
+ hlist_del_init(&up->uidhash_node);
+}
+
+static inline struct user_struct *uid_hash_find(uid_t uid,
+ struct hlist_head *hashent)
+{
+ struct user_struct *user;
+ struct hlist_node *h;
+
+ hlist_for_each_entry(user, h, hashent, uidhash_node) {
+ if (user->uid == uid) {
+ atomic_inc(&user->__count);
+ return user;
+ }
+ }
+
+ return NULL;
+}
+
#ifdef CONFIG_FAIR_USER_SCHED
+
+static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
+static DEFINE_MUTEX(uids_mutex);
+
static void sched_destroy_user(struct user_struct *up)
{
sched_destroy_group(up->tg);
@@ -77,42 +111,173 @@ static void sched_switch_user(struct tas
sched_move_task(p);
}
-#else /* CONFIG_FAIR_USER_SCHED */
+static inline void uids_mutex_lock(void)
+{
+ mutex_lock(&uids_mutex);
+}
-static void sched_destroy_user(struct user_struct *up) { }
-static int sched_create_user(struct user_struct *up) { return 0; }
-static void sched_switch_user(struct task_struct *p) { }
+static inline void uids_mutex_unlock(void)
+{
+ mutex_unlock(&uids_mutex);
+}
-#endif /* CONFIG_FAIR_USER_SCHED */
+/* return cpu shares held by the user */
+ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+{
+ struct user_struct *up = container_of(kset, struct user_struct, kset);
-/*
- * These routines must be called with the uidhash spinlock held!
- */
-static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
+ return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
+}
+
+/* modify cpu shares held by the user */
+ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
{
- hlist_add_head(&up->uidhash_node, hashent);
+ struct user_struct *up = container_of(kset, struct user_struct, kset);
+ unsigned long shares;
+ int rc;
+
+ sscanf(buffer, "%lu", &shares);
+
+ rc = sched_group_set_shares(up->tg, shares);
+
+ return (rc ? rc : size);
}
-static inline void uid_hash_remove(struct user_struct *up)
+static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
{
- hlist_del_init(&up->uidhash_node);
+ sa->attr.name = name;
+ sa->attr.mode = mode;
+ sa->show = cpu_shares_show;
+ sa->store = cpu_shares_store;
}
-static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+/* Create "/sys/kernel/uids/<uid>" directory and
+ * "/sys/kernel/uids/<uid>/cpu_share" file for this user.
+ */
+static int user_kobject_create(struct user_struct *up)
{
- struct user_struct *user;
- struct hlist_node *h;
+ struct kset *kset = &up->kset;
+ struct kobject *kobj = &kset->kobj;
+ int error;
+
+ memset(kset, 0, sizeof(struct kset));
+ kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */
+ kobject_set_name(kobj, "%d", up->uid);
+ kset_init(kset);
+ user_attr_init(&up->user_attr, "cpu_share", 0644);
+
+ error = kobject_add(kobj);
+ if (error)
+ goto done;
+
+ error = sysfs_create_file(kobj, &up->user_attr.attr);
+ if (error)
+ kobject_del(kobj);
+
+done:
+ return error;
+}
+
+/* create these in sysfs filesystem:
+ * "/sys/kernel/uids" directory
+ * "/sys/kernel/uids/0" directory (for root user)
+ * "/sys/kernel/uids/0/cpu_share" file (for root user)
+ */
+int __init uids_kobject_init(void)
+{
+ int error;
- hlist_for_each_entry(user, h, hashent, uidhash_node) {
- if(user->uid == uid) {
- atomic_inc(&user->__count);
- return user;
- }
+ /* create under /sys/kernel dir */
+ uids_kobject.parent = &kernel_subsys.kobj;
+ kobject_set_name(&uids_kobject, "uids");
+ kobject_init(&uids_kobject);
+
+ error = kobject_add(&uids_kobject);
+ if (!error)
+ error = user_kobject_create(&root_user);
+
+ return error;
+}
+
+/* work function to remove sysfs directory for a user and free up
+ * corresponding structures.
+ */
+static void remove_user_sysfs_dir(struct work_struct *w)
+{
+ struct user_struct *up = container_of(w, struct user_struct, work);
+ struct kobject *kobj = &up->kset.kobj;
+ unsigned long flags;
+ int remove_user = 0;
+
+ /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
+ * atomic.
+ */
+ uids_mutex_lock();
+
+ local_irq_save(flags);
+
+ if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+ uid_hash_remove(up);
+ remove_user = 1;
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ } else {
+ local_irq_restore(flags);
}
- return NULL;
+ if (!remove_user)
+ goto done;
+
+ sysfs_remove_file(kobj, &up->user_attr.attr);
+ kobject_del(kobj);
+
+ sched_destroy_user(up);
+ key_put(up->uid_keyring);
+ key_put(up->session_keyring);
+ kmem_cache_free(uid_cachep, up);
+
+done:
+ uids_mutex_unlock();
+}
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+ /* restore back the count */
+ atomic_inc(&up->__count);
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+
+ INIT_WORK(&up->work, remove_user_sysfs_dir);
+ schedule_work(&up->work);
+}
+
+#else /* CONFIG_FAIR_USER_SCHED */
+
+static void sched_destroy_user(struct user_struct *up) { }
+static int sched_create_user(struct user_struct *up) { return 0; }
+static void sched_switch_user(struct task_struct *p) { }
+static inline int user_kobject_create(struct user_struct *up) { return 0; }
+static inline void uids_mutex_lock(void) { }
+static inline void uids_mutex_unlock(void) { }
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+ uid_hash_remove(up);
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ sched_destroy_user(up);
+ key_put(up->uid_keyring);
+ key_put(up->session_keyring);
+ kmem_cache_free(uid_cachep, up);
}
+#endif /* CONFIG_FAIR_USER_SCHED */
+
/*
* Locate the user_struct for the passed UID. If found, take a ref on it. The
* caller must undo that ref with free_uid().
@@ -139,16 +304,10 @@ void free_uid(struct user_struct *up)
return;
local_irq_save(flags);
- if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
- uid_hash_remove(up);
- spin_unlock_irqrestore(&uidhash_lock, flags);
- sched_destroy_user(up);
- key_put(up->uid_keyring);
- key_put(up->session_keyring);
- kmem_cache_free(uid_cachep, up);
- } else {
+ if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+ free_user(up, flags);
+ else
local_irq_restore(flags);
- }
}
struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
@@ -156,6 +315,11 @@ struct user_struct * alloc_uid(struct us
struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up;
+ /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
+ * atomic.
+ */
+ uids_mutex_lock();
+
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
spin_unlock_irq(&uidhash_lock);
@@ -191,6 +355,15 @@ struct user_struct * alloc_uid(struct us
return NULL;
}
+ if (user_kobject_create(new)) {
+ sched_destroy_user(new);
+ key_put(new->uid_keyring);
+ key_put(new->session_keyring);
+ kmem_cache_free(uid_cachep, new);
+ uids_mutex_unlock();
+ return NULL;
+ }
+
/*
* Before adding this, check whether we raced
* on adding the same user already..
@@ -198,7 +371,11 @@ struct user_struct * alloc_uid(struct us
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
- sched_destroy_user(new);
+ /* This case is not possible when CONFIG_FAIR_USER_SCHED
+ * is defined, since we serialize alloc_uid() using
+ * uids_mutex. Hence no need to call
+ * sched_destroy_user() or remove_user_sysfs_dir().
+ */
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
@@ -209,6 +386,9 @@ struct user_struct * alloc_uid(struct us
spin_unlock_irq(&uidhash_lock);
}
+
+ uids_mutex_unlock();
+
return up;
}
--
regards,
Dhaval
* Dhaval Giani <[email protected]> wrote:
> Hi Ingo,
>
> Can you please drop commit b1add858a10cece3a68b2d8cb9e7350843700a58 (last
> version of this patch) and try this instead?
> Changelog since v1:
> 1. Added a mutex to serialize directory creation/destruction for a user in
> sysfs
> 2. Added a spinlock in the task_group structure to serialize writes to
> tg->shares.
> 3. Removed /proc/root_user_cpu_shares.
> 4. Added Documentation about the group scheduler.
thanks - I have added this to the queue.
i'm wondering about the following: could not (yet) existing UIDs be made
configurable too? I.e. if i do this in a bootup script:
echo 2048 > /sys/kernel/uids/500/cpu_share
this should just work too, regardless of there not being any UID 500
tasks yet. Likewise, once configured, the /sys/kernel/uids/* directories
(with the settings in them) should probably not go away either.
Ingo
> > Changelog since v1:
> > 1. Added a mutex to serialize directory creation/destruction for a user in
> > sysfs
> > 2. Added a spinlock in the task_group structure to serialize writes to
> > tg->shares.
> > 3. Removed /proc/root_user_cpu_shares.
> > 4. Added Documentation about the group scheduler.
>
> thanks - I have added this to the queue.
>
> i'm wondering about the following: could not (yet) existing UIDs be made
> configurable too? I.e. if i do this in a bootup script:
>
> echo 2048 > /sys/kernel/uids/500/cpu_share
>
> this should just work too, regardless of there not being any UID 500
> tasks yet. Likewise, once configured, the /sys/kernel/uids/* directories
> (with the settings in them) should probably not go away either.
Shouldn't that be done via uevents? E.g. UID x gets added to the sysfs tree,
generates a uevent and a script then figures out the cpu_share and sets it.
That way you also don't need to keep the directories. No?
Heiko Carstens wrote:
>>> Changelog since v1:
>>> 1. Added a mutex to serialize directory creation/destruction for a user in
>>> sysfs
>>> 2. Added a spinlock in the task_group structure to serialize writes to
>>> tg->shares.
>>> 3. Removed /proc/root_user_cpu_shares.
>>> 4. Added Documentation about the group scheduler.
>> thanks - I have added this to the queue.
>>
>> i'm wondering about the following: could not (yet) existing UIDs be made
>> configurable too? I.e. if i do this in a bootup script:
>>
>> echo 2048 > /sys/kernel/uids/500/cpu_share
>>
>> this should just work too, regardless of there not being any UID 500
>> tasks yet. Likewise, once configured, the /sys/kernel/uids/* directories
>> (with the settings in them) should probably not go away either.
>
> Shouldn't that be done via uevents? E.g. UID x gets added to the sysfs tree,
> generates a uevent and a script then figures out the cpu_share and sets it.
> That way you also don't need to keep the directories. No?
That sounds complex administratively. It means that instead of setting a
higher or lower than default once and having it persist until reboot I
have to create a script, which *will* in some cases be left in place
even after the need has gone.
I agree with Ingo, once it's done it should be persistent.
And as another administrative convenience I can look at that set of
values and see what shares are being used, even when the user is not
currently active.
Final question, how do setuid processes map into this implementation?
--
Bill Davidsen <[email protected]>
"We have more to fear from the bungling of the incompetent than from
the machinations of the wicked." - from Slashdot
On Thu, Oct 04, 2007 at 12:02:01PM -0400, Bill Davidsen wrote:
> >>i'm wondering about the following: could not (yet) existing UIDs be made
> >>configurable too? I.e. if i do this in a bootup script:
> >>
> >> echo 2048 > /sys/kernel/uids/500/cpu_share
> >>
> >>this should just work too, regardless of there not being any UID 500
> >>tasks yet. Likewise, once configured, the /sys/kernel/uids/* directories
> >>(with the settings in them) should probably not go away either.
> >
> >Shouldn't that be done via uevents? E.g. UID x gets added to the sysfs
> >tree,
> >generates a uevent and a script then figures out the cpu_share and sets it.
> >That way you also don't need to keep the directories. No?
>
> That sounds complex administratively. It means that instead of setting a
> higher or lower than default once and having it persist until reboot I
> have to create a script, which *will* in some cases be left in place
> even after the need has gone.
>
> I agree with Ingo, once it's done it should be persistent.
> And as another administrative convenience I can look at that set of
> values and see what shares are being used, even when the user is not
> currently active.
Although the need seems very real, I am thinking about the implementation
aspect of this in the kernel, i.e., how will this be implementable?
1. The current patch proposes a sysfs based interface, where a new
directory is created for every new user created who logs into the
system. To meet the requirement Ingo suggested, it would require the
ability to create directories in sysfs in advance of (user_struct) objects
that aren't yet there - which is not possible to implement in sysfs
afaik
2. configfs seems to allow creation of directories (i.e kernel objects) from
userland. Every new directory created should translate to a
user_struct object being created in the kernel (and inserted in
uid_hash table). Would this be acceptable?
Also, IMHO, CONFIG_FAIR_USER_SCHED is there only as a toy, to test fair group
scheduling and I expect distros to support CONFIG_FAIR_CGROUP_SCHED instead
which allows "control group" (or process containers) based fair group
scheduling.
Using CONFIG_FAIR_CGROUP_SCHED it is still possible to provide user-id based
fair group scheduling, in two ways:
1. Have a daemon which listens for UID change events
(PROC_EVENT_UID) and move the task to appropriate "control
groups" and set the "control group" shares
2. Implement a "user" subsystem registered with "cgroup" core,
which automatically creates new "control groups" whenever
a new user is being added to the system. This is very similar
to "ns" subsystem (kernel/ns_cgroup.c in -mm tree).
Thus in order to provide fair user scheduling with this option,
distro needs to modify initrd to:
# mkdir /dev/usercpucontrol
# mount -t cgroup -ouser,cpu none /dev/usercpucontrol
Using a combination of these two options and a /etc configuration file
which specifies the cpu shares to be given to a user, it should be
possible for distro to give a good fair-user based scheduler.
> Final question, how do setuid processes map into this implementation?
We seem to be going by the real uid of a task (which is what tsk->user
points at) to decide its CPU bandwidth. Is that a cause of concern?
--
Regards,
vatsa
On Thu, 04 Oct 2007 10:54:51 +0200, Heiko Carstens said:
> > echo 2048 > /sys/kernel/uids/500/cpu_share
> >
> > this should just work too, regardless of there not being any UID 500
> > tasks yet. Likewise, once configured, the /sys/kernel/uids/* directories
> > (with the settings in them) should probably not go away either.
>
> Shouldn't that be done via uevents? E.g. UID x gets added to the sysfs tree,
> generates a uevent and a script then figures out the cpu_share and sets it.
That would tend to be a tad racy - a site may want to set limits in the
hypothetical /sys/kernel/uids/NNN before the program has a chance to fork-bomb
or otherwise make it difficult to set a limit from within another userspace
process. It's similar to why you want a process to be launched with all its
ulimit's set, rather than set them after the fork/exec happens...
On Thu, Oct 04, 2007 at 05:32:17PM -0400, [email protected] wrote:
> > Shouldn't that be done via uevents? E.g. UID x gets added to the sysfs tree,
> > generates a uevent and a script then figures out the cpu_share and sets it.
>
> That would tend to be a tad racy - a site may want to set limits in the
> hypothetical /sys/kernel/uids/NNN before the program has a chance to fork-bomb
> or otherwise make it difficult to set a limit from within another userspace
Note that there is a default limit enforced on every new user (1024
value for the user's cpu share). This limit should contain any fork-bomb that
the user may launch immediately.
> process. It's similar to why you want a process to be launched with all its
> ulimit's set, rather than set them after the fork/exec happens...
--
Regards,
vatsa
On Thu, Oct 04, 2007 at 10:54:51AM +0200, Heiko Carstens wrote:
> > i'm wondering about the following: could not (yet) existing UIDs be made
> > configurable too? I.e. if i do this in a bootup script:
> >
> > echo 2048 > /sys/kernel/uids/500/cpu_share
> >
> > this should just work too, regardless of there not being any UID 500
> > tasks yet. Likewise, once configured, the /sys/kernel/uids/* directories
> > (with the settings in them) should probably not go away either.
>
> Shouldn't that be done via uevents? E.g. UID x gets added to the sysfs tree,
> generates a uevent and a script then figures out the cpu_share and sets it.
> That way you also don't need to keep the directories. No?
Heiko,
Thanks for the hint. Here's a patch to enable generation of
uevents for user creation/deletion. These uevents can be handled in
userspace to configure a new user's cpu share.
Note : After bootup I could test that new user's cpu share is configured
as per a configuration file (/etc/user_cpu_share.conf). However, this
mechanism didn't work for the root user. Perhaps the uevent for the root user is
generated way too early?
A HOWTO text file is also attached explaining how to make use of these
uevents in userspace.
Ingo,
This patch applies on top of latest sched-devel tree. Pls review
and apply ..
---
Generate uevents when a user is being created/destroyed. These events
could be used to configure cpu share of a new user.
Signed-off-by: Srivatsa Vaddagiri <[email protected]>
Signed-off-by: Dhaval Giani <[email protected]>
---
kernel/user.c | 4 ++++
1 files changed, 4 insertions(+)
Index: current/kernel/user.c
===================================================================
--- current.orig/kernel/user.c
+++ current/kernel/user.c
@@ -174,6 +174,8 @@ static int user_kobject_create(struct us
if (error)
kobject_del(kobj);
+ kobject_uevent(kobj, KOBJ_ADD);
+
done:
return error;
}
@@ -189,6 +191,7 @@ int __init uids_kobject_init(void)
/* create under /sys/kernel dir */
uids_kobject.parent = &kernel_subsys.kobj;
+ uids_kobject.kset = &kernel_subsys;
kobject_set_name(&uids_kobject, "uids");
kobject_init(&uids_kobject);
@@ -228,6 +231,7 @@ static void remove_user_sysfs_dir(struct
goto done;
sysfs_remove_file(kobj, &up->user_attr.attr);
+ kobject_uevent(kobj, KOBJ_REMOVE);
kobject_del(kobj);
sched_destroy_user(up);
--
Regards,
vatsa
* Srivatsa Vaddagiri <[email protected]> wrote:
> > Shouldn't that be done via uevents? E.g. UID x gets added to the
> > sysfs tree, generates a uevent and a script then figures out the
> > cpu_share and sets it. That way you also don't need to keep the
> > directories. No?
>
> Heiko,
> Thanks for the hint. Here's a patch to enable generation of
> uevents for user creation/deletion. These uevents can be handled in
> userspace to configure a new user's cpu share.
>
> Note : After bootup I could test that new user's cpu share is
> configured as per a configuration file (/etc/user_cpu_share.conf).
> However this mechanism didnt work for root user. Perhaps uevent for
> root user is generated way too early?
>
> A HOWTO text file is also attached explaining how to make use of these
> uevents in userspace.
>
> Ingo,
> This patch applies on top of latest sched-devel tree. Pls
> review and apply ..
thanks, applied. This looks reassuringly simple and straightforward!
Ingo