LinuxLists.cc - [PATCH 4/6] containers: Simple CPU accounting container subsystem

2006-12-22 15:09:31

Subject: [PATCH 4/6] containers: Simple CPU accounting container subsystem

This demonstrates how to use the generic container subsystem for a
simple resource tracker that counts the total CPU time used by all
processes in a container, during the time that they're members of the
container.

Signed-off-by: Paul Menage <[email protected]>

---
include/linux/cpu_acct.h | 14 +++++
init/Kconfig | 7 ++
kernel/Makefile | 1
kernel/cpu_acct.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 6 ++
5 files changed, 145 insertions(+)

Index: container-2.6.20-rc1/include/linux/cpu_acct.h
===================================================================
--- /dev/null
+++ container-2.6.20-rc1/include/linux/cpu_acct.h
@@ -0,0 +1,14 @@
+
+#ifndef _LINUX_CPU_ACCT_H
+#define _LINUX_CPU_ACCT_H
+
+#include <linux/container.h>
+#include <asm/cputime.h>
+
+#ifdef CONFIG_CONTAINER_CPUACCT
+extern void cpuacct_charge(struct task_struct *, cputime_t cputime);
+#else
+static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {}
+#endif
+
+#endif
Index: container-2.6.20-rc1/init/Kconfig
===================================================================
--- container-2.6.20-rc1.orig/init/Kconfig
+++ container-2.6.20-rc1/init/Kconfig
@@ -290,6 +290,13 @@ config PROC_PID_CPUSET
depends on CPUSETS
default y

+config CONTAINER_CPUACCT
+ bool "Simple CPU accounting container subsystem"
+ select CONTAINERS
+ help
+ Provides a simple Resource Controller for monitoring the
+ total CPU consumed by the tasks in a container
+
config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Index: container-2.6.20-rc1/kernel/cpu_acct.c
===================================================================
--- /dev/null
+++ container-2.6.20-rc1/kernel/cpu_acct.c
@@ -0,0 +1,117 @@
+/*
+ * kernel/cpu_acct.c - CPU accounting container subsystem
+ *
+ * Copyright (C) Google Inc, 2006
+ *
+ */
+
+/*
+ * Container subsystem for reporting total CPU usage of tasks in a
+ * container.
+ */
+
+#include <linux/module.h>
+#include <linux/container.h>
+#include <linux/fs.h>
+#include <asm/div64.h>
+
+struct cpuacct {
+ struct container_subsys_state css;
+ spinlock_t lock;
+ cputime64_t time; // total time used by this class
+};
+
+static struct container_subsys cpuacct_subsys;
+
+static inline struct cpuacct *container_ca(struct container *cont)
+{
+ return container_of(container_subsys_state(cont, &cpuacct_subsys),
+ struct cpuacct, css);
+}
+
+static inline struct cpuacct *task_ca(struct task_struct *task)
+{
+ return container_ca(task_container(task, &cpuacct_subsys));
+}
+
+static int cpuacct_create(struct container_subsys *ss, struct container *cont)
+{
+ struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca) return -ENOMEM;
+ spin_lock_init(&ca->lock);
+ cont->subsys[cpuacct_subsys.subsys_id] = &ca->css;
+ return 0;
+}
+
+static void cpuacct_destroy(struct container_subsys *ss,
+ struct container *cont)
+{
+ kfree(container_ca(cont));
+}
+
+static ssize_t cpuusage_read(struct container *cont,
+ struct cftype *cft,
+ struct file *file,
+ char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ struct cpuacct *ca = container_ca(cont);
+ cputime64_t time;
+ char usagebuf[64];
+ char *s = usagebuf;
+
+ spin_lock_irq(&ca->lock);
+ time = ca->time;
+ spin_unlock_irq(&ca->lock);
+
+ time *= 1000;
+ do_div(time, HZ);
+ s += sprintf(s, "%llu", (unsigned long long) time);
+
+ return simple_read_from_buffer(buf, nbytes, ppos, usagebuf, s - usagebuf);
+}
+
+static struct cftype cft_usage = {
+ .name = "cpuacct.usage",
+ .read = cpuusage_read,
+};
+
+static int cpuacct_populate(struct container_subsys *ss,
+ struct container *cont)
+{
+ return container_add_file(cont, &cft_usage);
+}
+
+
+void cpuacct_charge(struct task_struct *task, cputime_t cputime) {
+
+ struct cpuacct *ca;
+ unsigned long flags;
+
+ if (cpuacct_subsys.subsys_id < 0) return;
+ rcu_read_lock();
+ ca = task_ca(task);
+ if (ca) {
+ spin_lock_irqsave(&ca->lock, flags);
+ ca->time = cputime64_add(ca->time, cputime);
+ spin_unlock_irqrestore(&ca->lock, flags);
+ }
+ rcu_read_unlock();
+}
+
+static struct container_subsys cpuacct_subsys = {
+ .name = "cpuacct",
+ .create = cpuacct_create,
+ .destroy = cpuacct_destroy,
+ .populate = cpuacct_populate,
+ .subsys_id = -1,
+};
+
+
+int __init init_cpuacct(void)
+{
+ int id = container_register_subsys(&cpuacct_subsys);
+ return id < 0 ? id : 0;
+}
+
+module_init(init_cpuacct)
Index: container-2.6.20-rc1/kernel/Makefile
===================================================================
--- container-2.6.20-rc1.orig/kernel/Makefile
+++ container-2.6.20-rc1/kernel/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CONTAINERS) += container.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
Index: container-2.6.20-rc1/kernel/sched.c
===================================================================
--- container-2.6.20-rc1.orig/kernel/sched.c
+++ container-2.6.20-rc1/kernel/sched.c
@@ -52,6 +52,7 @@
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
+#include <linux/cpu_acct.h>
#include <asm/tlb.h>

#include <asm/unistd.h>
@@ -3068,6 +3069,8 @@ void account_user_time(struct task_struc

p->utime = cputime_add(p->utime, cputime);

+ cpuacct_charge(p, cputime);
+
/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
if (TASK_NICE(p) > 0)
@@ -3091,6 +3094,9 @@ void account_system_time(struct task_str

p->stime = cputime_add(p->stime, cputime);

+ if (p != rq->idle)
+ cpuacct_charge(p, cputime);
+
/* Add system time to cpustat. */
tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)

--

2007-01-10 14:27:10

by Balbir Singh

[permalink] [raw]

Subject: Re: [ckrm-tech] [PATCH 4/6] containers: Simple CPU accounting container subsystem

Paul Menage wrote:
> This demonstrates how to use the generic container subsystem for a
> simple resource tracker that counts the total CPU time used by all
> processes in a container, during the time that they're members of the
> container.
>
> Signed-off-by: Paul Menage <[email protected]>
>

Hi, Paul,

I have run into a problem running this patch on a powerpc box. Basically,
the machine panics as soon as I mount the container filesystem with

mount -t container -oall container /<mount point>, I see

cpu 0x2: Vector: 300 (Data Access) at [c0000001e7f2b8e0]
pc: c000000000098b70: .cpuacct_charge+0x84/0xbc
lr: c000000000060a3c: .account_user_time+0x60/0xb4
sp: c0000001e7f2bb60
msr: 8000000000009032
dar: 48
dsisr: 40000000
current = 0xc0000001e7f0e800
paca = 0xc00000000071c300
pid = 0, comm = swapper

Analyzing the dump a bit further lead me to container_subsys_state().

I am trying to figure out the reason for the panic and trying to find
a fix. Since the introduction of whole hierarchy system, the debugging
has gotten a bit harder and taking longer, hence I was wondering if you
had any clues about the problem

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs

2007-01-12 00:33:57

by Paul Menage

[permalink] [raw]

Subject: Re: [ckrm-tech] [PATCH 4/6] containers: Simple CPU accounting container subsystem

On 1/10/07, Balbir Singh <[email protected]> wrote:
>
> I have run into a problem running this patch on a powerpc box. Basically,
> the machine panics as soon as I mount the container filesystem with

This is a multi-processor system?

My guess is that it's a race in the subsystem API that I've been
meaning to deal with for some time - basically I've been using
(<foo>_subsys.subsys_id != -1) to indicate that <foo> is ready for
use, but there's a brief window during subsystem registration where
that's not actually true.

I'll add an "active" field in the container_subsys structure, which
isn't set until registration is completed, and subsystems should use
that instead. container_register_subsys() will set it just prior to
releasing callback_mutex, and cpu_acct.c (and other subsystems) will
check <foo>_subsys.active rather than (<foo>_subsys.subsys_id != -1)

> I am trying to figure out the reason for the panic and trying to find
> a fix. Since the introduction of whole hierarchy system, the debugging
> has gotten a bit harder and taking longer, hence I was wondering if you
> had any clues about the problem
>

Yes, the multi-hierarchy support does make the whole code a little
more complex - but people presented reasonable scenarios where a
single container tree for all resource controllers just wasn't
flexible enough.

Paul

2007-01-12 07:50:05

by Balbir Singh

[permalink] [raw]

Subject: Re: [ckrm-tech] [PATCH 4/6] containers: Simple CPU accounting container subsystem

2007-01-12 08:15:26

by Paul Menage

[permalink] [raw]

Subject: Re: [ckrm-tech] [PATCH 4/6] containers: Simple CPU accounting container subsystem

On 1/11/07, Balbir Singh <[email protected]> wrote:
>
> I tried something similar, I added an activated field, which is set
> to true when the ->create() callback is invoked. That did not help
> either, the machine still panic'ed.

I think that marking it active when create() is called may be too soon.

Is this with my unchanged cpuacct subsystem, or with the version that
you've extended to track load over defined periods? I don't see it
when I test under VMware (with two processors in the VM), but I
suspect that's not going to be quite as parallel as a real SMP system.

>
> I see the need for it, but I wonder if we should start with that
> right away. I understand that people might want to group cpusets
> differently from their grouping of let's say the cpu resource
> manager. I would still prefer to start with one hierarchy and then
> move to multiple hierarchies. I am concerned that adding complexity
> upfront might turn off people from using the infrastructure.

That's what I had originally and people objected to the lack of flexibility :-)

The presence or absence of multiple hierarchies is pretty much exposed
to userspace, and presenting the right interface to userspace is a
fairly important thing to get right from the start.

Paul

2007-01-12 08:26:47

by Balbir Singh

[permalink] [raw]

Subject: Re: [ckrm-tech] [PATCH 4/6] containers: Simple CPU accounting container subsystem

Paul Menage wrote:
> On 1/11/07, Balbir Singh <[email protected]> wrote:
>> I tried something similar, I added an activated field, which is set
>> to true when the ->create() callback is invoked. That did not help
>> either, the machine still panic'ed.
>
> I think that marking it active when create() is called may be too soon.
>
> Is this with my unchanged cpuacct subsystem, or with the version that
> you've extended to track load over defined periods? I don't see it
> when I test under VMware (with two processors in the VM), but I
> suspect that's not going to be quite as parallel as a real SMP system.

This is with the unchanged cpuacct subsystem. Ok, so the container
system needs to mark active internally then.

>
>> I see the need for it, but I wonder if we should start with that
>> right away. I understand that people might want to group cpusets
>> differently from their grouping of let's say the cpu resource
>> manager. I would still prefer to start with one hierarchy and then
>> move to multiple hierarchies. I am concerned that adding complexity
>> upfront might turn off people from using the infrastructure.
>
> That's what I had originally and people objected to the lack of flexibility :-)
>
> The presence or absence of multiple hierarchies is pretty much exposed
> to userspace, and presenting the right interface to userspace is a
> fairly important thing to get right from the start.
>

I understand that the features are exported to userspace. But from
the userspace POV only the mount options change - right?

> Paul
>

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs

2007-01-12 17:32:40

by Paul Menage

[permalink] [raw]

Subject: Re: [ckrm-tech] [PATCH 4/6] containers: Simple CPU accounting container subsystem

On 1/12/07, Balbir Singh <[email protected]> wrote:
>
> I understand that the features are exported to userspace. But from
> the userspace POV only the mount options change - right?
>

The mount options, plus the fact that you can mount different
instances of containerfs with different resource controller sets to
get different hierarchies. (Multiple mounts with the same controller
sets share the same superblock/hierarchy).

Paul

2007-01-15 09:07:10

by Balbir Singh

[permalink] [raw]

Subject: [PATCH 0/1] Add mount/umount callbacks to containers (Re: [ckrm-tech] [PATCH 4/6] containers: Simple CPU accounting container subsystem)

Paul Menage wrote:
> On 1/10/07, Balbir Singh <[email protected]> wrote:
>> I have run into a problem running this patch on a powerpc box. Basically,
>> the machine panics as soon as I mount the container filesystem with
>
> This is a multi-processor system?
>
> My guess is that it's a race in the subsystem API that I've been
> meaning to deal with for some time - basically I've been using
> (<foo>_subsys.subsys_id != -1) to indicate that <foo> is ready for
> use, but there's a brief window during subsystem registration where
> that's not actually true.
>
> I'll add an "active" field in the container_subsys structure, which
> isn't set until registration is completed, and subsystems should use
> that instead. container_register_subsys() will set it just prior to
> releasing callback_mutex, and cpu_acct.c (and other subsystems) will
> check <foo>_subsys.active rather than (<foo>_subsys.subsys_id != -1)
>
>> I am trying to figure out the reason for the panic and trying to find
>> a fix. Since the introduction of whole hierarchy system, the debugging
>> has gotten a bit harder and taking longer, hence I was wondering if you
>> had any clues about the problem
>>
>

Hi, Paul,

I figured out the reason for the panic. Here are the fixes

Add mount and umount callbacks. These callbacks can be used by the
controller to figure out the correct root container and also know
whether the controller is currently acitve.

Signed-off-by: Balbir Singh <[email protected]>
---

include/linux/container.h | 2 ++
kernel/container.c | 12 ++++++++++++
2 files changed, 14 insertions(+)

diff -puN include/linux/container.h~add-mount-callback include/linux/container.h
--- linux-2.6.20-rc3/include/linux/container.h~add-mount-callback 2007-01-12
21:23:00.000000000 +0530
+++ linux-2.6.20-rc3-balbir/include/linux/container.h 2007-01-12
21:23:00.000000000 +0530
@@ -171,6 +171,8 @@ struct container_subsys {
void (*exit)(struct container_subsys *ss, struct task_struct *task);
int (*populate)(struct container_subsys *ss,
struct container *cont);
+ void (*mount)(struct container_subsys *ss, struct container *cont);
+ void (*umount)(struct container_subsys *ss, struct container *cont);

int subsys_id;
#define MAX_CONTAINER_TYPE_NAMELEN 32
diff -puN kernel/container.c~add-mount-callback kernel/container.c
--- linux-2.6.20-rc3/kernel/container.c~add-mount-callback 2007-01-12
21:23:00.000000000 +0530
+++ linux-2.6.20-rc3-balbir/kernel/container.c 2007-01-12 21:42:59.000000000 +0530
@@ -394,6 +394,7 @@ static void container_put_super(struct s
int i;
struct container *cont = &root->top_container;
struct task_struct *g, *p;
+ struct container_subsys *ss;

root->sb = NULL;
sb->s_fs_info = NULL;
@@ -407,6 +408,11 @@ static void container_put_super(struct s

mutex_lock(&callback_mutex);

+ for_each_subsys(hierarchy, ss) {
+ if (ss->umount)
+ ss->umount(ss, cont);
+ }
+
/* Remove all tasks from this container hierarchy */
read_lock(&tasklist_lock);
do_each_thread(g, p) {
@@ -607,6 +613,12 @@ static int container_get_sb(struct file_
rcu_assign_pointer(subsys[i]->hierarchy,
hierarchy);
}
+
+ for_each_subsys(hierarchy, ss) {
+ if (ss->mount)
+ ss->mount(ss, cont);
+ }
+
mutex_unlock(&callback_mutex);
synchronize_rcu();

_

Balbir Singh,
Linux Technology Center,
IBM Software Labs

PS: I hope my mailer does not word wrap the patches.

2007-01-15 09:08:29

by Balbir Singh

[permalink] [raw]

Subject: [PATCH 1/1] Fix a panic while mouting containers on powerpc and some other small cleanups (Re: [ckrm-tech] [PATCH 4/6] containers: Simple CPU accounting container subsystem)

Balbir Singh wrote:
> Paul Menage wrote:
>> On 1/10/07, Balbir Singh <[email protected]> wrote:
>>> I have run into a problem running this patch on a powerpc box. Basically,
>>> the machine panics as soon as I mount the container filesystem with
>> This is a multi-processor system?
>>
> Hi, Paul,
>
> I figured out the reason for the panic. Here are the fixes
>

Here is the second patch and the real fix in sched.c

Fix coding style in cpuacct_charge()

In sched.c, account_user_time() can be called with the task p set to rq->idle.
Since idle tasks do not belong to any container, this was causing a panic in
task_ca() in cpu_acct.c.

Multiplying the time by 1000 is not correct in cpuusage_read(). The code
has been converted to use the correct cputime API.

Add mount/umount callbacks.

Signed-off-by: Balbir Singh <[email protected]>
---

kernel/cpu_acct.c | 29 +++++++++++++++++++++++------
kernel/sched.c | 17 +++++++++++------
2 files changed, 34 insertions(+), 12 deletions(-)

diff -puN kernel/cpu_acct.c~fix-cpuacct-panic-on-mount kernel/cpu_acct.c
--- linux-2.6.20-rc3/kernel/cpu_acct.c~fix-cpuacct-panic-on-mount 2007-01-15
14:23:20.000000000 +0530
+++ linux-2.6.20-rc3-balbir/kernel/cpu_acct.c 2007-01-15 14:23:20.000000000 +0530
@@ -22,6 +22,7 @@ struct cpuacct {
};

static struct container_subsys cpuacct_subsys;
+static struct container *root;

static inline struct cpuacct *container_ca(struct container *cont)
{
@@ -49,6 +50,16 @@ static void cpuacct_destroy(struct conta
kfree(container_ca(cont));
}

+static void cpuacct_mount(struct container_subsys *ss, struct container *cont)
+{
+ root = cont;
+}
+
+static void cpuacct_umount(struct container_subsys *ss, struct container *cont)
+{
+ root = NULL;
+}
+
static ssize_t cpuusage_read(struct container *cont,
struct cftype *cft,
struct file *file,
@@ -57,6 +68,7 @@ static ssize_t cpuusage_read(struct cont
{
struct cpuacct *ca = container_ca(cont);
cputime64_t time;
+ unsigned long time_in_jiffies;
char usagebuf[64];
char *s = usagebuf;

@@ -64,9 +76,8 @@ static ssize_t cpuusage_read(struct cont
time = ca->time;
spin_unlock_irq(&ca->lock);

- time *= 1000;
- do_div(time, HZ);
- s += sprintf(s, "%llu", (unsigned long long) time);
+ time_in_jiffies = cputime_to_jiffies(time);
+ s += sprintf(s, "%llu\n", (unsigned long long) time_in_jiffies);

return simple_read_from_buffer(buf, nbytes, ppos, usagebuf, s - usagebuf);
}
@@ -83,12 +94,13 @@ static int cpuacct_populate(struct conta
}

-void cpuacct_charge(struct task_struct *task, cputime_t cputime) {
+void cpuacct_charge(struct task_struct *task, cputime_t cputime)
+{

struct cpuacct *ca;
unsigned long flags;

- if (cpuacct_subsys.subsys_id < 0) return;
+ if (cpuacct_subsys.subsys_id < 0 || !root) return;
rcu_read_lock();
ca = task_ca(task);
if (ca) {
@@ -104,13 +116,18 @@ static struct container_subsys cpuacct_s
.create = cpuacct_create,
.destroy = cpuacct_destroy,
.populate = cpuacct_populate,
+ .mount = cpuacct_mount,
+ .umount = cpuacct_umount,
.subsys_id = -1,
};

int __init init_cpuacct(void)
{
- int id = container_register_subsys(&cpuacct_subsys);
+ int id;
+
+ root = NULL;
+ id = container_register_subsys(&cpuacct_subsys);
return id < 0 ? id : 0;
}

diff -puN kernel/sched.c~fix-cpuacct-panic-on-mount kernel/sched.c
--- linux-2.6.20-rc3/kernel/sched.c~fix-cpuacct-panic-on-mount 2007-01-15
14:23:20.000000000 +0530
+++ linux-2.6.20-rc3-balbir/kernel/sched.c 2007-01-15 14:23:20.000000000 +0530
@@ -3067,10 +3067,17 @@ void account_user_time(struct task_struc
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
cputime64_t tmp;
+ struct rq *rq = this_rq();

p->utime = cputime_add(p->utime, cputime);

- cpuacct_charge(p, cputime);
+ /*
+ * On powerpc this routine can be called with p set to the idle
+ * task of the cpu. idle tasks don't really belong to any
+ * container.
+ */
+ if (p != rq->idle)
+ cpuacct_charge(p, cputime);

/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
@@ -3095,18 +3102,16 @@ void account_system_time(struct task_str

p->stime = cputime_add(p->stime, cputime);

- if (p != rq->idle)
- cpuacct_charge(p, cputime);
-
/* Add system time to cpustat. */
tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
cpustat->irq = cputime64_add(cpustat->irq, tmp);
else if (softirq_count())
cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
- else if (p != rq->idle)
+ else if (p != rq->idle) {
cpustat->system = cputime64_add(cpustat->system, tmp);
- else if (atomic_read(&rq->nr_iowait) > 0)
+ cpuacct_charge(p, cputime);
+ } else if (atomic_read(&rq->nr_iowait) > 0)
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
else
cpustat->idle = cputime64_add(cpustat->idle, tmp);
_

Balbir Singh
Linux Technology Center
Bangalore, IBM ISTL

2007-01-15 09:22:35

by Paul Menage

[permalink] [raw]

Subject: Re: [PATCH 1/1] Fix a panic while mouting containers on powerpc and some other small cleanups (Re: [ckrm-tech] [PATCH 4/6] containers: Simple CPU accounting container subsystem)

On 1/15/07, Balbir Singh <[email protected]> wrote:
>
> In sched.c, account_user_time() can be called with the task p set to rq->idle.
> Since idle tasks do not belong to any container, this was causing a panic in
> task_ca() in cpu_acct.c.

How come that didn't cause a problem on x86_64? If this is an
inconsistency between architectures then perhaps it ought to be
cleaned up.

Additionally, I think that we should make the idle tasks members of
the root container(s), to remove this special case. (I'm a bit
surprised that they're not already - I thought that the early
container initialization was early enough that the idle tasks hadn't
yet been forked. Is that different on PowerPC?

>
> Multiplying the time by 1000 is not correct in cpuusage_read(). The code
> has been converted to use the correct cputime API.

Thanks.

>
> Add mount/umount callbacks.

I'm not sure I like the mount/unmount callbacks. What exactly are you
trying to gain from them? My intention was that the

cont->subsys[i]->container = cont;

line in container_get_sb() was doing essentially this - i.e. the
container_subsys_state for the root container in a subsystem is
already kept up to date by the container system, and the subsystem can
rely on the "container" field in the container_subsys_state.

Thanks,

Paul

>
> Signed-off-by: Balbir Singh <[email protected]>
> ---
>
> kernel/cpu_acct.c | 29 +++++++++++++++++++++++------
> kernel/sched.c | 17 +++++++++++------
> 2 files changed, 34 insertions(+), 12 deletions(-)
>
> diff -puN kernel/cpu_acct.c~fix-cpuacct-panic-on-mount kernel/cpu_acct.c
> --- linux-2.6.20-rc3/kernel/cpu_acct.c~fix-cpuacct-panic-on-mount 2007-01-15
> 14:23:20.000000000 +0530
> +++ linux-2.6.20-rc3-balbir/kernel/cpu_acct.c 2007-01-15 14:23:20.000000000 +0530
> @@ -22,6 +22,7 @@ struct cpuacct {
> };
>
> static struct container_subsys cpuacct_subsys;
> +static struct container *root;
>
> static inline struct cpuacct *container_ca(struct container *cont)
> {
> @@ -49,6 +50,16 @@ static void cpuacct_destroy(struct conta
> kfree(container_ca(cont));
> }
>
> +static void cpuacct_mount(struct container_subsys *ss, struct container *cont)
> +{
> + root = cont;
> +}
> +
> +static void cpuacct_umount(struct container_subsys *ss, struct container *cont)
> +{
> + root = NULL;
> +}
> +
> static ssize_t cpuusage_read(struct container *cont,
> struct cftype *cft,
> struct file *file,
> @@ -57,6 +68,7 @@ static ssize_t cpuusage_read(struct cont
> {
> struct cpuacct *ca = container_ca(cont);
> cputime64_t time;
> + unsigned long time_in_jiffies;
> char usagebuf[64];
> char *s = usagebuf;
>
> @@ -64,9 +76,8 @@ static ssize_t cpuusage_read(struct cont
> time = ca->time;
> spin_unlock_irq(&ca->lock);
>
> - time *= 1000;
> - do_div(time, HZ);
> - s += sprintf(s, "%llu", (unsigned long long) time);
> + time_in_jiffies = cputime_to_jiffies(time);
> + s += sprintf(s, "%llu\n", (unsigned long long) time_in_jiffies);
>
> return simple_read_from_buffer(buf, nbytes, ppos, usagebuf, s - usagebuf);
> }
> @@ -83,12 +94,13 @@ static int cpuacct_populate(struct conta
> }
>
>
> -void cpuacct_charge(struct task_struct *task, cputime_t cputime) {
> +void cpuacct_charge(struct task_struct *task, cputime_t cputime)
> +{
>
> struct cpuacct *ca;
> unsigned long flags;
>
> - if (cpuacct_subsys.subsys_id < 0) return;
> + if (cpuacct_subsys.subsys_id < 0 || !root) return;
> rcu_read_lock();
> ca = task_ca(task);
> if (ca) {
> @@ -104,13 +116,18 @@ static struct container_subsys cpuacct_s
> .create = cpuacct_create,
> .destroy = cpuacct_destroy,
> .populate = cpuacct_populate,
> + .mount = cpuacct_mount,
> + .umount = cpuacct_umount,
> .subsys_id = -1,
> };
>
>
> int __init init_cpuacct(void)
> {
> - int id = container_register_subsys(&cpuacct_subsys);
> + int id;
> +
> + root = NULL;
> + id = container_register_subsys(&cpuacct_subsys);
> return id < 0 ? id : 0;
> }
>
> diff -puN kernel/sched.c~fix-cpuacct-panic-on-mount kernel/sched.c
> --- linux-2.6.20-rc3/kernel/sched.c~fix-cpuacct-panic-on-mount 2007-01-15
> 14:23:20.000000000 +0530
> +++ linux-2.6.20-rc3-balbir/kernel/sched.c 2007-01-15 14:23:20.000000000 +0530
> @@ -3067,10 +3067,17 @@ void account_user_time(struct task_struc
> {
> struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
> cputime64_t tmp;
> + struct rq *rq = this_rq();
>
> p->utime = cputime_add(p->utime, cputime);
>
> - cpuacct_charge(p, cputime);
> + /*
> + * On powerpc this routine can be called with p set to the idle
> + * task of the cpu. idle tasks don't really belong to any
> + * container.
> + */
> + if (p != rq->idle)
> + cpuacct_charge(p, cputime);
>
> /* Add user time to cpustat. */
> tmp = cputime_to_cputime64(cputime);
> @@ -3095,18 +3102,16 @@ void account_system_time(struct task_str
>
> p->stime = cputime_add(p->stime, cputime);
>
> - if (p != rq->idle)
> - cpuacct_charge(p, cputime);
> -
> /* Add system time to cpustat. */
> tmp = cputime_to_cputime64(cputime);
> if (hardirq_count() - hardirq_offset)
> cpustat->irq = cputime64_add(cpustat->irq, tmp);
> else if (softirq_count())
> cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
> - else if (p != rq->idle)
> + else if (p != rq->idle) {
> cpustat->system = cputime64_add(cpustat->system, tmp);
> - else if (atomic_read(&rq->nr_iowait) > 0)
> + cpuacct_charge(p, cputime);
> + } else if (atomic_read(&rq->nr_iowait) > 0)
> cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
> else
> cpustat->idle = cputime64_add(cpustat->idle, tmp);
> _
>
> Balbir Singh
> Linux Technology Center
> Bangalore, IBM ISTL
>

2007-01-15 09:52:06

by Balbir Singh

[permalink] [raw]

Subject: Re: [ckrm-tech] [PATCH 1/1] Fix a panic while mouting containers on powerpc and some other small cleanups (Re: [PATCH 4/6] containers: Simple CPU accounting container subsystem)

Paul Menage wrote:
> On 1/15/07, Balbir Singh <[email protected]> wrote:
>> In sched.c, account_user_time() can be called with the task p set to rq->idle.
>> Since idle tasks do not belong to any container, this was causing a panic in
>> task_ca() in cpu_acct.c.
>
> How come that didn't cause a problem on x86_64? If this is an
> inconsistency between architectures then perhaps it ought to be
> cleaned up.
>

That is because account_system/user_time() is also called from
account_process_vtime() which is called from __switch_to in
power pc. vtime is for virtual time accounting. Enabled by
CONFIG_VIRT_CPU_ACCOUNTING.

> Additionally, I think that we should make the idle tasks members of
> the root container(s), to remove this special case. (I'm a bit
> surprised that they're not already - I thought that the early
> container initialization was early enough that the idle tasks hadn't
> yet been forked. Is that different on PowerPC?
>

idle threads are associated only with the runqueue and not visible
by the do_each_thread()/while_each_thread() loop. They are not added
to the tasklist (please see init_idle() in kernel/sched.c).

>> Multiplying the time by 1000 is not correct in cpuusage_read(). The code
>> has been converted to use the correct cputime API.
>
> Thanks.
>
>> Add mount/umount callbacks.
>
> I'm not sure I like the mount/unmount callbacks. What exactly are you
> trying to gain from them? My intention was that the
>
> cont->subsys[i]->container = cont;
>
> line in container_get_sb() was doing essentially this - i.e. the
> container_subsys_state for the root container in a subsystem is
> already kept up to date by the container system, and the subsystem can
> rely on the "container" field in the container_subsys_state.
>

While writing/extending the cpuacct container, I found it useful to
know if the container resource group we are controlling is really mounted.
Controllers can try and avoid doing work when not mounted and start
when the subsystem is mounted. Also, without these callbacks, one has no
definite way of checking if the top_container is dummy or for real.

> Thanks,
>
> Paul

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs

2007-01-15 10:01:38

by Paul Menage

[permalink] [raw]

Subject: Re: [ckrm-tech] [PATCH 1/1] Fix a panic while mouting containers on powerpc and some other small cleanups (Re: [PATCH 4/6] containers: Simple CPU accounting container subsystem)

On 1/15/07, Balbir Singh <[email protected]> wrote:
>
> While writing/extending the cpuacct container, I found it useful to
> know if the container resource group we are controlling is really mounted.
> Controllers can try and avoid doing work when not mounted and start
> when the subsystem is mounted. Also, without these callbacks, one has no
> definite way of checking if the top_container is dummy or for real.
>

That's somewhat intentional - my aim was that the controllers
shouldn't really care whether they're connected to the default
hierarchy or have been bound to some mounted hierarchy. Having said
thay, they can determine it by checking <foo>_subsys.hierarchy if they
really want to. If that's 0 then they're in the default hierarchy (and
can assume that all tasks are in one top-level container).

Paul

2007-01-15 10:12:04

by Balbir Singh

[permalink] [raw]

Subject: Re: [ckrm-tech] [PATCH 1/1] Fix a panic while mouting containers on powerpc and some other small cleanups (Re: [PATCH 4/6] containers: Simple CPU accounting container subsystem)

Paul Menage wrote:
> On 1/15/07, Balbir Singh <[email protected]> wrote:
>> While writing/extending the cpuacct container, I found it useful to
>> know if the container resource group we are controlling is really mounted.
>> Controllers can try and avoid doing work when not mounted and start
>> when the subsystem is mounted. Also, without these callbacks, one has no
>> definite way of checking if the top_container is dummy or for real.
>>
>
> That's somewhat intentional - my aim was that the controllers
> shouldn't really care whether they're connected to the default
> hierarchy or have been bound to some mounted hierarchy. Having said
> thay, they can determine it by checking <foo>_subsys.hierarchy if they
> really want to. If that's 0 then they're in the default hierarchy (and
> can assume that all tasks are in one top-level container).
>

That makes sense, the only additional thing required is to know when
the subsystem really got mounted (we cannot keep polling hierarchy
for it:-))

> Paul

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs