2006-09-17 17:53:24

by Balbir Singh

[permalink] [raw]
Subject: [RFC][PATCH 0/4] Aggregated beancounters (v3)

Attempt #3 to build aggregated beancounters on top of beancounters.
The earlier patch built per-tgid beancounters, but that did not work
well. Subsystems keep track of references to beancounters. This patch
creates aggregated beancounters, which group individual beancounters
together. A beancounter is created for every tgid.

This patch is an RFC for early comments and discussion, and a proof of
concept to check whether this approach can be used as a basis for
supporting task migration. Dave Hansen initially suggested the idea.

TODOs (some of so many)

1. Add limit checking before migrating tasks
2. Add support for reclamation.
3. Add support for guarantees
4. Add support for per-task beancounters (cpu controller is likely to
require it)

series
------
per-tgid-beancounters.patch
add-aggr-bc.patch
aggr-bc-syscalls.patch
aggr-bc-charging-support.patch

This patch was minimally tested on an x86-64 box.

Comments?

Balbir Singh


Utility ctl.c for controlling and creating beancounters
-------------------------------------------------------

#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <asm/unistd.h>

/*
 * debug(fmt, ...) - printf-style tracing.  Calls are compiled in only when
 * DEBUG is defined; otherwise every call expands to nothing.
 */
#if defined(DEBUG)
#define debug(...) printf(__VA_ARGS__)
#else
#define debug(...)
#endif

/*
 * err(stream, fmt, ...) - print a printf-style message to stream, then exit
 * with status 1.
 *
 * The body is wrapped in do { } while (0) and carries no trailing semicolon,
 * so the macro behaves as exactly one statement: "if (x) err(...); else ..."
 * parses as intended.
 */
#define err(stream, ...) \
	do { \
		fprintf(stream, __VA_ARGS__); \
		exit(1); \
	} while (0)

/*
 * Resource names, indexed by the numeric resource id given with -r.
 * The entries point at string literals, so both the strings and the table
 * itself are declared const.
 */
const char *const resources[] = {
	"kmemsize",
	"lockedpages",
	"privvmpages",
};

/*
 * Hard-coded syscall numbers for the four beancounter system calls used by
 * the stubs below.  Presumably these must match the numbers assigned by the
 * patched kernel this tool runs on - verify against the kernel's syscall
 * table before use.
 *
 * For i386 (compiled out by the "#if 0")
 */
#if 0
#define __NR_get_bcid 319
#define __NR_set_bcid 320
#define __NR_set_bclimit 321
#define __NR_get_bcstat 322
#endif

/*
 * For ia64
 * NOTE(review): the cover letter says the series was tested on an x86-64
 * box - confirm which architecture these numbers actually belong to.
 */
#define __NR_get_bcid 280
#define __NR_set_bcid 281
#define __NR_set_bclimit 282
#define __NR_get_bcstat 283

/*
 * Per-resource parameters of an aggregated beancounter, as filled in by
 * get_bcstat().  The field order is the same as in the kernel-side
 * struct ab_resource_parm that patch 2/4 adds to include/bc/beancounter.h;
 * do not reorder the fields.
 */
struct ab_resource_parm {
unsigned long barrier;
unsigned long held;
unsigned long limit; /* hard resource limit */
unsigned long failcnt; /* count of failed charges */
};

/*
 * Per-resource parameters of a single beancounter, as filled in by
 * get_bcstat().  Presumably mirrors the kernel's struct bc_resource_parm in
 * include/bc/beancounter.h - confirm before changing the field order.
 */
struct bc_resource_parm {
unsigned long barrier; /* A barrier over which resource allocations
* are failed gracefully. e.g. if the amount
* of consumed memory is over the barrier
* further sbrk() or mmap() calls fail, the
* existing processes are not killed.
*/
unsigned long limit; /* hard resource limit */
unsigned long held; /* consumed resources */
unsigned long maxheld; /* maximum amount of consumed resources */
unsigned long minheld; /* minimum amount of consumed resources */
unsigned long failcnt; /* count of failed charges */
};

/*
 * Userspace stubs for the beancounter system calls, generated with the
 * _syscallN() macros (presumably provided by <asm/unistd.h>).
 *
 *   set_bcid(id, pid)       - move the task pid to aggregated beancounter id
 *   get_bcid()              - return the caller's aggregated beancounter id
 *   set_bclimit(id, resource, limits)
 *                           - limits[0] is the barrier, limits[1] the limit
 *   get_bcstat(bc_id, ab_id, resource, bc_parm, ab_parm)
 *                           - fetch the parameters of beancounter bc_id
 *                             inside aggregated beancounter ab_id
 *
 * The get_bcstat() parameter names follow the kernel's
 * sys_get_bcstat(bc_id, ab_id, ...) so that the stub reads the same way as
 * the call site in main(), which passes the beancounter id first.
 */
_syscall2(long, set_bcid, int, id, int, pid)
_syscall0(long, get_bcid)
_syscall3(long, set_bclimit, int, id, unsigned long, resource,
	  unsigned long *, limits)
_syscall5(long, get_bcstat, int, bc_id, int, ab_id,
	  unsigned long, resource, struct bc_resource_parm *, bc_parm,
	  struct ab_resource_parm *, ab_parm)

/* Number of entries in resources[]; valid -r arguments are 0 .. NR_RESOURCES - 1. */
#define NR_RESOURCES ((int)(sizeof(resources) / sizeof(resources[0])))

/*
 * Parse a complete decimal int from s into *out.
 * Returns 0 on success, -1 if s is empty, has trailing characters or does
 * not fit in an int.  *out is left untouched on failure.
 */
static int parse_int(const char *s, int *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(s, &end, 10);
	if (end == s || *end != '\0' || errno == ERANGE ||
	    val < INT_MIN || val > INT_MAX)
		return -1;
	*out = (int)val;
	return 0;
}

/*
 * Parse a complete decimal unsigned long from s into *out.
 * Returns 0 on success, -1 if s is empty, negative, has trailing characters
 * or is out of range.  *out is left untouched on failure.
 */
static int parse_ulong(const char *s, unsigned long *out)
{
	char *end;
	unsigned long val;

	/* strtoul() accepts a leading '-' and negates the result; reject it. */
	if (strchr(s, '-') != NULL)
		return -1;
	errno = 0;
	val = strtoul(s, &end, 10);
	if (end == s || *end != '\0' || errno == ERANGE)
		return -1;
	*out = val;
	return 0;
}

/*
 * ctl - create and control aggregated beancounters.
 *
 * Options are acted on in the order they appear on the command line, so an
 * option that supplies a value must precede the option that uses it:
 *
 *   -i <id>       aggregated beancounter id used by -d and by -b/-l
 *   -p <pid>      task acted on by -c (default: this process)
 *   -r <res>      resource index into resources[] (default 0)
 *   -c <id>       move task <pid> to aggregated beancounter <id>
 *   -g            print the caller's aggregated beancounter id
 *   -d <bc_id>    print the parameters of beancounter <bc_id> and of the
 *                 aggregated beancounter given with -i
 *   -b <barrier>  barrier to set (non-zero)
 *   -l <limit>    limit to set (non-zero); triggers set_bclimit() once all
 *                 options have been read
 *
 * Returns EXIT_SUCCESS if every requested operation succeeded and
 * EXIT_FAILURE otherwise; invalid arguments terminate through err().
 */
int main(int argc, char *argv[])
{
	int opt;
	int ab_id = 0;
	int bc_id;
	int res_id = 0;
	int pid = getpid();
	int set_limit = 0;
	int status = EXIT_SUCCESS;
	long rc;
	unsigned long limit[2] = { 0, 0 };	/* [0] = barrier, [1] = limit */
	struct bc_resource_parm bc_parm;
	struct ab_resource_parm ab_parm;

	while ((opt = getopt(argc, argv, "i:c:b:l:d:gr:p:")) != -1) {
		switch (opt) {
		case 'i':
			if (parse_int(optarg, &ab_id) < 0)
				err(stderr, "Invalid id %s\n", optarg);
			debug("id %d\n", ab_id);
			break;
		case 'p':
			if (parse_int(optarg, &pid) < 0)
				err(stderr, "Invalid pid %s\n", optarg);
			debug("pid %d\n", pid);
			break;
		case 'r':
			/* Bound the index before it is used on resources[]. */
			if (parse_int(optarg, &res_id) < 0 ||
			    res_id < 0 || res_id >= NR_RESOURCES)
				err(stderr, "Invalid resource %s\n", optarg);
			debug("resource %s\n", resources[res_id]);
			break;
		case 'c':
			if (parse_int(optarg, &ab_id) < 0)
				err(stderr, "Invalid id %s\n", optarg);
			debug("ab_id %d\n", ab_id);
			rc = set_bcid(ab_id, pid);
			if (rc < 0) {
				perror("set_bcid failed:");
				status = EXIT_FAILURE;
			}
			break;
		case 'g':
			rc = get_bcid();
			if (rc < 0) {
				perror("get_bcid failed:");
				status = EXIT_FAILURE;
				break;
			}
			/* The id is only printed; it must not become the exit status. */
			printf("current id %d\n", (int)rc);
			break;
		case 'd':
			if (parse_int(optarg, &bc_id) < 0)
				err(stderr, "Invalid beancounter id %s\n", optarg);
			memset(&bc_parm, 0, sizeof(bc_parm));
			memset(&ab_parm, 0, sizeof(ab_parm));
			rc = get_bcstat(bc_id, ab_id, (unsigned long)res_id,
					&bc_parm, &ab_parm);
			if (rc < 0) {
				perror("getstat failed:");
				status = EXIT_FAILURE;
				break;
			}
			/* Argument order follows the labels: limit, barrier, held. */
			printf("BC: %d, limit %lu, barrier %lu held %lu\n",
			       bc_id, bc_parm.limit, bc_parm.barrier,
			       bc_parm.held);
			printf("AB: %d, limit %lu, barrier %lu held %lu\n",
			       ab_id, ab_parm.limit, ab_parm.barrier,
			       ab_parm.held);
			break;
		case 'b':
			/* Unparsable input is reported like a zero barrier. */
			if (parse_ulong(optarg, &limit[0]) < 0)
				limit[0] = 0;
			if (limit[0] == 0)
				err(stderr,
				    "Invalid barrier, please try again barrier"
				    " %lu\n", limit[0]);
			debug("barrier is %lu\n", limit[0]);
			break;
		case 'l':
			set_limit = 1;
			/* Unparsable input is reported like a zero limit. */
			if (parse_ulong(optarg, &limit[1]) < 0)
				limit[1] = 0;
			if (limit[1] == 0)
				err(stderr,
				    "Invalid limit, please try again limit %lu\n",
				    limit[1]);
			debug("limit is %lu\n", limit[1]);
			break;
		default:
			err(stderr, "unknown option %c\n", opt);
		}
	}

	if (set_limit) {
		if (ab_id && limit[0] && limit[1]) {
			/* Apply the limits to the resource selected with -r. */
			rc = set_bclimit(ab_id, (unsigned long)res_id, limit);
			if (rc < 0) {
				perror("set_bclimit failed: ");
				status = EXIT_FAILURE;
			}
		} else {
			/* Report the skipped request instead of ignoring it. */
			fprintf(stderr,
				"limits not set: -l needs a non-zero id "
				"(-i or -c) and a barrier (-b)\n");
			status = EXIT_FAILURE;
		}
	}
	return status;
}

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs


2006-09-17 17:53:46

by Balbir Singh

[permalink] [raw]
Subject: [RFC][PATCH 3/4] Aggregated beancounters syscall support



Change the system calls to operate with aggregated beancounters instead
of beancounters.

Signed-off-by: Balbir Singh <[email protected]>
---

kernel/bc/sys.c | 91 ++++++++++++++++++++++++++++++++++++++------------------
1 files changed, 63 insertions(+), 28 deletions(-)

diff -puN kernel/bc/sys.c~aggr-bc-syscalls kernel/bc/sys.c
--- linux-2.6.18-rc5/kernel/bc/sys.c~aggr-bc-syscalls 2006-09-17 20:34:02.000000000 +0530
+++ linux-2.6.18-rc5-balbir/kernel/bc/sys.c 2006-09-17 20:34:02.000000000 +0530
@@ -10,39 +10,60 @@

#include <bc/beancounter.h>
#include <bc/task.h>
+#include <bc/vmpages.h>

asmlinkage long sys_get_bcid(void)
{
struct beancounter *bc;

bc = get_exec_bc();
- return bc->bc_id;
+ return bc->ab->ab_id;
}

-asmlinkage long sys_set_bcid(bcid_t id)
+asmlinkage long sys_set_bcid(bcid_t id, pid_t pid)
{
int error;
- struct beancounter *bc;
- struct task_beancounter *task_bc;
-
- task_bc = &current->task_bc;
+ struct beancounter *bc, *new_bc;
+ struct aggr_beancounter *ab;
+ struct task_struct *tsk;

/* You may only set an bc as root */
error = -EPERM;
if (!capable(CAP_SETUID))
+ return error;
+
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid(pid);
+ if (!tsk) {
+ read_unlock(&tasklist_lock);
+ error = -ESRCH;
goto out;
+ }
+ read_unlock(&tasklist_lock);
+
+ error = -EINVAL;
+ if (id == tsk->task_bc.exec_bc->ab->ab_id)
+ return error;

/* Ok - set up a beancounter entry for this user */
error = -ENOMEM;
- bc = beancounter_findcreate(id, BC_ALLOC);
- if (bc == NULL)
+ ab = ab_findcreate(id, BC_ALLOC);
+ if (ab == NULL)
goto out;

- /* install bc */
- put_beancounter(task_bc->exec_bc);
- task_bc->exec_bc = bc;
- put_beancounter(task_bc->fork_bc);
- task_bc->fork_bc = get_beancounter(bc);
+ /*
+ * Check if limits allow the move - or reclaim if required
+ * TODO:
+ */
+
+ get_task_struct(tsk);
+ bc = tsk->task_bc.exec_bc;
+ new_bc = beancounter_relocate(ab, bc->ab, tsk->tgid);
+ put_task_struct(tsk);
+ if (new_bc == NULL) {
+ put_aggr_beancounter(ab);
+ error = -ESRCH;
+ }
error = 0;
out:
return error;
@@ -53,7 +74,7 @@ asmlinkage long sys_set_bclimit(bcid_t i
{
int error;
unsigned long flags;
- struct beancounter *bc;
+ struct aggr_beancounter *ab;
unsigned long new_limits[2];

error = -EPERM;
@@ -74,45 +95,59 @@ asmlinkage long sys_set_bclimit(bcid_t i
goto out;

error = -ENOENT;
- bc = beancounter_findcreate(id, BC_LOOKUP);
- if (bc == NULL)
+ ab = ab_findcreate(id, BC_LOOKUP);
+ if (ab == NULL)
goto out;

- spin_lock_irqsave(&bc->bc_lock, flags);
- bc->bc_parms[resource].barrier = new_limits[0];
- bc->bc_parms[resource].limit = new_limits[1];
- spin_unlock_irqrestore(&bc->bc_lock, flags);
+ spin_lock_irqsave(&ab->ab_lock, flags);
+ ab->ab_parms[resource].barrier = new_limits[0];
+ ab->ab_parms[resource].limit = new_limits[1];
+ spin_unlock_irqrestore(&ab->ab_lock, flags);

- put_beancounter(bc);
+ put_aggr_beancounter(ab);
error = 0;
out:
return error;
}

-int sys_get_bcstat(bcid_t id, unsigned long resource,
- struct bc_resource_parm __user *uparm)
+int sys_get_bcstat(bcid_t bc_id, bcid_t ab_id, unsigned long resource,
+ struct bc_resource_parm __user *bc_uparm,
+ struct ab_resource_parm __user *ab_uparm)
{
int error;
unsigned long flags;
struct beancounter *bc;
- struct bc_resource_parm parm;
+ struct aggr_beancounter *ab;
+ struct bc_resource_parm bc_parm;
+ struct ab_resource_parm ab_parm;

error = -EINVAL;
if (resource >= BC_RESOURCES)
goto out;

error = -ENOENT;
- bc = beancounter_findcreate(id, BC_LOOKUP);
- if (bc == NULL)
+ ab = ab_findcreate(ab_id, BC_LOOKUP);
+ if (ab == NULL)
goto out;
+ spin_lock_irqsave(&ab->ab_lock, flags);
+ bc = beancounter_find_locked(ab, bc_id);
+ if (bc == NULL) {
+ spin_unlock_irqrestore(&ab->ab_lock, flags);
+ goto out;
+ }

spin_lock_irqsave(&bc->bc_lock, flags);
- parm = bc->bc_parms[resource];
+ bc_parm = bc->bc_parms[resource];
spin_unlock_irqrestore(&bc->bc_lock, flags);
+ ab_parm = ab->ab_parms[resource];
+ spin_unlock_irqrestore(&ab->ab_lock, flags);
put_beancounter(bc);
+ put_aggr_beancounter(ab);

error = 0;
- if (copy_to_user(uparm, &parm, sizeof(parm)))
+ if (copy_to_user(bc_uparm, &bc_parm, sizeof(bc_parm)))
+ error = -EFAULT;
+ if (copy_to_user(ab_uparm, &ab_parm, sizeof(ab_parm)))
error = -EFAULT;

out:
_

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs

2006-09-17 17:53:53

by Balbir Singh

[permalink] [raw]
Subject: [RFC][PATCH 2/4] Aggregated beancounters infrastructure



Add support for aggregated beancounters. An aggregated beancounter contains
several beancounters. The default aggregator is called init_ab (similar to
init_bc). Aggregated beancounters are created without any limits imposed
on them.

Signed-off-by: Balbir Singh <[email protected]>
---

include/bc/beancounter.h | 48 +++++++-
kernel/bc/beancounter.c | 272 ++++++++++++++++++++++++++++++++++++++++-------
kernel/bc/misc.c | 2
3 files changed, 280 insertions(+), 42 deletions(-)

diff -puN include/bc/beancounter.h~add-aggr-bc include/bc/beancounter.h
--- linux-2.6.18-rc5/include/bc/beancounter.h~add-aggr-bc 2006-09-17 20:33:15.000000000 +0530
+++ linux-2.6.18-rc5-balbir/include/bc/beancounter.h 2006-09-17 20:33:15.000000000 +0530
@@ -18,6 +18,13 @@

#define BC_RESOURCES 3

+struct ab_resource_parm {
+ unsigned long barrier;
+ unsigned long held;
+ unsigned long limit; /* hard resource limit */
+ unsigned long failcnt; /* count of failed charges */
+};
+
struct bc_resource_parm {
unsigned long barrier; /* A barrier over which resource allocations
* are failed gracefully. e.g. if the amount
@@ -51,6 +58,27 @@ struct bc_resource_parm {
*/
#define BC_MAGIC 0x62756275UL

+#define AB_HASH_BITS 8
+#define AB_HASH_SIZE (1 << AB_HASH_BITS)
+#define ab_hash_fn(bcid) (hash_long(bcid, AB_HASH_BITS))
+
+#define BC_HASH_BITS 5
+#define BC_HASH_SIZE (1 << BC_HASH_BITS)
+#define bc_hash_fn(bcid) (hash_long(bcid, BC_HASH_BITS))
+
+
+/*
+ * Aggregate beancounters - group independent BC's together to form groups
+ */
+struct aggr_beancounter {
+ struct ab_resource_parm ab_parms[BC_RESOURCES];
+ atomic_t ab_refcount;
+ struct hlist_head ab_bucket[AB_HASH_SIZE];
+ spinlock_t ab_lock;
+ struct hlist_node hash;
+ bcid_t ab_id;
+};
+
/*
* Resource management structures
* Serialization issues:
@@ -73,6 +101,7 @@ struct beancounter {
#endif
/* resources statistics and settings */
struct bc_resource_parm bc_parms[BC_RESOURCES];
+ struct aggr_beancounter *ab;
};

enum bc_severity { BC_BARRIER, BC_LIMIT, BC_FORCE };
@@ -118,7 +147,20 @@ int __must_check bc_charge(struct beanco
void bc_uncharge_locked(struct beancounter *bc, int res, unsigned long val);
void bc_uncharge(struct beancounter *bc, int res, unsigned long val);

-struct beancounter *beancounter_findcreate(bcid_t id, int mask);
+struct beancounter *beancounter_create(struct aggr_beancounter *ab, bcid_t id);
+struct beancounter *beancounter_find_locked(struct aggr_beancounter *ab,
+ bcid_t id);
+struct aggr_beancounter *ab_findcreate(bcid_t id, int mask);
+struct beancounter *beancounter_relocate(struct aggr_beancounter *dst_ab,
+ struct aggr_beancounter *src_ab,
+ bcid_t id);
+
+static inline struct aggr_beancounter *
+get_aggr_beancounter(struct aggr_beancounter *ab)
+{
+ atomic_inc(&ab->ab_refcount);
+ return ab;
+}

static inline struct beancounter *get_beancounter(struct beancounter *bc)
{
@@ -127,6 +169,7 @@ static inline struct beancounter *get_be
}

void put_beancounter(struct beancounter *bc);
+void put_aggr_beancounter(struct aggr_beancounter *ab);

void bc_init_early(void);
void bc_init_late(void);
@@ -139,7 +182,8 @@ extern const char *bc_rnames[];

#define nr_beancounters 0

-#define beancounter_findcreate(id, f) (NULL)
+#define beancounter_create(ab, id) (NULL)
+#define beancounter_find_locked(ab, id) (NULL)
#define get_beancounter(bc) (NULL)
#define put_beancounter(bc) do { } while (0)

diff -puN kernel/bc/beancounter.c~add-aggr-bc kernel/bc/beancounter.c
--- linux-2.6.18-rc5/kernel/bc/beancounter.c~add-aggr-bc 2006-09-17 20:33:15.000000000 +0530
+++ linux-2.6.18-rc5-balbir/kernel/bc/beancounter.c 2006-09-17 20:33:15.000000000 +0530
@@ -14,11 +14,17 @@
#include <bc/vmrss.h>

static kmem_cache_t *bc_cachep;
+static kmem_cache_t *ab_cachep;
static struct beancounter default_beancounter;

-static void init_beancounter_struct(struct beancounter *bc, bcid_t id);
+static void init_beancounter_struct(struct aggr_beancounter *ab,
+ struct beancounter *bc, bcid_t id);
+static void init_aggr_beancounter_struct(struct aggr_beancounter *ab,
+ bcid_t id);
+static void init_aggr_beancounter_parm_nolimits(struct aggr_beancounter *ab);

-struct beancounter init_bc;
+struct beancounter init_bc;
+struct aggr_beancounter init_ab;

unsigned int nr_beancounters = 1;

@@ -28,12 +34,64 @@ const char *bc_rnames[] = {
"privvmpages",
};

-#define BC_HASH_BITS 8
-#define BC_HASH_SIZE (1 << BC_HASH_BITS)
+/*
+ * Aggregated beancounters are stored in ab_hash bucket
+ */
+static struct hlist_head ab_hash[AB_HASH_SIZE];
+spinlock_t ab_hash_lock;
+
+struct aggr_beancounter *ab_findcreate(bcid_t id, int mask)
+{
+ struct aggr_beancounter *new_ab, *ab;
+ unsigned long flags;
+ struct hlist_head *slot;
+ struct hlist_node *pos;
+
+ /*
+ * Assumption: This function is called with a reference to ab held
+ */
+ slot = &ab_hash[ab_hash_fn(id)];
+ new_ab = NULL;
+
+retry:
+ spin_lock_irqsave(&ab_hash_lock, flags);
+ hlist_for_each_entry (ab, pos, slot, hash)
+ if (ab->ab_id == id)
+ break;

-static struct hlist_head bc_hash[BC_HASH_SIZE];
-static spinlock_t bc_hash_lock;
-#define bc_hash_fn(bcid) (hash_long(bcid, BC_HASH_BITS))
+ if (pos != NULL) {
+ get_aggr_beancounter(ab);
+ spin_unlock_irqrestore(&ab_hash_lock, flags);
+
+ if (new_ab != NULL)
+ kmem_cache_free(ab_cachep, new_ab);
+ return ab;
+ }
+
+ if (new_ab != NULL)
+ goto out_install;
+
+ spin_unlock_irqrestore(&ab_hash_lock, flags);
+
+ if (!(mask & BC_ALLOC))
+ goto out;
+
+ new_ab = kmem_cache_alloc(ab_cachep,
+ mask & BC_ALLOC_ATOMIC ? GFP_ATOMIC : GFP_KERNEL);
+ if (new_ab == NULL)
+ goto out;
+
+ init_aggr_beancounter_struct(new_ab, id);
+ init_aggr_beancounter_parm_nolimits(new_ab);
+ goto retry;
+
+out_install:
+ hlist_add_head(&new_ab->hash, slot);
+ nr_beancounters++;
+ spin_unlock_irqrestore(&ab_hash_lock, flags);
+out:
+ return new_ab;
+}

/*
* Per resource beancounting. Resources are tied to their bc id.
@@ -47,63 +105,159 @@ static spinlock_t bc_hash_lock;
* will mean the old entry is still around with resource tied to it.
*/

-struct beancounter *beancounter_findcreate(bcid_t id, int mask)
+struct beancounter *beancounter_find_locked(struct aggr_beancounter *ab,
+ bcid_t id)
{
- struct beancounter *new_bc, *bc;
- unsigned long flags;
+ struct beancounter *bc;
struct hlist_head *slot;
struct hlist_node *pos;

- slot = &bc_hash[bc_hash_fn(id)];
- new_bc = NULL;
+ /*
+ * Assumption: This function is called with a reference to ab held
+ */
+ slot = &ab->ab_bucket[bc_hash_fn(id)];

-retry:
- spin_lock_irqsave(&bc_hash_lock, flags);
hlist_for_each_entry (bc, pos, slot, hash)
if (bc->bc_id == id)
break;

if (pos != NULL) {
get_beancounter(bc);
- spin_unlock_irqrestore(&bc_hash_lock, flags);
+ return bc;
+ }
+ else
+ return NULL;
+}
+
+struct beancounter *beancounter_create(struct aggr_beancounter *ab, bcid_t id)
+{
+ unsigned long flags;
+ struct beancounter *bc = NULL, *new_bc;
+ struct hlist_head *slot;
+
+ get_aggr_beancounter(ab);
+ spin_lock_irqsave(&ab->ab_lock, flags);
+ bc = beancounter_find_locked(ab, id);
+ spin_unlock_irqrestore(&ab->ab_lock, flags);

- if (new_bc != NULL)
- kmem_cache_free(bc_cachep, new_bc);
+ if (bc)
return bc;
+ else {
+ new_bc = kmem_cache_alloc(bc_cachep, GFP_KERNEL);
+ *new_bc = default_beancounter;
+ init_beancounter_struct(ab, new_bc, id);
}

- if (new_bc != NULL)
- goto out_install;
+ spin_lock_irqsave(&ab->ab_lock, flags);
+ bc = beancounter_find_locked(ab, id);
+ if (unlikely(bc)) {
+ spin_unlock_irqrestore(&ab->ab_lock, flags);
+ kmem_cache_free(bc_cachep, new_bc);
+ return bc;
+ }
+ slot = &ab->ab_bucket[bc_hash_fn(id)];
+ hlist_add_head(&new_bc->hash, slot);
+ spin_unlock_irqrestore(&ab->ab_lock, flags);
+ return new_bc;
+}

- spin_unlock_irqrestore(&bc_hash_lock, flags);
+static void double_ab_lock(struct aggr_beancounter *ab1,
+ struct aggr_beancounter *ab2,
+ unsigned long *flags)
+{
+ if (ab1 > ab2) {
+ spin_lock_irqsave(&ab1->ab_lock, *flags);
+ spin_lock(&ab2->ab_lock);
+ } else if (ab2 > ab1) {
+ spin_lock_irqsave(&ab2->ab_lock, *flags);
+ spin_lock(&ab1->ab_lock);
+ } else
+ BUG();
+}

- if (!(mask & BC_ALLOC))
+static void double_ab_unlock(struct aggr_beancounter *ab1,
+ struct aggr_beancounter *ab2,
+ unsigned long *flags)
+{
+ if (ab1 > ab2) {
+ spin_unlock(&ab2->ab_lock);
+ spin_unlock_irqrestore(&ab1->ab_lock, *flags);
+ } else if (ab2 > ab1) {
+ spin_unlock(&ab1->ab_lock);
+ spin_unlock_irqrestore(&ab2->ab_lock, *flags);
+ } else
+ BUG();
+}
+
+/*
+ * This function should be called with a reference to dst_ab held
+ */
+struct beancounter *beancounter_relocate(struct aggr_beancounter *dst_ab,
+ struct aggr_beancounter *src_ab,
+ bcid_t id)
+{
+ unsigned long flags;
+ struct beancounter *bc = NULL, *new_bc;
+ struct hlist_head *slot;
+
+ double_ab_lock(dst_ab, src_ab, &flags);
+ bc = beancounter_find_locked(src_ab, id);
+ if (!bc)
goto out;

- new_bc = kmem_cache_alloc(bc_cachep,
- mask & BC_ALLOC_ATOMIC ? GFP_ATOMIC : GFP_KERNEL);
- if (new_bc == NULL)
+ /*
+ * Ideally this should be a BUG if new_bc is found.
+ * But we allow a small margin for several threads migrating
+ * to a new aggregated beancounter. BUG() it later.
+ */
+ new_bc = beancounter_find_locked(dst_ab, id);
+ if (new_bc) {
+ bc = new_bc;
goto out;
+ }

- *new_bc = default_beancounter;
- init_beancounter_struct(new_bc, id);
- goto retry;
+ spin_lock(&bc->bc_lock);
+ hlist_del(&bc->hash);
+ slot = &dst_ab->ab_bucket[bc_hash_fn(id)];
+ hlist_add_head(&bc->hash, slot);
+ bc->ab = dst_ab;
+ spin_unlock(&bc->bc_lock);

-out_install:
- hlist_add_head(&new_bc->hash, slot);
- nr_beancounters++;
- spin_unlock_irqrestore(&bc_hash_lock, flags);
out:
- return new_bc;
+ put_aggr_beancounter(src_ab);
+ double_ab_unlock(dst_ab, src_ab, &flags);
+ return bc;
+}
+
+void put_aggr_beancounter(struct aggr_beancounter *ab)
+{
+ int i;
+ unsigned long flags;
+
+ if (!atomic_dec_and_lock_irqsave(&ab->ab_refcount,
+ &ab_hash_lock, flags))
+ return;
+
+ for (i = 0; i < BC_RESOURCES; i++)
+ if (ab->ab_parms[i].held != 0)
+ printk("AB: %d has %lu of %s held on put", ab->ab_id,
+ ab->ab_parms[i].held, bc_rnames[i]);
+
+ hlist_del(&ab->hash);
+ nr_beancounters--;
+ spin_unlock_irqrestore(&ab_hash_lock, flags);
+
+ kmem_cache_free(ab_cachep, ab);
}

void put_beancounter(struct beancounter *bc)
{
int i;
unsigned long flags;
+ struct aggr_beancounter *ab = bc->ab;

if (!atomic_dec_and_lock_irqsave(&bc->bc_refcount,
- &bc_hash_lock, flags))
+ &ab->ab_lock, flags))
return;

BUG_ON(bc == &init_bc);
@@ -123,9 +277,10 @@ void put_beancounter(struct beancounter
#endif
hlist_del(&bc->hash);
nr_beancounters--;
- spin_unlock_irqrestore(&bc_hash_lock, flags);
+ spin_unlock_irqrestore(&ab->ab_lock, flags);

kmem_cache_free(bc_cachep, bc);
+ put_aggr_beancounter(ab);
}

EXPORT_SYMBOL_GPL(put_beancounter);
@@ -228,12 +383,38 @@ EXPORT_SYMBOL_GPL(bc_uncharge);
* of fields not initialized explicitly.
*/

-static void init_beancounter_struct(struct beancounter *bc, bcid_t id)
+static void init_beancounter_struct(struct aggr_beancounter *ab,
+ struct beancounter *bc, bcid_t id)
{
bc->bc_magic = BC_MAGIC;
atomic_set(&bc->bc_refcount, 1);
spin_lock_init(&bc->bc_lock);
bc->bc_id = id;
+ bc->ab = ab;
+}
+
+static void init_aggr_beancounter_struct(struct aggr_beancounter *ab,
+ bcid_t ab_id)
+{
+ int i;
+
+ spin_lock_init(&ab->ab_lock);
+ ab->ab_id = ab_id;
+ atomic_set(&ab->ab_refcount, 1);
+ for (i = 0; i < AB_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&ab->ab_bucket[i]);
+ INIT_HLIST_NODE(&ab->hash);
+}
+
+static void init_aggr_beancounter_parm_nolimits(struct aggr_beancounter *ab)
+{
+ int k;
+
+ for (k = 0; k < BC_RESOURCES; k++) {
+ ab->ab_parms[k].held = 0;
+ ab->ab_parms[k].limit = BC_MAXVALUE;
+ ab->ab_parms[k].barrier = BC_MAXVALUE;
+ }
}

static void init_beancounter_nolimits(struct beancounter *bc)
@@ -261,17 +442,24 @@ static void init_beancounter_syslimits(s
void __init bc_init_early(void)
{
struct beancounter *bc;
+ struct aggr_beancounter *ab;
struct hlist_head *slot;

bc = &init_bc;
+ ab = &init_ab;

init_beancounter_nolimits(bc);
- init_beancounter_struct(bc, 0);
+ init_aggr_beancounter_struct(ab, 0);
+ init_aggr_beancounter_parm_nolimits(ab);
+ init_beancounter_struct(ab, bc, 0);

- spin_lock_init(&bc_hash_lock);
- slot = &bc_hash[bc_hash_fn(bc->bc_id)];
+ slot = &ab->ab_bucket[bc_hash_fn(bc->bc_id)];
hlist_add_head(&bc->hash, slot);

+ spin_lock_init(&ab_hash_lock);
+ slot = &ab_hash[ab_hash_fn(ab->ab_id)];
+ hlist_add_head(&ab->hash, slot);
+
current->task_bc.exec_bc = get_beancounter(bc);
current->task_bc.fork_bc = get_beancounter(bc);
}
@@ -279,12 +467,18 @@ void __init bc_init_early(void)
void __init bc_init_late(void)
{
struct beancounter *bc;
+ struct aggr_beancounter *ab;

bc_cachep = kmem_cache_create("beancounters",
sizeof(struct beancounter), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL);

+ ab_cachep = kmem_cache_create("aggr_beancounters",
+ sizeof(struct aggr_beancounter), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL);
+
+ ab = &init_ab;
bc = &default_beancounter;
init_beancounter_syslimits(bc);
- init_beancounter_struct(bc, 0);
+ init_beancounter_struct(ab, bc, 0);
}
diff -puN kernel/bc/misc.c~add-aggr-bc kernel/bc/misc.c
--- linux-2.6.18-rc5/kernel/bc/misc.c~add-aggr-bc 2006-09-17 20:33:15.000000000 +0530
+++ linux-2.6.18-rc5-balbir/kernel/bc/misc.c 2006-09-17 20:33:15.000000000 +0530
@@ -16,7 +16,7 @@ void bc_task_charge(struct task_struct *
struct beancounter *bc;

new_bc = &new->task_bc;
- bc = beancounter_findcreate(new->tgid, BC_ALLOC);
+ bc = beancounter_create(parent->task_bc.exec_bc->ab, new->tgid);
if (!bc) {
printk(KERN_WARNING "failed to create bc %d for tgid %d\n",
bc->bc_id, new->tgid);
_

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs

2006-09-17 17:53:27

by Balbir Singh

[permalink] [raw]
Subject: [RFC][PATCH 1/4] Add a beancounter per tgid



Create one beancounter per thread group. This is the base patch and serves
as the starting point for aggregating thread groups.

Signed-off-by: Balbir Singh <[email protected]>
---

kernel/bc/misc.c | 11 +++++++----
kernel/fork.c | 4 ++--
2 files changed, 9 insertions(+), 6 deletions(-)

diff -puN kernel/bc/misc.c~per-tgid-beancounters kernel/bc/misc.c
--- linux-2.6.18-rc5/kernel/bc/misc.c~per-tgid-beancounters 2006-09-12 22:27:02.000000000 +0530
+++ linux-2.6.18-rc5-balbir/kernel/bc/misc.c 2006-09-17 20:30:55.000000000 +0530
@@ -12,15 +12,18 @@

void bc_task_charge(struct task_struct *parent, struct task_struct *new)
{
- struct task_beancounter *old_bc;
struct task_beancounter *new_bc;
struct beancounter *bc;

- old_bc = &parent->task_bc;
new_bc = &new->task_bc;
+ bc = beancounter_findcreate(new->tgid, BC_ALLOC);
+ if (!bc) {
+ printk(KERN_WARNING "failed to create bc %d for tgid %d\n",
+ new->tgid, new->tgid); /* bc is NULL here; the id requested was the tgid */
+ return;
+ }

- bc = old_bc->fork_bc;
- new_bc->exec_bc = get_beancounter(bc);
+ new_bc->exec_bc = bc;
new_bc->fork_bc = get_beancounter(bc);
}

diff -puN kernel/fork.c~per-tgid-beancounters kernel/fork.c
--- linux-2.6.18-rc5/kernel/fork.c~per-tgid-beancounters 2006-09-12 22:27:02.000000000 +0530
+++ linux-2.6.18-rc5-balbir/kernel/fork.c 2006-09-12 22:32:26.000000000 +0530
@@ -994,8 +994,6 @@ static struct task_struct *copy_process(
if (!p)
goto fork_out;

- bc_task_charge(current, p);
-
#ifdef CONFIG_TRACE_IRQFLAGS
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
@@ -1106,6 +1104,8 @@ static struct task_struct *copy_process(
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;

+ bc_task_charge(current, p);
+
if ((retval = security_task_alloc(p)))
goto bad_fork_cleanup_policy;
if ((retval = audit_alloc(p)))
_

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs

2006-09-17 17:53:54

by Balbir Singh

[permalink] [raw]
Subject: [RFC][PATCH 4/4] Aggregated beancounters syscall support



Add support for charging aggregated beancounters along with the beancounters
they contain. Limit checks are also done at the aggregated beancounter level.

Signed-off-by: Balbir Singh <[email protected]>
---

include/bc/beancounter.h | 4 +++
kernel/bc/beancounter.c | 51 +++++++++++++++++++++++++++++++++++++++++++----
kernel/bc/vmpages.c | 23 +++++++++++++++++++--
kernel/bc/vmrss.c | 13 +++++++++++
4 files changed, 84 insertions(+), 7 deletions(-)

diff -puN kernel/bc/beancounter.c~aggr-bc-charging-support kernel/bc/beancounter.c
--- linux-2.6.18-rc5/kernel/bc/beancounter.c~aggr-bc-charging-support 2006-09-17 20:34:48.000000000 +0530
+++ linux-2.6.18-rc5-balbir/kernel/bc/beancounter.c 2006-09-17 20:34:48.000000000 +0530
@@ -199,6 +199,7 @@ struct beancounter *beancounter_relocate
unsigned long flags;
struct beancounter *bc = NULL, *new_bc;
struct hlist_head *slot;
+ int i;

double_ab_lock(dst_ab, src_ab, &flags);
bc = beancounter_find_locked(src_ab, id);
@@ -217,10 +218,26 @@ struct beancounter *beancounter_relocate
}

spin_lock(&bc->bc_lock);
+
+ /*
+ * TODO: Support limit checking before relocation
+ */
+ for (i = 0; i < BC_RESOURCES; i++)
+ src_ab->ab_parms[i].held -= bc->bc_parms[i].held;
+ src_ab->unused_privvmpages -= bc->unused_privvmpages;
+ src_ab->rss_pages -= bc->rss_pages;
+
hlist_del(&bc->hash);
slot = &dst_ab->ab_bucket[bc_hash_fn(id)];
hlist_add_head(&bc->hash, slot);
bc->ab = dst_ab;
+
+ for (i = 0; i < BC_RESOURCES; i++)
+ dst_ab->ab_parms[i].held += bc->bc_parms[i].held;
+
+ dst_ab->unused_privvmpages += bc->unused_privvmpages;
+ dst_ab->rss_pages += bc->rss_pages;
+
spin_unlock(&bc->bc_lock);

out:
@@ -243,6 +260,14 @@ void put_aggr_beancounter(struct aggr_be
printk("AB: %d has %lu of %s held on put", ab->ab_id,
ab->ab_parms[i].held, bc_rnames[i]);

+ if (ab->unused_privvmpages != 0)
+ printk("AB: %d has %lu of unused pages held on put", ab->ab_id,
+ ab->unused_privvmpages);
+#ifdef CONFIG_BEANCOUNTERS_RSS
+ if (ab->rss_pages != 0)
+ printk("AB: %d hash %llu of rss pages held on put", ab->ab_id,
+ ab->rss_pages);
+#endif
hlist_del(&ab->hash);
nr_beancounters--;
spin_unlock_irqrestore(&ab_hash_lock, flags);
@@ -294,6 +319,7 @@ int bc_charge_locked(struct beancounter
enum bc_severity strict)
{
unsigned long new_held;
+ struct aggr_beancounter *ab = bc->ab;

/*
* bc_value <= BC_MAXVALUE, value <= BC_MAXVALUE, and only one addition
@@ -303,17 +329,18 @@ int bc_charge_locked(struct beancounter

switch (strict) {
case BC_BARRIER:
- if (bc->bc_parms[resource].held >
- bc->bc_parms[resource].barrier)
+ if (ab->ab_parms[resource].held >
+ ab->ab_parms[resource].barrier)
break;
/* fallthrough */
case BC_LIMIT:
- if (bc->bc_parms[resource].held >
- bc->bc_parms[resource].limit)
+ if (ab->ab_parms[resource].held >
+ ab->ab_parms[resource].limit)
break;
/* fallthrough */
case BC_FORCE:
bc->bc_parms[resource].held = new_held;
+ ab->ab_parms[resource].held += val;
bc_adjust_maxheld(bc, resource);
return 0;

@@ -344,6 +371,19 @@ EXPORT_SYMBOL_GPL(bc_charge);
/* called with bc->bc_lock held and interrupts disabled */
void bc_uncharge_locked(struct beancounter *bc, int resource, unsigned long val)
{
+ struct aggr_beancounter *ab = bc->ab;
+ unsigned long val2 = val;
+
+ if (unlikely(ab->ab_parms[resource].held < val)) {
+ if (printk_ratelimit()) {
+ printk("AB: overuncharging ab %d %s: val %lu, holds "
+ "%lu\n", ab->ab_id, bc_rnames[resource], val,
+ ab->ab_parms[resource].held);
+ dump_stack();
+ }
+ val2 = ab->ab_parms[resource].held;
+ }
+
if (unlikely(bc->bc_parms[resource].held < val)) {
if (printk_ratelimit()) {
printk("BC: overuncharging bc %d %s: val %lu, holds "
@@ -355,6 +395,7 @@ void bc_uncharge_locked(struct beancount
}

bc->bc_parms[resource].held -= val;
+ ab->ab_parms[resource].held -= val2;
bc_adjust_minheld(bc, resource);
}
EXPORT_SYMBOL_GPL(bc_uncharge_locked);
@@ -404,6 +445,8 @@ static void init_aggr_beancounter_struct
for (i = 0; i < AB_HASH_SIZE; i++)
INIT_HLIST_HEAD(&ab->ab_bucket[i]);
INIT_HLIST_NODE(&ab->hash);
+ ab->unused_privvmpages = 0;
+ ab->rss_pages = 0;
}

static void init_aggr_beancounter_parm_nolimits(struct aggr_beancounter *ab)
diff -puN kernel/bc/vmpages.c~aggr-bc-charging-support kernel/bc/vmpages.c
--- linux-2.6.18-rc5/kernel/bc/vmpages.c~aggr-bc-charging-support 2006-09-17 20:34:48.000000000 +0530
+++ linux-2.6.18-rc5-balbir/kernel/bc/vmpages.c 2006-09-17 20:34:48.000000000 +0530
@@ -17,6 +17,13 @@

void bc_update_privvmpages(struct beancounter *bc)
{
+ struct aggr_beancounter *ab = bc->ab;
+
+ ab->ab_parms[BC_PRIVVMPAGES].held = ab->unused_privvmpages
+#ifdef CONFIG_BEANCOUNTERS_RSS
+ + (ab->rss_pages >> PB_PAGE_WEIGHT_SHIFT)
+#endif
+ ;
bc->bc_parms[BC_PRIVVMPAGES].held = bc->unused_privvmpages
#ifdef CONFIG_BEANCOUNTERS_RSS
+ (bc->rss_pages >> PB_PAGE_WEIGHT_SHIFT)
@@ -33,17 +40,29 @@ static inline int privvm_charge(struct b
return -ENOMEM;

bc->unused_privvmpages += sz;
+ bc->ab->unused_privvmpages += sz;
return 0;
}

static inline void privvm_uncharge(struct beancounter *bc, unsigned long sz)
{
+ unsigned long sz2 = sz; /* amount taken off the aggregate; clamped on its own below */
+ struct aggr_beancounter *ab = bc->ab;
+
if (unlikely(bc->unused_privvmpages < sz)) {
- printk("BC: overuncharging %d unused pages: val %lu held %lu\n",
- bc->bc_id, sz, bc->unused_privvmpages);
+ printk("privvm_uncharge: BC: overuncharging %d unused pages: "
+ " val %lu held %lu\n", bc->bc_id, sz,
+ bc->unused_privvmpages);
sz = bc->unused_privvmpages;
}
+ if (unlikely(ab->unused_privvmpages < sz2)) {
+ printk("privvm_uncharge: AB: overuncharging %d unused pages: "
+ "val %lu held %lu\n", ab->ab_id, sz2,
+ ab->unused_privvmpages);
+ sz2 = ab->unused_privvmpages;
+ }
bc->unused_privvmpages -= sz;
+ ab->unused_privvmpages -= sz2; /* the aggregate's own clamped amount, so it cannot underflow */
bc_update_privvmpages(bc);
}

diff -puN include/bc/beancounter.h~aggr-bc-charging-support include/bc/beancounter.h
--- linux-2.6.18-rc5/include/bc/beancounter.h~aggr-bc-charging-support 2006-09-17 20:34:48.000000000 +0530
+++ linux-2.6.18-rc5-balbir/include/bc/beancounter.h 2006-09-17 20:34:48.000000000 +0530
@@ -77,6 +77,10 @@ struct aggr_beancounter {
spinlock_t ab_lock;
struct hlist_node hash;
bcid_t ab_id;
+ unsigned long unused_privvmpages;
+#ifdef CONFIG_BEANCOUNTERS_RSS
+ unsigned long long rss_pages;
+#endif
};

/*
diff -puN kernel/bc/vmrss.c~aggr-bc-charging-support kernel/bc/vmrss.c
--- linux-2.6.18-rc5/kernel/bc/vmrss.c~aggr-bc-charging-support 2006-09-17 20:34:48.000000000 +0530
+++ linux-2.6.18-rc5-balbir/kernel/bc/vmrss.c 2006-09-17 20:34:48.000000000 +0530
@@ -154,19 +154,30 @@ static void mod_rss_pages(struct beancou
struct vm_area_struct *vma, int unused)
{
unsigned long flags;
+ int unused2 = unused;

spin_lock_irqsave(&bc->bc_lock, flags);
if (vma && BC_VM_PRIVATE(vma->vm_flags, vma->vm_file)) {
if (unused < 0 && unlikely(bc->unused_privvmpages < -unused)) {
- printk("BC: overuncharging %d unused pages: "
+ printk("mod_rss: BC: overuncharging %d unused pages: "
"val %i, held %lu\n",
bc->bc_id, unused,
bc->unused_privvmpages);
unused = -bc->unused_privvmpages;
}
+ if (unused < 0 && unlikely(bc->ab->unused_privvmpages <
+ -unused)) {
+ printk("mod_rss: AB: overuncharging %d unused pages: "
+ "val %i, held %lu\n",
+ bc->ab->ab_id, unused,
+ bc->ab->unused_privvmpages);
+ unused2 = -bc->ab->unused_privvmpages;
+ }
bc->unused_privvmpages += unused;
+ bc->ab->unused_privvmpages += unused2;
}
bc->rss_pages += val;
+ bc->ab->rss_pages += val;
bc_update_privvmpages(bc);
spin_unlock_irqrestore(&bc->bc_lock, flags);
}
_

--

Balbir Singh,
Linux Technology Center,
IBM Software Labs