2009-04-16 12:28:53

by Ryo Tsuruta

[permalink] [raw]
Subject: [PATCH 1/5] bio-cgroup: Introduction

Hi all,

This is a new release of bio-cgroup which provides an IO tracking
mechanism. The patches can be applied to the kernel 2.6.30-rc1 and you
can also download them from the following site.
http://people.valinux.co.jp/~ryov/bio-cgroup/

What's bio-cgroup all about?
============================

With this feature, you can determine the owner of any type of I/O.
This enables dm-ioband (an I/O bandwidth controller) to control block
I/O bandwidth even when it accepts delayed write requests, because
dm-ioband can find the owner cgroup of each request. Other people who
work on I/O bandwidth throttling can also use this functionality to
control asynchronous I/O with a little enhancement.

Setting up bio-cgroup
=====================

You have to apply the patch dm-ioband_v1.9.0 before applying this
series of bio-cgroup patches.
You also have to select the following config options when compiling
the kernel.

CONFIG_CGROUPS=y
CONFIG_CGROUP_BIO=y

I also recommend selecting the options for the cgroup memory
subsystem. This makes it possible to give both some I/O bandwidth and
some memory to a certain cgroup in order to control delayed write
requests: the processes in the cgroup can then dirty pages only
inside the cgroup, even when the given bandwidth is narrow.

CONFIG_RESOURCE_COUNTERS=y
CONFIG_CGROUP_MEM_RES_CTLR=y

Using bio-cgroup
================

The following shows how to use dm-ioband with cgroups. Assume that
you want to make two cgroups, which we call "bio cgroups" here, to
track block I/Os, and assign them to the ioband device "ioband1".

First, mount the bio cgroup filesystem.

# mount -t cgroup -o bio none /cgroup/bio

Then, make new bio cgroups and put some processes in them.

# mkdir /cgroup/bio/bgroup1
# mkdir /cgroup/bio/bgroup2
# echo 1234 > /cgroup/bio/bgroup1/tasks
# echo 5678 > /cgroup/bio/bgroup2/tasks

Now, check the ID of each bio cgroup that was just created.

# cat /cgroup/bio/bgroup1/bio.id
2
# cat /cgroup/bio/bgroup2/bio.id
3

Finally, attach the cgroups to "ioband1" and assign them weights.

# dmsetup message ioband1 0 type cgroup
# dmsetup message ioband1 0 attach 2
# dmsetup message ioband1 0 attach 3
# dmsetup message ioband1 0 weight 2:30
# dmsetup message ioband1 0 weight 3:60

You can also make use of the dm-ioband administration tool
iobandctl.py. You can set up the device with the tool as follows.
In this case, you don't need to know the IDs of the cgroups.

# iobandctl.py group /dev/mapper/ioband1 cgroup \
/cgroup/bio/bgroup1:30 /cgroup/bio/bgroup2:60


2009-04-16 12:29:43

by Ryo Tsuruta

[permalink] [raw]
Subject: [PATCH 2/5] bio-cgroup: The new page_cgroup framework

This patch makes the page_cgroup framework usable even when the
compile option for the cgroup memory controller is off, so that
bio-cgroup can use this framework without the memory controller.

Based on 2.6.30-rc1
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>

---
include/linux/memcontrol.h | 6 ++++++
include/linux/mmzone.h | 4 ++--
include/linux/page_cgroup.h | 8 +++++---
init/Kconfig | 4 ++++
mm/Makefile | 3 ++-
mm/memcontrol.c | 6 ++++++
mm/page_cgroup.c | 3 +--
7 files changed, 26 insertions(+), 8 deletions(-)

Index: linux-2.6.30-rc1/include/linux/memcontrol.h
===================================================================
--- linux-2.6.30-rc1.orig/include/linux/memcontrol.h
+++ linux-2.6.30-rc1/include/linux/memcontrol.h
@@ -37,6 +37,8 @@ struct mm_struct;
* (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
*/

+extern void __init_mem_page_cgroup(struct page_cgroup *pc);
+
extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask);
/* for swap handling */
@@ -120,6 +122,10 @@ extern bool mem_cgroup_oom_called(struct
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct mem_cgroup;

+static inline void __init_mem_page_cgroup(struct page_cgroup *pc)
+{
+}
+
static inline int mem_cgroup_newpage_charge(struct page *page,
struct mm_struct *mm, gfp_t gfp_mask)
{
Index: linux-2.6.30-rc1/include/linux/mmzone.h
===================================================================
--- linux-2.6.30-rc1.orig/include/linux/mmzone.h
+++ linux-2.6.30-rc1/include/linux/mmzone.h
@@ -607,7 +607,7 @@ typedef struct pglist_data {
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
struct page_cgroup *node_page_cgroup;
#endif
#endif
@@ -958,7 +958,7 @@ struct mem_section {

/* See declaration of similar field in struct zone */
unsigned long *pageblock_flags;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
/*
* If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
* section. (see memcontrol.h/page_cgroup.h about this.)
Index: linux-2.6.30-rc1/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.30-rc1.orig/include/linux/page_cgroup.h
+++ linux-2.6.30-rc1/include/linux/page_cgroup.h
@@ -1,7 +1,7 @@
#ifndef __LINUX_PAGE_CGROUP_H
#define __LINUX_PAGE_CGROUP_H

-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
#include <linux/bit_spinlock.h>
/*
* Page Cgroup can be considered as an extended mem_map.
@@ -12,9 +12,11 @@
*/
struct page_cgroup {
unsigned long flags;
- struct mem_cgroup *mem_cgroup;
struct page *page;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+ struct mem_cgroup *mem_cgroup;
struct list_head lru; /* per cgroup LRU list */
+#endif
};

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
@@ -71,7 +73,7 @@ static inline void unlock_page_cgroup(st
bit_spin_unlock(PCG_LOCK, &pc->flags);
}

-#else /* CONFIG_CGROUP_MEM_RES_CTLR */
+#else /* CONFIG_CGROUP_PAGE */
struct page_cgroup;

static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
Index: linux-2.6.30-rc1/init/Kconfig
===================================================================
--- linux-2.6.30-rc1.orig/init/Kconfig
+++ linux-2.6.30-rc1/init/Kconfig
@@ -608,6 +608,10 @@ config CGROUP_MEM_RES_CTLR_SWAP

endif # CGROUPS

+config CGROUP_PAGE
+ def_bool y
+ depends on CGROUP_MEM_RES_CTLR
+
config MM_OWNER
bool

Index: linux-2.6.30-rc1/mm/Makefile
===================================================================
--- linux-2.6.30-rc1.orig/mm/Makefile
+++ linux-2.6.30-rc1/mm/Makefile
@@ -37,4 +37,5 @@ else
obj-$(CONFIG_SMP) += allocpercpu.o
endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o
Index: linux-2.6.30-rc1/mm/memcontrol.c
===================================================================
--- linux-2.6.30-rc1.orig/mm/memcontrol.c
+++ linux-2.6.30-rc1/mm/memcontrol.c
@@ -128,6 +128,12 @@ struct mem_cgroup_lru_info {
struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

+void __meminit __init_mem_page_cgroup(struct page_cgroup *pc)
+{
+ pc->mem_cgroup = NULL;
+ INIT_LIST_HEAD(&pc->lru);
+}
+
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
Index: linux-2.6.30-rc1/mm/page_cgroup.c
===================================================================
--- linux-2.6.30-rc1.orig/mm/page_cgroup.c
+++ linux-2.6.30-rc1/mm/page_cgroup.c
@@ -14,9 +14,8 @@ static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
pc->flags = 0;
- pc->mem_cgroup = NULL;
pc->page = pfn_to_page(pfn);
- INIT_LIST_HEAD(&pc->lru);
+ __init_mem_page_cgroup(pc);
}
static unsigned long total_usage;

2009-04-16 12:30:32

by Ryo Tsuruta

[permalink] [raw]
Subject: [PATCH 3/5] bio-cgroup: The body of bio-cgroup

This is the body of bio-cgroup.

Based on 2.6.30-rc1
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>

---
block/blk-ioc.c | 30 ++---
include/linux/biotrack.h | 83 ++++++++++++++
include/linux/cgroup_subsys.h | 6 +
include/linux/iocontext.h | 1
include/linux/page_cgroup.h | 3
init/Kconfig | 13 ++
mm/biotrack.c | 244 ++++++++++++++++++++++++++++++++++++++++++
mm/page_cgroup.c | 12 +-
8 files changed, 373 insertions(+), 19 deletions(-)

Index: linux-2.6.30-rc1/block/blk-ioc.c
===================================================================
--- linux-2.6.30-rc1.orig/block/blk-ioc.c
+++ linux-2.6.30-rc1/block/blk-ioc.c
@@ -84,24 +84,28 @@ void exit_io_context(void)
}
}

+void init_io_context(struct io_context *ioc)
+{
+ atomic_set(&ioc->refcount, 1);
+ atomic_set(&ioc->nr_tasks, 1);
+ spin_lock_init(&ioc->lock);
+ ioc->ioprio_changed = 0;
+ ioc->ioprio = 0;
+ ioc->last_waited = jiffies; /* doesn't matter... */
+ ioc->nr_batch_requests = 0; /* because this is 0 */
+ ioc->aic = NULL;
+ INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
+ INIT_HLIST_HEAD(&ioc->cic_list);
+ ioc->ioc_data = NULL;
+}
+
struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
{
struct io_context *ret;

ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
- if (ret) {
- atomic_set(&ret->refcount, 1);
- atomic_set(&ret->nr_tasks, 1);
- spin_lock_init(&ret->lock);
- ret->ioprio_changed = 0;
- ret->ioprio = 0;
- ret->last_waited = jiffies; /* doesn't matter... */
- ret->nr_batch_requests = 0; /* because this is 0 */
- ret->aic = NULL;
- INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
- INIT_HLIST_HEAD(&ret->cic_list);
- ret->ioc_data = NULL;
- }
+ if (ret)
+ init_io_context(ret);

return ret;
}
Index: linux-2.6.30-rc1/include/linux/biotrack.h
===================================================================
--- /dev/null
+++ linux-2.6.30-rc1/include/linux/biotrack.h
@@ -0,0 +1,83 @@
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/page_cgroup.h>
+
+#ifndef _LINUX_BIOTRACK_H
+#define _LINUX_BIOTRACK_H
+
+#ifdef CONFIG_CGROUP_BIO
+
+struct io_context;
+struct block_device;
+
+struct bio_cgroup {
+ struct cgroup_subsys_state css;
+ struct io_context *io_context; /* default io_context */
+/* struct radix_tree_root io_context_root; per device io_context */
+};
+
+static inline void __init_bio_page_cgroup(struct page_cgroup *pc)
+{
+ pc->bio_cgroup_id = 0;
+}
+
+static inline bool bio_cgroup_disabled(void)
+{
+ if (bio_cgroup_subsys.disabled)
+ return true;
+ return false;
+}
+
+extern void bio_cgroup_set_owner(struct page *page, struct mm_struct *mm);
+extern void bio_cgroup_reset_owner(struct page *page, struct mm_struct *mm);
+extern void bio_cgroup_reset_owner_pagedirty(struct page *page,
+ struct mm_struct *mm);
+extern void bio_cgroup_copy_owner(struct page *page, struct page *opage);
+
+extern struct io_context *get_bio_cgroup_iocontext(struct bio *bio);
+extern int get_bio_cgroup_id(struct bio *bio);
+
+#else /* CONFIG_CGROUP_BIO */
+
+struct bio_cgroup;
+
+static inline void __init_bio_page_cgroup(struct page_cgroup *pc)
+{
+}
+
+static inline bool bio_cgroup_disabled(void)
+{
+ return true;
+}
+
+static inline void bio_cgroup_set_owner(struct page *page, struct mm_struct *mm)
+{
+}
+
+static inline void bio_cgroup_reset_owner(struct page *page,
+ struct mm_struct *mm)
+{
+}
+
+static inline void bio_cgroup_reset_owner_pagedirty(struct page *page,
+ struct mm_struct *mm)
+{
+}
+
+static inline void bio_cgroup_copy_owner(struct page *page, struct page *opage)
+{
+}
+
+static inline struct io_context *get_bio_cgroup_iocontext(struct bio *bio)
+{
+ return NULL;
+}
+
+static inline int get_bio_cgroup_id(struct bio *bio)
+{
+ return 0;
+}
+
+#endif /* CONFIG_CGROUP_BIO */
+
+#endif /* _LINUX_BIOTRACK_H */
Index: linux-2.6.30-rc1/include/linux/cgroup_subsys.h
===================================================================
--- linux-2.6.30-rc1.orig/include/linux/cgroup_subsys.h
+++ linux-2.6.30-rc1/include/linux/cgroup_subsys.h
@@ -43,6 +43,12 @@ SUBSYS(mem_cgroup)

/* */

+#ifdef CONFIG_CGROUP_BIO
+SUBSYS(bio_cgroup)
+#endif
+
+/* */
+
#ifdef CONFIG_CGROUP_DEVICE
SUBSYS(devices)
#endif
Index: linux-2.6.30-rc1/include/linux/iocontext.h
===================================================================
--- linux-2.6.30-rc1.orig/include/linux/iocontext.h
+++ linux-2.6.30-rc1/include/linux/iocontext.h
@@ -104,6 +104,7 @@ int put_io_context(struct io_context *io
void exit_io_context(void);
struct io_context *get_io_context(gfp_t gfp_flags, int node);
struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+void init_io_context(struct io_context *ioc);
void copy_io_context(struct io_context **pdst, struct io_context **psrc);
#else
static inline void exit_io_context(void)
Index: linux-2.6.30-rc1/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.30-rc1.orig/include/linux/page_cgroup.h
+++ linux-2.6.30-rc1/include/linux/page_cgroup.h
@@ -17,6 +17,9 @@ struct page_cgroup {
struct mem_cgroup *mem_cgroup;
struct list_head lru; /* per cgroup LRU list */
#endif
+#ifdef CONFIG_CGROUP_BIO
+ unsigned short bio_cgroup_id;
+#endif
};

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
Index: linux-2.6.30-rc1/init/Kconfig
===================================================================
--- linux-2.6.30-rc1.orig/init/Kconfig
+++ linux-2.6.30-rc1/init/Kconfig
@@ -608,9 +608,20 @@ config CGROUP_MEM_RES_CTLR_SWAP

endif # CGROUPS

+config CGROUP_BIO
+ bool "Block I/O cgroup subsystem"
+ depends on CGROUPS && BLOCK
+ select MM_OWNER
+ help
+ Provides a Resource Controller which enables to track the onwner
+ of every Block I/O requests.
+ The information this subsystem provides can be used from any
+ kind of module such as dm-ioband device mapper modules or
+ the cfq-scheduler.
+
config CGROUP_PAGE
def_bool y
- depends on CGROUP_MEM_RES_CTLR
+ depends on CGROUP_MEM_RES_CTLR || CGROUP_BIO

config MM_OWNER
bool
Index: linux-2.6.30-rc1/mm/biotrack.c
===================================================================
--- /dev/null
+++ linux-2.6.30-rc1/mm/biotrack.c
@@ -0,0 +1,244 @@
+/* biotrack.c - Block I/O Tracking
+ *
+ * Copyright (C) VA Linux Systems Japan, 2008-2009
+ * Developed by Hirokazu Takahashi <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/bit_spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/biotrack.h>
+
+/*
+ * The block I/O tracking mechanism is implemented on the cgroup memory
+ * controller framework. It helps to find the owner of an I/O request
+ * because every I/O request has a target page and the owner of the page
+ * can be easily determined on the framework.
+ */
+
+/* Return the bio_cgroup that associates with a cgroup. */
+static inline struct bio_cgroup *cgroup_bio(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, bio_cgroup_subsys_id),
+ struct bio_cgroup, css);
+}
+
+/* Return the bio_cgroup that associates with a process. */
+static inline struct bio_cgroup *bio_cgroup_from_task(struct task_struct *p)
+{
+ return container_of(task_subsys_state(p, bio_cgroup_subsys_id),
+ struct bio_cgroup, css);
+}
+
+static struct io_context default_bio_io_context;
+static struct bio_cgroup default_bio_cgroup = {
+ .io_context = &default_bio_io_context,
+};
+
+/*
+ * This function is used to make a given page have the bio-cgroup id of
+ * the owner of this page.
+ */
+void bio_cgroup_set_owner(struct page *page, struct mm_struct *mm)
+{
+ struct bio_cgroup *biog;
+ struct page_cgroup *pc;
+
+ if (bio_cgroup_disabled())
+ return;
+ pc = lookup_page_cgroup(page);
+ if (unlikely(!pc))
+ return;
+
+ pc->bio_cgroup_id = 0; /* 0: default bio_cgroup id */
+ if (!mm)
+ return;
+ /*
+ * Locking "pc" isn't necessary here since the current process is
+ * the only one that can access the members related to bio_cgroup.
+ */
+ rcu_read_lock();
+ biog = bio_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!biog))
+ goto out;
+ /*
+ * css_get(&biog->css) isn't called to increment the reference
+ * count of this bio_cgroup "biog" so pc->bio_cgroup_id might turn
+ * invalid even if this page is still active.
+ * This approach is chosen to minimize the overhead.
+ */
+ pc->bio_cgroup_id = css_id(&biog->css);
+out:
+ rcu_read_unlock();
+}
+
+/*
+ * Change the owner of a given page if necessary.
+ */
+void bio_cgroup_reset_owner(struct page *page, struct mm_struct *mm)
+{
+ /*
+ * A little trick:
+ * Just call bio_cgroup_set_owner() for pages which are already
+ * active since the bio_cgroup_id member of page_cgroup can be
+ * updated without any locks. This is because an integer type of
+ * variable can be set a new value at once on modern cpus.
+ */
+ bio_cgroup_set_owner(page, mm);
+}
+
+/*
+ * Change the owner of a given page. This function is only effective for
+ * pages in the pagecache.
+ */
+void bio_cgroup_reset_owner_pagedirty(struct page *page, struct mm_struct *mm)
+{
+ if (PageSwapCache(page) || PageAnon(page))
+ return;
+ if (current->flags & PF_MEMALLOC)
+ return;
+
+ bio_cgroup_reset_owner(page, mm);
+}
+
+/*
+ * Assign "npage" the same owner as "opage."
+ */
+void bio_cgroup_copy_owner(struct page *npage, struct page *opage)
+{
+ struct page_cgroup *npc, *opc;
+
+ if (bio_cgroup_disabled())
+ return;
+ npc = lookup_page_cgroup(npage);
+ if (unlikely(!npc))
+ return;
+ opc = lookup_page_cgroup(opage);
+ if (unlikely(!opc))
+ return;
+
+ /*
+ * Do this without any locks. The reason is the same as
+ * bio_cgroup_reset_owner().
+ */
+ npc->bio_cgroup_id = opc->bio_cgroup_id;
+}
+
+/* Create a new bio-cgroup. */
+static struct cgroup_subsys_state *
+bio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog;
+ struct io_context *ioc;
+
+ if (!cgrp->parent) {
+ biog = &default_bio_cgroup;
+ init_io_context(biog->io_context);
+ /* Increment the reference count so that it is never released. */
+ atomic_inc(&biog->io_context->refcount);
+ return &biog->css;
+ }
+
+ biog = kzalloc(sizeof(*biog), GFP_KERNEL);
+ if (!biog)
+ return ERR_PTR(-ENOMEM);
+ ioc = alloc_io_context(GFP_KERNEL, -1);
+ if (!ioc) {
+ kfree(biog);
+ return ERR_PTR(-ENOMEM);
+ }
+ biog->io_context = ioc;
+ return &biog->css;
+}
+
+/* Delete the bio-cgroup. */
+static void bio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+
+ put_io_context(biog->io_context);
+ free_css_id(&bio_cgroup_subsys, &biog->css);
+ kfree(biog);
+}
+
+/* Determine the bio-cgroup id of a given bio. */
+int get_bio_cgroup_id(struct bio *bio)
+{
+ struct page_cgroup *pc;
+ struct page *page = bio_iovec_idx(bio, 0)->bv_page;
+ int id = 0;
+
+ pc = lookup_page_cgroup(page);
+ if (pc)
+ id = pc->bio_cgroup_id;
+ return id;
+}
+
+/* Determine the iocontext of the bio-cgroup that issued a given bio. */
+struct io_context *get_bio_cgroup_iocontext(struct bio *bio)
+{
+ struct cgroup_subsys_state *css;
+ struct bio_cgroup *biog = NULL;
+ struct io_context *ioc;
+ int id = 0;
+
+ id = get_bio_cgroup_id(bio);
+
+ rcu_read_lock();
+ css = css_lookup(&bio_cgroup_subsys, id);
+ if (css)
+ biog = container_of(css, struct bio_cgroup, css);
+ else
+ biog = &default_bio_cgroup;
+ rcu_read_unlock();
+
+ ioc = biog->io_context; /* default io_context for this cgroup */
+ atomic_inc(&ioc->refcount);
+ return ioc;
+}
+EXPORT_SYMBOL(get_bio_cgroup_iocontext);
+EXPORT_SYMBOL(get_bio_cgroup_id);
+
+static u64 bio_id_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct bio_cgroup *biog = cgroup_bio(cgrp);
+ int id;
+
+ rcu_read_lock();
+ id = css_id(&biog->css);
+ rcu_read_unlock();
+ return (u64)id;
+}
+
+
+static struct cftype bio_files[] = {
+ {
+ .name = "id",
+ .read_u64 = bio_id_read,
+ },
+};
+
+static int bio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ return cgroup_add_files(cgrp, ss, bio_files, ARRAY_SIZE(bio_files));
+}
+
+struct cgroup_subsys bio_cgroup_subsys = {
+ .name = "bio",
+ .create = bio_cgroup_create,
+ .destroy = bio_cgroup_destroy,
+ .populate = bio_cgroup_populate,
+ .subsys_id = bio_cgroup_subsys_id,
+ .use_id = 1,
+};
Index: linux-2.6.30-rc1/mm/page_cgroup.c
===================================================================
--- linux-2.6.30-rc1.orig/mm/page_cgroup.c
+++ linux-2.6.30-rc1/mm/page_cgroup.c
@@ -9,6 +9,7 @@
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
+#include <linux/biotrack.h>

static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -16,6 +17,7 @@ __init_page_cgroup(struct page_cgroup *p
pc->flags = 0;
pc->page = pfn_to_page(pfn);
__init_mem_page_cgroup(pc);
+ __init_bio_page_cgroup(pc);
}
static unsigned long total_usage;

@@ -73,7 +75,7 @@ void __init page_cgroup_init(void)

int nid, fail;

- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled() && bio_cgroup_disabled())
return;

for_each_online_node(nid) {
@@ -82,12 +84,12 @@ void __init page_cgroup_init(void)
goto fail;
}
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try cgroup_disable=memory option if you"
+ printk(KERN_INFO "please try cgroup_disable=memory,bio option if you"
" don't want\n");
return;
fail:
printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
- printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
+ printk(KERN_CRIT "please try cgroup_disable=memory,bio boot options\n");
panic("Out of memory");
}

@@ -247,7 +249,7 @@ void __init page_cgroup_init(void)
unsigned long pfn;
int fail = 0;

- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled() && bio_cgroup_disabled())
return;

for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
@@ -262,7 +264,7 @@ void __init page_cgroup_init(void)
hotplug_memory_notifier(page_cgroup_callback, 0);
}
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
+ printk(KERN_INFO "please try cgroup_disable=memory,bio option if you don't"
" want\n");
}

2009-04-16 12:31:21

by Ryo Tsuruta

[permalink] [raw]
Subject: [PATCH 4/5] bio-cgroup: Page tracking hooks

This patch adds several hooks that let the bio-cgroup framework know
which bio-cgroup is the owner of a page before I/O is started against the page.

Based on 2.6.30-rc1
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>

---
fs/buffer.c | 2 ++
fs/direct-io.c | 2 ++
mm/Makefile | 1 +
mm/bounce.c | 2 ++
mm/filemap.c | 2 ++
mm/memory.c | 5 +++++
mm/page-writeback.c | 2 ++
mm/swap_state.c | 2 ++
8 files changed, 18 insertions(+)

Index: linux-2.6.30-rc1/fs/buffer.c
===================================================================
--- linux-2.6.30-rc1.orig/fs/buffer.c
+++ linux-2.6.30-rc1/fs/buffer.c
@@ -36,6 +36,7 @@
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
+#include <linux/biotrack.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
@@ -655,6 +656,7 @@ static void __set_page_dirty(struct page
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
+ bio_cgroup_reset_owner_pagedirty(page, current->mm);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
Index: linux-2.6.30-rc1/fs/direct-io.c
===================================================================
--- linux-2.6.30-rc1.orig/fs/direct-io.c
+++ linux-2.6.30-rc1/fs/direct-io.c
@@ -33,6 +33,7 @@
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
+#include <linux/biotrack.h>
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <asm/atomic.h>
@@ -799,6 +800,7 @@ static int do_direct_IO(struct dio *dio)
ret = PTR_ERR(page);
goto out;
}
+ bio_cgroup_reset_owner(page, current->mm);

while (block_in_page < blocks_per_page) {
unsigned offset_in_page = block_in_page << blkbits;
Index: linux-2.6.30-rc1/mm/bounce.c
===================================================================
--- linux-2.6.30-rc1.orig/mm/bounce.c
+++ linux-2.6.30-rc1/mm/bounce.c
@@ -14,6 +14,7 @@
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/blktrace_api.h>
+#include <linux/biotrack.h>
#include <trace/block.h>
#include <asm/tlbflush.h>

@@ -212,6 +213,7 @@ static void __blk_queue_bounce(struct re
to->bv_len = from->bv_len;
to->bv_offset = from->bv_offset;
inc_zone_page_state(to->bv_page, NR_BOUNCE);
+ bio_cgroup_copy_owner(to->bv_page, page);

if (rw == WRITE) {
char *vto, *vfrom;
Index: linux-2.6.30-rc1/mm/filemap.c
===================================================================
--- linux-2.6.30-rc1.orig/mm/filemap.c
+++ linux-2.6.30-rc1/mm/filemap.c
@@ -33,6 +33,7 @@
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
+#include <linux/biotrack.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
#include "internal.h"

@@ -463,6 +464,7 @@ int add_to_page_cache_locked(struct page
gfp_mask & GFP_RECLAIM_MASK);
if (error)
goto out;
+ bio_cgroup_set_owner(page, current->mm);

error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
Index: linux-2.6.30-rc1/mm/Makefile
===================================================================
--- linux-2.6.30-rc1.orig/mm/Makefile
+++ linux-2.6.30-rc1/mm/Makefile
@@ -39,3 +39,4 @@ endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o
+obj-$(CONFIG_CGROUP_BIO) += biotrack.o
Index: linux-2.6.30-rc1/mm/memory.c
===================================================================
--- linux-2.6.30-rc1.orig/mm/memory.c
+++ linux-2.6.30-rc1/mm/memory.c
@@ -51,6 +51,7 @@
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
+#include <linux/biotrack.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
@@ -2053,6 +2054,7 @@ gotten:
*/
ptep_clear_flush_notify(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
+ bio_cgroup_set_owner(new_page, mm);
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
if (old_page) {
@@ -2497,6 +2499,7 @@ static int do_swap_page(struct mm_struct
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
page_add_anon_rmap(page, vma, address);
+ bio_cgroup_reset_owner(page, mm);
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin(page, ptr);

@@ -2560,6 +2563,7 @@ static int do_anonymous_page(struct mm_s
goto release;
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);
+ bio_cgroup_set_owner(page, mm);
set_pte_at(mm, address, page_table, entry);

/* No need to invalidate - it was non-present before */
@@ -2712,6 +2716,7 @@ static int __do_fault(struct mm_struct *
if (anon) {
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);
+ bio_cgroup_set_owner(page, mm);
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(page);
Index: linux-2.6.30-rc1/mm/page-writeback.c
===================================================================
--- linux-2.6.30-rc1.orig/mm/page-writeback.c
+++ linux-2.6.30-rc1/mm/page-writeback.c
@@ -23,6 +23,7 @@
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/biotrack.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
@@ -1243,6 +1244,7 @@ int __set_page_dirty_nobuffers(struct pa
BUG_ON(mapping2 != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
+ bio_cgroup_reset_owner_pagedirty(page, current->mm);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
Index: linux-2.6.30-rc1/mm/swap_state.c
===================================================================
--- linux-2.6.30-rc1.orig/mm/swap_state.c
+++ linux-2.6.30-rc1/mm/swap_state.c
@@ -18,6 +18,7 @@
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/page_cgroup.h>
+#include <linux/biotrack.h>

#include <asm/pgtable.h>

@@ -308,6 +309,7 @@ struct page *read_swap_cache_async(swp_e
*/
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
+ bio_cgroup_set_owner(new_page, current->mm);
err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
if (likely(!err)) {
/*

2009-04-16 12:32:26

by Ryo Tsuruta

[permalink] [raw]
Subject: [PATCH 5/5] bio-cgroup: Add a cgroup support to dm-ioband

With this patch, dm-ioband can work with the bio cgroup.

Based on 2.6.30-rc1
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>

---
drivers/md/dm-ioband-type.c | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)

Index: linux-2.6.30-rc1/drivers/md/dm-ioband-type.c
===================================================================
--- linux-2.6.30-rc1.orig/drivers/md/dm-ioband-type.c
+++ linux-2.6.30-rc1/drivers/md/dm-ioband-type.c
@@ -6,6 +6,7 @@
* This file is released under the GPL.
*/
#include <linux/bio.h>
+#include <linux/biotrack.h>
#include "dm.h"
#include "dm-bio-list.h"
#include "dm-ioband.h"
@@ -53,14 +54,7 @@ static int ioband_node(struct bio *bio)

static int ioband_cgroup(struct bio *bio)
{
- /*
- * This function should return the ID of the cgroup which
- * issued "bio". The ID of the cgroup which the current
- * process belongs to won't be suitable ID for this purpose,
- * since some BIOs will be handled by kernel threads like aio
- * or pdflush on behalf of the process requesting the BIOs.
- */
- return 0; /* not implemented yet */
+ return get_bio_cgroup_id(bio);
}

struct group_type dm_ioband_group_type[] = {