Hi all,
This is a new release of blkio-cgroup v10. This release reduces
IO tracking overhead and fixes an issue that could cause a deadlock
since lock_page_cgroup() is no longer used.
Thank you KAMEZAWA-san for your suggestions and pointing out the issue.
This patch can be applied to 2.6.31-rc3-mmotm0716 and 2.6.31-rc4.
The list of the patches:
[PATCH 1/7] blkio-cgroup-v10: Introduction
[PATCH 2/7] blkio-cgroup-v10: The new page_cgroup framework
[PATCH 3/7] blkio-cgroup-v10: Refactoring io-context initialization
[PATCH 4/7] blkio-cgroup-v10: The body of blkio-cgroup
[PATCH 5/7] blkio-cgroup-v10: The document of blkio-cgroup
[PATCH 6/7] blkio-cgroup-v10: Page tracking hooks
[PATCH 7/7] blkio-cgroup-v10: Add a cgroup support to dm-ioband
About blkio-cgroup:
blkio-cgroup is a block I/O tracking mechanism implemented on the
cgroup memory subsystem. Using this feature the owners of any type of
I/O can be determined. This allows dm-ioband to control block I/O
bandwidth even when it is accepting delayed write requests. dm-ioband
can find the cgroup of each request. It is also possible for
others working on I/O bandwidth throttling to use this functionality
to control asynchronous I/O with a little enhancement.
Please visit our website, the patches and more information are available.
Linux Block I/O Bandwidth Control Project
http://sourceforge.net/apps/trac/ioband/
I'd like to get some feedback from the list. Any comments are
appreciated.
Thanks,
Ryo Tsuruta
This patch makes the page_cgroup framework be able to be used even if
the compile option of the cgroup memory controller is off.
So blkio-cgroup can use this framework without the memory controller.
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>
---
include/linux/memcontrol.h | 6 ++++++
include/linux/mmzone.h | 4 ++--
include/linux/page_cgroup.h | 5 +++--
init/Kconfig | 4 ++++
mm/Makefile | 3 ++-
mm/memcontrol.c | 6 ++++++
mm/page_cgroup.c | 3 +--
7 files changed, 24 insertions(+), 7 deletions(-)
Index: linux-2.6.31-rc3-mm1/include/linux/memcontrol.h
===================================================================
--- linux-2.6.31-rc3-mm1.orig/include/linux/memcontrol.h
+++ linux-2.6.31-rc3-mm1/include/linux/memcontrol.h
@@ -37,6 +37,8 @@ struct mm_struct;
* (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
*/
+extern void __init_mem_page_cgroup(struct page_cgroup *pc);
+
extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask);
/* for swap handling */
@@ -121,6 +123,10 @@ void mem_cgroup_update_mapped_file_stat(
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct mem_cgroup;
+static inline void __init_mem_page_cgroup(struct page_cgroup *pc)
+{
+}
+
static inline int mem_cgroup_newpage_charge(struct page *page,
struct mm_struct *mm, gfp_t gfp_mask)
{
Index: linux-2.6.31-rc3-mm1/include/linux/mmzone.h
===================================================================
--- linux-2.6.31-rc3-mm1.orig/include/linux/mmzone.h
+++ linux-2.6.31-rc3-mm1/include/linux/mmzone.h
@@ -609,7 +609,7 @@ typedef struct pglist_data {
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
struct page_cgroup *node_page_cgroup;
#endif
#endif
@@ -960,7 +960,7 @@ struct mem_section {
/* See declaration of similar field in struct zone */
unsigned long *pageblock_flags;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
/*
* If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
* section. (see memcontrol.h/page_cgroup.h about this.)
Index: linux-2.6.31-rc3-mm1/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.31-rc3-mm1.orig/include/linux/page_cgroup.h
+++ linux-2.6.31-rc3-mm1/include/linux/page_cgroup.h
@@ -1,7 +1,7 @@
#ifndef __LINUX_PAGE_CGROUP_H
#define __LINUX_PAGE_CGROUP_H
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_CGROUP_PAGE
#include <linux/bit_spinlock.h>
/*
* Page Cgroup can be considered as an extended mem_map.
@@ -14,6 +14,7 @@ struct page_cgroup {
unsigned long flags;
struct mem_cgroup *mem_cgroup;
struct page *page;
+ unsigned long blkio_cgroup_id;
struct list_head lru; /* per cgroup LRU list */
};
@@ -96,7 +97,7 @@ static inline void unlock_page_cgroup(st
bit_spin_unlock(PCG_LOCK, &pc->flags);
}
-#else /* CONFIG_CGROUP_MEM_RES_CTLR */
+#else /* CONFIG_CGROUP_PAGE */
struct page_cgroup;
static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
Index: linux-2.6.31-rc3-mm1/init/Kconfig
===================================================================
--- linux-2.6.31-rc3-mm1.orig/init/Kconfig
+++ linux-2.6.31-rc3-mm1/init/Kconfig
@@ -610,6 +610,10 @@ config CGROUP_MEM_RES_CTLR_SWAP
endif # CGROUPS
+config CGROUP_PAGE
+ def_bool y
+ depends on CGROUP_MEM_RES_CTLR
+
config MM_OWNER
bool
Index: linux-2.6.31-rc3-mm1/mm/Makefile
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/Makefile
+++ linux-2.6.31-rc3-mm1/mm/Makefile
@@ -40,6 +40,7 @@ else
obj-$(CONFIG_SMP) += allocpercpu.o
endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
Index: linux-2.6.31-rc3-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/memcontrol.c
+++ linux-2.6.31-rc3-mm1/mm/memcontrol.c
@@ -130,6 +130,12 @@ struct mem_cgroup_lru_info {
struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};
+void __meminit __init_mem_page_cgroup(struct page_cgroup *pc)
+{
+ pc->mem_cgroup = NULL;
+ INIT_LIST_HEAD(&pc->lru);
+}
+
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
Index: linux-2.6.31-rc3-mm1/mm/page_cgroup.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/page_cgroup.c
+++ linux-2.6.31-rc3-mm1/mm/page_cgroup.c
@@ -14,9 +14,8 @@ static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
pc->flags = 0;
- pc->mem_cgroup = NULL;
pc->page = pfn_to_page(pfn);
- INIT_LIST_HEAD(&pc->lru);
+ __init_mem_page_cgroup(pc);
}
static unsigned long total_usage;
This patch refactors io_context initialization.
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>
---
block/blk-ioc.c | 30 +++++++++++++++++-------------
include/linux/iocontext.h | 1 +
2 files changed, 18 insertions(+), 13 deletions(-)
Index: linux-2.6.31-rc3-mm1/block/blk-ioc.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/block/blk-ioc.c
+++ linux-2.6.31-rc3-mm1/block/blk-ioc.c
@@ -84,24 +84,28 @@ void exit_io_context(void)
}
}
+void init_io_context(struct io_context *ioc)
+{
+ atomic_long_set(&ioc->refcount, 1);
+ atomic_set(&ioc->nr_tasks, 1);
+ spin_lock_init(&ioc->lock);
+ ioc->ioprio_changed = 0;
+ ioc->ioprio = 0;
+ ioc->last_waited = jiffies; /* doesn't matter... */
+ ioc->nr_batch_requests = 0; /* because this is 0 */
+ ioc->aic = NULL;
+ INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
+ INIT_HLIST_HEAD(&ioc->cic_list);
+ ioc->ioc_data = NULL;
+}
+
struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
{
struct io_context *ret;
ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
- if (ret) {
- atomic_long_set(&ret->refcount, 1);
- atomic_set(&ret->nr_tasks, 1);
- spin_lock_init(&ret->lock);
- ret->ioprio_changed = 0;
- ret->ioprio = 0;
- ret->last_waited = jiffies; /* doesn't matter... */
- ret->nr_batch_requests = 0; /* because this is 0 */
- ret->aic = NULL;
- INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
- INIT_HLIST_HEAD(&ret->cic_list);
- ret->ioc_data = NULL;
- }
+ if (ret)
+ init_io_context(ret);
return ret;
}
Index: linux-2.6.31-rc3-mm1/include/linux/iocontext.h
===================================================================
--- linux-2.6.31-rc3-mm1.orig/include/linux/iocontext.h
+++ linux-2.6.31-rc3-mm1/include/linux/iocontext.h
@@ -104,6 +104,7 @@ int put_io_context(struct io_context *io
void exit_io_context(void);
struct io_context *get_io_context(gfp_t gfp_flags, int node);
struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
+void init_io_context(struct io_context *ioc);
void copy_io_context(struct io_context **pdst, struct io_context **psrc);
#else
static inline void exit_io_context(void)
The body of blkio-cgroup.
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>
---
include/linux/biotrack.h | 100 ++++++++++++++
include/linux/cgroup_subsys.h | 6
init/Kconfig | 13 +
mm/Makefile | 1
mm/biotrack.c | 288 ++++++++++++++++++++++++++++++++++++++++++
mm/page_cgroup.c | 20 +-
6 files changed, 419 insertions(+), 9 deletions(-)
Index: linux-2.6.31-rc3-mm1/include/linux/biotrack.h
===================================================================
--- /dev/null
+++ linux-2.6.31-rc3-mm1/include/linux/biotrack.h
@@ -0,0 +1,100 @@
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/page_cgroup.h>
+
+#ifndef _LINUX_BIOTRACK_H
+#define _LINUX_BIOTRACK_H
+
+#ifdef CONFIG_CGROUP_BLKIO
+
+struct io_context;
+struct block_device;
+
+struct blkio_cgroup {
+ struct cgroup_subsys_state css;
+ struct io_context *io_context; /* default io_context */
+/* struct radix_tree_root io_context_root; per device io_context */
+};
+
+/**
+ * __init_blkio_page_cgroup() - initialize a blkio_page_cgroup
+ * @pc: page_cgroup of the page
+ *
+ * Reset the owner ID of a page.
+ */
+static inline void __init_blkio_page_cgroup(struct page_cgroup *pc)
+{
+ pc->blkio_cgroup_id = 0;
+}
+
+/**
+ * blkio_cgroup_disabled() - check whether blkio_cgroup is disabled
+ *
+ * Returns true if disabled, false if not.
+ */
+static inline bool blkio_cgroup_disabled(void)
+{
+ if (blkio_cgroup_subsys.disabled)
+ return true;
+ return false;
+}
+
+extern void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm);
+extern void blkio_cgroup_reset_owner(struct page *page, struct mm_struct *mm);
+extern void blkio_cgroup_reset_owner_pagedirty(struct page *page,
+ struct mm_struct *mm);
+extern void blkio_cgroup_copy_owner(struct page *page, struct page *opage);
+
+extern struct io_context *get_blkio_cgroup_iocontext(struct bio *bio);
+extern unsigned long get_blkio_cgroup_id(struct bio *bio);
+extern struct cgroup *blkio_cgroup_lookup(unsigned long id);
+
+#else /* !CONFIG_CGROUP_BLKIO */
+
+struct blkio_cgroup;
+
+static inline void __init_blkio_page_cgroup(struct page_cgroup *pc)
+{
+}
+
+static inline bool blkio_cgroup_disabled(void)
+{
+ return true;
+}
+
+static inline void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm)
+{
+}
+
+static inline void blkio_cgroup_reset_owner(struct page *page,
+ struct mm_struct *mm)
+{
+}
+
+static inline void blkio_cgroup_reset_owner_pagedirty(struct page *page,
+ struct mm_struct *mm)
+{
+}
+
+static inline void blkio_cgroup_copy_owner(struct page *page, struct page *opage)
+{
+}
+
+static inline struct io_context *get_blkio_cgroup_iocontext(struct bio *bio)
+{
+ return NULL;
+}
+
+static inline unsigned long get_blkio_cgroup_id(struct bio *bio)
+{
+ return 0;
+}
+
+struct cgroup *blkio_cgroup_lookup(unsigned long id)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_CGROUP_BLKIO */
+
+#endif /* _LINUX_BIOTRACK_H */
Index: linux-2.6.31-rc3-mm1/include/linux/cgroup_subsys.h
===================================================================
--- linux-2.6.31-rc3-mm1.orig/include/linux/cgroup_subsys.h
+++ linux-2.6.31-rc3-mm1/include/linux/cgroup_subsys.h
@@ -43,6 +43,12 @@ SUBSYS(mem_cgroup)
/* */
+#ifdef CONFIG_CGROUP_BLKIO
+SUBSYS(blkio_cgroup)
+#endif
+
+/* */
+
#ifdef CONFIG_CGROUP_DEVICE
SUBSYS(devices)
#endif
Index: linux-2.6.31-rc3-mm1/init/Kconfig
===================================================================
--- linux-2.6.31-rc3-mm1.orig/init/Kconfig
+++ linux-2.6.31-rc3-mm1/init/Kconfig
@@ -610,9 +610,20 @@ config CGROUP_MEM_RES_CTLR_SWAP
endif # CGROUPS
+config CGROUP_BLKIO
+ bool "Block I/O cgroup subsystem"
+ depends on CGROUPS && BLOCK
+ select MM_OWNER
+ help
+ Provides a Resource Controller which enables tracking of the owner
+ of every block I/O request.
+ The information this subsystem provides can be used from any
+ kind of module such as dm-ioband device mapper modules or
+ the cfq-scheduler.
+
config CGROUP_PAGE
def_bool y
- depends on CGROUP_MEM_RES_CTLR
+ depends on CGROUP_MEM_RES_CTLR || CGROUP_BLKIO
config MM_OWNER
bool
Index: linux-2.6.31-rc3-mm1/mm/biotrack.c
===================================================================
--- /dev/null
+++ linux-2.6.31-rc3-mm1/mm/biotrack.c
@@ -0,0 +1,288 @@
+/* biotrack.c - Block I/O Tracking
+ *
+ * Copyright (C) VA Linux Systems Japan, 2008-2009
+ * Developed by Hirokazu Takahashi <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/bit_spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/biotrack.h>
+#include <linux/mm_inline.h>
+
+/*
+ * The block I/O tracking mechanism is implemented on the cgroup memory
+ * controller framework. It helps to find the owner of an I/O request
+ * because every I/O request has a target page and the owner of the page
+ * can be easily determined on the framework.
+ */
+
+/* Return the blkio_cgroup that associates with a cgroup. */
+static inline struct blkio_cgroup *cgroup_blkio(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, blkio_cgroup_subsys_id),
+ struct blkio_cgroup, css);
+}
+
+/* Return the blkio_cgroup that associates with a process. */
+static inline struct blkio_cgroup *blkio_cgroup_from_task(struct task_struct *p)
+{
+ return container_of(task_subsys_state(p, blkio_cgroup_subsys_id),
+ struct blkio_cgroup, css);
+}
+
+static struct io_context default_blkio_io_context;
+static struct blkio_cgroup default_blkio_cgroup = {
+ .io_context = &default_blkio_io_context,
+};
+
+/**
+ * blkio_cgroup_set_owner() - set the owner ID of a page.
+ * @page: the page we want to tag
+ * @mm: the mm_struct of a page owner
+ *
+ * Make a given page have the blkio-cgroup ID of the owner of this page.
+ */
+void blkio_cgroup_set_owner(struct page *page, struct mm_struct *mm)
+{
+ struct blkio_cgroup *biog;
+ struct page_cgroup *pc;
+
+ if (blkio_cgroup_disabled())
+ return;
+ pc = lookup_page_cgroup(page);
+ if (unlikely(!pc))
+ return;
+
+ pc->blkio_cgroup_id = 0; /* 0: default blkio_cgroup id */
+ if (!mm)
+ return;
+ /*
+ * Locking "pc" isn't necessary here since the current process is
+ * the only one that can access the members related to blkio_cgroup.
+ */
+ rcu_read_lock();
+ biog = blkio_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!biog))
+ goto out;
+ /*
+ * css_get(&bio->css) isn't called to increment the reference
+ * count of this blkio_cgroup "biog" so pc->blkio_cgroup_id
+ * might turn invalid even if this page is still active.
+ * This approach is chosen to minimize the overhead.
+ */
+ pc->blkio_cgroup_id = css_id(&biog->css);
+out:
+ rcu_read_unlock();
+}
+
+/**
+ * blkio_cgroup_reset_owner() - reset the owner ID of a page
+ * @page: the page we want to tag
+ * @mm: the mm_struct of a page owner
+ *
+ * Change the owner of a given page if necessary.
+ */
+void blkio_cgroup_reset_owner(struct page *page, struct mm_struct *mm)
+{
+ /*
+ * A little trick:
+ * Just call blkio_cgroup_set_owner() for pages which are already
+ * active since the blkio_cgroup_id member of page_cgroup can be
+ * updated without any locks. This is because an integer type of
+ * variable can be set a new value at once on modern cpus.
+ */
+ blkio_cgroup_set_owner(page, mm);
+}
+
+/**
+ * blkio_cgroup_reset_owner_pagedirty() - reset the owner ID of a pagecache page
+ * @page: the page we want to tag
+ * @mm: the mm_struct of a page owner
+ *
+ * Change the owner of a given page if the page is in the pagecache.
+ */
+void blkio_cgroup_reset_owner_pagedirty(struct page *page, struct mm_struct *mm)
+{
+ if (!page_is_file_cache(page))
+ return;
+ if (current->flags & PF_MEMALLOC)
+ return;
+
+ blkio_cgroup_reset_owner(page, mm);
+}
+
+/**
+ * blkio_cgroup_copy_owner() - copy the owner ID of a page into another page
+ * @npage: the page where we want to copy the owner
+ * @opage: the page from which we want to copy the ID
+ *
+ * Copy the owner ID of @opage into @npage.
+ */
+void blkio_cgroup_copy_owner(struct page *npage, struct page *opage)
+{
+ struct page_cgroup *npc, *opc;
+
+ if (blkio_cgroup_disabled())
+ return;
+ npc = lookup_page_cgroup(npage);
+ if (unlikely(!npc))
+ return;
+ opc = lookup_page_cgroup(opage);
+ if (unlikely(!opc))
+ return;
+
+ /*
+ * Do this without any locks. The reason is the same as
+ * blkio_cgroup_reset_owner().
+ */
+ npc->blkio_cgroup_id = opc->blkio_cgroup_id;
+}
+
+/* Create a new blkio-cgroup. */
+static struct cgroup_subsys_state *
+blkio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct blkio_cgroup *biog;
+ struct io_context *ioc;
+
+ if (!cgrp->parent) {
+ biog = &default_blkio_cgroup;
+ init_io_context(biog->io_context);
+ /* Increment the reference count so that it is never released. */
+ atomic_inc(&biog->io_context->refcount);
+ return &biog->css;
+ }
+
+ biog = kzalloc(sizeof(*biog), GFP_KERNEL);
+ if (!biog)
+ return ERR_PTR(-ENOMEM);
+ ioc = alloc_io_context(GFP_KERNEL, -1);
+ if (!ioc) {
+ kfree(biog);
+ return ERR_PTR(-ENOMEM);
+ }
+ biog->io_context = ioc;
+ return &biog->css;
+}
+
+/* Delete the blkio-cgroup. */
+static void blkio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ struct blkio_cgroup *biog = cgroup_blkio(cgrp);
+
+ put_io_context(biog->io_context);
+ free_css_id(&blkio_cgroup_subsys, &biog->css);
+ kfree(biog);
+}
+
+/**
+ * get_blkio_cgroup_id() - determine the blkio-cgroup ID
+ * @bio: the &struct bio which describes the I/O
+ *
+ * Returns the blkio-cgroup ID of a given bio. A return value zero
+ * means that the page associated with the bio belongs to default_blkio_cgroup.
+ */
+unsigned long get_blkio_cgroup_id(struct bio *bio)
+{
+ struct page_cgroup *pc;
+ struct page *page = bio_iovec_idx(bio, 0)->bv_page;
+ unsigned long id = 0;
+
+ pc = lookup_page_cgroup(page);
+ if (pc)
+ id = pc->blkio_cgroup_id;
+ return id;
+}
+
+/**
+ * get_blkio_cgroup_iocontext() - determine the blkio-cgroup iocontext
+ * @bio: the &struct bio which describes the I/O
+ *
+ * Returns the iocontext of blkio-cgroup that issued a given bio.
+ */
+struct io_context *get_blkio_cgroup_iocontext(struct bio *bio)
+{
+ struct cgroup_subsys_state *css;
+ struct blkio_cgroup *biog;
+ struct io_context *ioc;
+ unsigned long id;
+
+ id = get_blkio_cgroup_id(bio);
+ rcu_read_lock();
+ css = css_lookup(&blkio_cgroup_subsys, id);
+ if (css)
+ biog = container_of(css, struct blkio_cgroup, css);
+ else
+ biog = &default_blkio_cgroup;
+ ioc = biog->io_context; /* default io_context for this cgroup */
+ atomic_inc(&ioc->refcount);
+ rcu_read_unlock();
+ return ioc;
+}
+
+/**
+ * blkio_cgroup_lookup() - lookup a cgroup by blkio-cgroup ID
+ * @id: blkio-cgroup ID
+ *
+ * Returns the cgroup associated with the specified ID, or NULL if lookup
+ * fails.
+ *
+ * Note:
+ * This function must be called under rcu_read_lock().
+ */
+struct cgroup *blkio_cgroup_lookup(unsigned long id)
+{
+ struct cgroup_subsys_state *css;
+
+ if (!id)
+ return NULL;
+ css = css_lookup(&blkio_cgroup_subsys, id);
+ if (!css)
+ return NULL;
+ return css->cgroup;
+}
+EXPORT_SYMBOL(get_blkio_cgroup_id);
+EXPORT_SYMBOL(get_blkio_cgroup_iocontext);
+EXPORT_SYMBOL(blkio_cgroup_lookup);
+
+/* Read the ID of the specified blkio cgroup. */
+static u64 blkio_id_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct blkio_cgroup *biog = cgroup_blkio(cgrp);
+
+ return (u64)css_id(&biog->css);
+}
+
+static struct cftype blkio_files[] = {
+ {
+ .name = "id",
+ .read_u64 = blkio_id_read,
+ },
+};
+
+static int blkio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+ return cgroup_add_files(cgrp, ss, blkio_files,
+ ARRAY_SIZE(blkio_files));
+}
+
+struct cgroup_subsys blkio_cgroup_subsys = {
+ .name = "blkio",
+ .create = blkio_cgroup_create,
+ .destroy = blkio_cgroup_destroy,
+ .populate = blkio_cgroup_populate,
+ .subsys_id = blkio_cgroup_subsys_id,
+ .use_id = 1,
+};
Index: linux-2.6.31-rc3-mm1/mm/page_cgroup.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/page_cgroup.c
+++ linux-2.6.31-rc3-mm1/mm/page_cgroup.c
@@ -9,6 +9,7 @@
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
+#include <linux/biotrack.h>
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -16,6 +17,7 @@ __init_page_cgroup(struct page_cgroup *p
pc->flags = 0;
pc->page = pfn_to_page(pfn);
__init_mem_page_cgroup(pc);
+ __init_blkio_page_cgroup(pc);
}
static unsigned long total_usage;
@@ -73,7 +75,7 @@ void __init page_cgroup_init_flatmem(voi
int nid, fail;
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled() && blkio_cgroup_disabled())
return;
for_each_online_node(nid) {
@@ -82,12 +84,13 @@ void __init page_cgroup_init_flatmem(voi
goto fail;
}
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
- " don't want memory cgroups\n");
+ printk(KERN_INFO "please try 'cgroup_disable=memory,blkio' option"
+ " if you don't want memory and blkio cgroups\n");
return;
fail:
printk(KERN_CRIT "allocation of page_cgroup failed.\n");
- printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
+ printk(KERN_CRIT
+ "please try 'cgroup_disable=memory,blkio' boot option\n");
panic("Out of memory");
}
@@ -250,7 +253,7 @@ void __init page_cgroup_init(void)
unsigned long pfn;
int fail = 0;
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled() && blkio_cgroup_disabled())
return;
for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
@@ -259,14 +262,15 @@ void __init page_cgroup_init(void)
fail = init_section_page_cgroup(pfn);
}
if (fail) {
- printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+ printk(KERN_CRIT
+ "try 'cgroup_disable=memory,blkio' boot option\n");
panic("Out of memory");
} else {
hotplug_memory_notifier(page_cgroup_callback, 0);
}
printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
- " want memory cgroups\n");
+ printk(KERN_INFO "please try 'cgroup_disable=memory,blkio' option"
+ " if you don't want memory and blkio cgroups\n");
}
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
Index: linux-2.6.31-rc3-mm1/mm/Makefile
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/Makefile
+++ linux-2.6.31-rc3-mm1/mm/Makefile
@@ -42,5 +42,6 @@ endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o
+obj-$(CONFIG_CGROUP_BLKIO) += biotrack.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
The document of blkio-cgroup.
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>
---
Documentation/cgroups/00-INDEX | 2
Documentation/cgroups/blkio.txt | 289 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 291 insertions(+)
Index: linux-2.6.31-rc3-mm1/Documentation/cgroups/00-INDEX
===================================================================
--- linux-2.6.31-rc3-mm1.orig/Documentation/cgroups/00-INDEX
+++ linux-2.6.31-rc3-mm1/Documentation/cgroups/00-INDEX
@@ -16,3 +16,5 @@ memory.txt
- Memory Resource Controller; design, accounting, interface, testing.
resource_counter.txt
- Resource Counter API.
+blkio.txt
+ - Block I/O Tracking; description, interface and examples.
Index: linux-2.6.31-rc3-mm1/Documentation/cgroups/blkio.txt
===================================================================
--- /dev/null
+++ linux-2.6.31-rc3-mm1/Documentation/cgroups/blkio.txt
@@ -0,0 +1,289 @@
+Block I/O Cgroup
+
+1. Overview
+
+Using this feature the owners of any type of I/O can be determined.
+This allows dm-ioband to control block I/O bandwidth even when it is
+accepting delayed write requests. dm-ioband can find the cgroup of
+each request. It is also possible for others working on I/O
+bandwidth throttling to use this functionality to control asynchronous
+I/O with a little enhancement.
+
+2. Setting up blkio-cgroup
+
+Note: If dm-ioband is to be used with blkio-cgroup, then the dm-ioband
+patch needs to be applied first.
+
+The following kernel config options are required.
+
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_BLKIO=y
+
+Selecting the options for the cgroup memory subsystem is also recommended
+as it makes it possible to give some I/O bandwidth and memory to a selected
+cgroup to control delayed write requests. The amount of dirty pages is
+limited within the cgroup even if the allocated bandwidth is narrow.
+
+CONFIG_RESOURCE_COUNTERS=y
+CONFIG_CGROUP_MEM_RES_CTLR=y
+
+3. User interface
+
+3.1 Mounting the cgroup filesystem
+
+First, mount the cgroup filesystem in order to enable observation and
+modification of the blkio-cgroup settings.
+
+# mount -t cgroup -o blkio none /cgroup
+
+3.2 The blkio.id file
+
+After mounting the cgroup filesystem the blkio.id file will be visible
+in the cgroup directory. This file contains a unique ID number for
+each cgroup. When an I/O operation starts, blkio-cgroup sets the
+page's ID number on the page cgroup. The cgroup of I/O can be
+determined by retrieving the ID number from the page cgroup, because
+the page cgroup is associated with the page which is involved in the
+I/O.
+
+If the dm-ioband support patch was applied then the blkio.devices and
+blkio.settings files will also be present.
+
+4. Using dm-ioband and blkio-cgroup
+
+This section describes how to set up dm-ioband and blkio-cgroup in
+order to control bandwidth on a per cgroup per logical volume basis.
+The example used in this section assumes that there are two LVM volume
+groups on individual hard disks and two logical volumes on each volume
+group.
+
+ Table. LVM configurations
+
+ --------------------------------------------------------------
+ | LVM volume groups | vg0 on /dev/sda | vg1 on /dev/sdb |
+ |----------------------|-------------------|-------------------|
+ | LVM logical volume | lv0 | lv1 | lv0 | lv1 |
+ --------------------------------------------------------------
+
+4.1. Creating a dm-ioband logical device
+
+A dm-ioband logical device needs to be created and stacked on the
+device that is to bandwidth controlled. In this example the dm-ioband
+logical devices are stacked on each of the existing LVM logical
+volumes. By using the LVM facilities there is no need to unmount any
+logical volumes, even in the case of a volume being used as the root
+device. The following script is an example of how to stack and remove
+dm-ioband devices.
+
+==================== cut here (ioband.sh) ====================
+#!/bin/sh
+#
+# NOTE: You must run "ioband.sh stop" to restore the device-mapper
+# settings before changing logical volume settings, such as activate,
+# rename, resize and so on. These constraints would be eliminated by
+# enhancing LVM tools to support dm-ioband.
+
+logvols="vg0-lv0 vg0-lv1 vg1-lv0 vg1-lv1"
+
+start()
+{
+ for lv in $logvols; do
+ volgrp=${lv%%-*}
+ orig=${lv}-orig
+
+ # clone an existing logical volume.
+ /sbin/dmsetup table $lv | /sbin/dmsetup create $orig
+
+ # stack a dm-ioband device on the clone.
+ size=$(/sbin/blockdev --getsize /dev/mapper/$orig)
+ cat<<-EOM | /sbin/dmsetup load ${lv}
+ 0 $size ioband /dev/mapper/${orig} ${volgrp} 0 0 cgroup weight 0 :100
+ EOM
+
+ # activate the new setting.
+ /sbin/dmsetup resume $lv
+ done
+}
+
+stop()
+{
+ for lv in $logvols; do
+ orig=${lv}-orig
+
+ # restore the original setting.
+ /sbin/dmsetup table $orig | /sbin/dmsetup load $lv
+
+ # activate the new setting.
+ /sbin/dmsetup resume $lv
+
+ # remove the clone.
+ /sbin/dmsetup remove $orig
+ done
+}
+
+case "$1" in
+ start)
+ start
+ ;;
+ stop)
+ stop
+ ;;
+esac
+exit 0
+==================== cut here (ioband.sh) ====================
+
+The following diagram shows how dm-ioband devices are stacked on and
+removed from the logical volumes.
+
+ Figure. stacking and removing dm-ioband devices
+
+ run "ioband.sh start"
+ ===>
+
+ ----------------------- -----------------------
+ | lv0 | lv1 | | lv0 | lv1 |
+ |(dm-linear)|(dm-linear)| |(dm-ioband)|(dm-ioband)|
+ |-----------------------| |-----------------------|
+ | vg0 | | lv0-orig | lv1-orig |
+ ----------------------- |(dm-linear)|(dm-linear)|
+ |-----------------------|
+ | vg0 |
+ -----------------------
+ <===
+ run "ioband.sh stop"
+
+After creating the dm-ioband devices, the settings can be observed by
+reading the blkio.devices file.
+
+# cat /cgroup/blkio.devices
+vg0 policy=weight io_throttle=4 io_limit=192 token=768 carryover=2
+ vg0-lv0
+ vg0-lv1
+vg1 policy=weight io_throttle=4 io_limit=192 token=768 carryover=2
+ vg1-lv0
+ vg1-lv1
+
+The first field in the first line is the symbolic name for an ioband
+device group, and the subsequent fields are settings for the ioband
+device group. The settings can be changed by writing to the
+blkio.devices, for example:
+
+# echo vg1 policy range-bw > /cgroup/blkio.devices
+
+Please refer to Documentation/device-mapper/ioband.txt which describes
+the details of the ioband device group settings.
+
+The second and the third indented lines "vg0-lv0" and "vg0-lv1" are
+the names of the dm-ioband devices that belong to the ioband device
+group. Typically, dm-ioband devices that reside on the same hard disk
+should belong to the same ioband device group in order to share the
+bandwidth of the hard disk.
+
+dm-ioband is not restricted to working with LVM, it may work in
+conjunction with any type of block device. Please refer to
+Documentation/device-mapper/ioband.txt for more details.
+
+4.2 Setting up dm-ioband through the blkio-cgroup interface
+
+The following table shows the given settings for this example. The
+bandwidth will be assigned on a per cgroup per logical volume basis.
+
+ Table. Settings for each cgroup
+
+ --------------------------------------------------------------
+ | LVM volume groups | vg0 on /dev/sda | vg1 on /dev/sdb |
+ |----------------------|-------------------|-------------------|
+ | LVM logical volume | lv0 | lv1 | lv0 | lv1 |
+ |----------------------|-------------------|-------------------|
+ | bandwidth control | relative | absolute |
+ | policy | weight | bandwidth limit |
+ |----------------------|-------------------|-------------------|
+ | unit | weight value (*1) | throughput [KB/s] |
+ |----------------------|-------------------|-------------------|
+ | settings for cgroup1 | 40 (16) | 90 (36) | 400 | 900 |
+ |----------------------|---------|---------|---------|---------|
+ | settings for cgroup2 | 20 (8) | 60 (24) | 200 | 600 |
+ |----------------------|---------|---------|---------|---------|
+ | for other cgroups | 10 (4) | 30 (12) | 100 | 300 |
+ --------------------------------------------------------------
+
+ *1: The values enclosed in () denote the preceding weight
+ as a percentage of the total weight. The bandwidth of
+ vg0 is distributed proportional to the total weight.
+
+The set-up is described step-by-step below.
+
+1) Create new cgroups using the mkdir command
+
+# mkdir /cgroup/1
+# mkdir /cgroup/2
+
+2) Set bandwidth control policy on each ioband device group
+
+The set-up of bandwidth control policy is done by writing to
+blkio.devices file.
+
+# echo vg0 policy weight > /cgroup/blkio.devices
+# echo vg1 policy range-bw > /cgroup/blkio.devices
+
+3) Set up the root cgroup
+
+The root cgroup represents the default blkio-cgroup. If an I/O is
+performed by a process in a cgroup and the cgroup is not set up by
+blkio-cgroup, the I/O is charged to the root cgroup.
+
+The set-up of the root cgroup is done by writing to blkio.settings
+file in the cgroup's root directory. The following commands write
+the settings of each logical volume to that file.
+
+# echo vg0-lv0 10 > /cgroup/blkio.settings
+# echo vg0-lv1 30 > /cgroup/blkio.settings
+# echo vg1-lv0 100:100 > /cgroup/blkio.settings
+# echo vg1-lv1 300:300 > /cgroup/blkio.settings
+
+The settings can be verified by reading the blkio.settings file.
+
+# cat /cgroup/blkio.settings
+vg0-lv0 weight=10
+vg0-lv1 weight=30
+vg1-lv0 range-bw=100:100
+vg1-lv1 range-bw=300:300
+
+4) Set up cgroup1 and cgroup2
+
+New cgroups are set up in the same manner as the root cgroup.
+
+Settings for cgroup1
+# echo vg0-lv0 40 > /cgroup/1/blkio.settings
+# echo vg0-lv1 90 > /cgroup/1/blkio.settings
+# echo vg1-lv0 400:400 > /cgroup/1/blkio.settings
+# echo vg1-lv1 900:900 > /cgroup/1/blkio.settings
+
+Settings for cgroup2
+# echo vg0-lv0 20 > /cgroup/2/blkio.settings
+# echo vg0-lv1 60 > /cgroup/2/blkio.settings
+# echo vg1-lv0 200:200 > /cgroup/2/blkio.settings
+# echo vg1-lv1 600:600 > /cgroup/2/blkio.settings
+
+Again, the settings can be verified by reading the appropriate
+blkio.settings file.
+
+# cat /cgroup/1/blkio.settings
+vg0-lv0 weight=40
+vg0-lv1 weight=90
+vg1-lv0 range-bw=400:400
+vg1-lv1 range-bw=900:900
+
+If only the logical volume name is specified, the entry for the
+logical volume is removed.
+
+# echo vg1-lv1 > /cgroup/1/blkio.settings
+# cat /cgroup/1/blkio.settings
+vg0-lv0 weight=40
+vg0-lv1 weight=90
+vg1-lv0 range-bw=400:400
+
+5. Contact
+
+Linux Block I/O Bandwidth Control Project
+http://sourceforge.net/projects/ioband/
This patch contains several hooks that let the blkio-cgroup framework know
which blkio-cgroup is the owner of a page before starting I/O against the page.
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>
---
fs/buffer.c | 2 ++
fs/direct-io.c | 2 ++
mm/bounce.c | 2 ++
mm/filemap.c | 2 ++
mm/memory.c | 5 +++++
mm/page-writeback.c | 2 ++
mm/swap_state.c | 2 ++
7 files changed, 17 insertions(+)
Index: linux-2.6.31-rc3-mm1/fs/buffer.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/fs/buffer.c
+++ linux-2.6.31-rc3-mm1/fs/buffer.c
@@ -36,6 +36,7 @@
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
+#include <linux/biotrack.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
@@ -676,6 +677,7 @@ static void __set_page_dirty(struct page
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
+ blkio_cgroup_reset_owner_pagedirty(page, current->mm);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
Index: linux-2.6.31-rc3-mm1/fs/direct-io.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/fs/direct-io.c
+++ linux-2.6.31-rc3-mm1/fs/direct-io.c
@@ -33,6 +33,7 @@
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
+#include <linux/biotrack.h>
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <asm/atomic.h>
@@ -797,6 +798,7 @@ static int do_direct_IO(struct dio *dio)
ret = PTR_ERR(page);
goto out;
}
+ blkio_cgroup_reset_owner(page, current->mm);
while (block_in_page < blocks_per_page) {
unsigned offset_in_page = block_in_page << blkbits;
Index: linux-2.6.31-rc3-mm1/mm/bounce.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/bounce.c
+++ linux-2.6.31-rc3-mm1/mm/bounce.c
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/biotrack.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
@@ -210,6 +211,7 @@ static void __blk_queue_bounce(struct re
to->bv_len = from->bv_len;
to->bv_offset = from->bv_offset;
inc_zone_page_state(to->bv_page, NR_BOUNCE);
+ blkio_cgroup_copy_owner(to->bv_page, page);
if (rw == WRITE) {
char *vto, *vfrom;
Index: linux-2.6.31-rc3-mm1/mm/filemap.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/filemap.c
+++ linux-2.6.31-rc3-mm1/mm/filemap.c
@@ -33,6 +33,7 @@
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
+#include <linux/biotrack.h>
#include <linux/mm_inline.h> /* for page_is_file_cache() */
#include "internal.h"
@@ -466,6 +467,7 @@ int add_to_page_cache_locked(struct page
gfp_mask & GFP_RECLAIM_MASK);
if (error)
goto out;
+ blkio_cgroup_set_owner(page, current->mm);
error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
Index: linux-2.6.31-rc3-mm1/mm/memory.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/memory.c
+++ linux-2.6.31-rc3-mm1/mm/memory.c
@@ -51,6 +51,7 @@
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
+#include <linux/biotrack.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
@@ -2115,6 +2116,7 @@ gotten:
*/
ptep_clear_flush_notify(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
+ blkio_cgroup_set_owner(new_page, mm);
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
if (old_page) {
@@ -2580,6 +2582,7 @@ static int do_swap_page(struct mm_struct
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
page_add_anon_rmap(page, vma, address);
+ blkio_cgroup_reset_owner(page, mm);
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin(page, ptr);
@@ -2644,6 +2647,7 @@ static int do_anonymous_page(struct mm_s
goto release;
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);
+ blkio_cgroup_set_owner(page, mm);
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
@@ -2791,6 +2795,7 @@ static int __do_fault(struct mm_struct *
if (anon) {
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);
+ blkio_cgroup_set_owner(page, mm);
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(page);
Index: linux-2.6.31-rc3-mm1/mm/page-writeback.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/page-writeback.c
+++ linux-2.6.31-rc3-mm1/mm/page-writeback.c
@@ -23,6 +23,7 @@
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/biotrack.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
@@ -1130,6 +1131,7 @@ int __set_page_dirty_nobuffers(struct pa
BUG_ON(mapping2 != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
+ blkio_cgroup_reset_owner_pagedirty(page, current->mm);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
Index: linux-2.6.31-rc3-mm1/mm/swap_state.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/swap_state.c
+++ linux-2.6.31-rc3-mm1/mm/swap_state.c
@@ -18,6 +18,7 @@
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/page_cgroup.h>
+#include <linux/biotrack.h>
#include <asm/pgtable.h>
@@ -308,6 +309,7 @@ struct page *read_swap_cache_async(swp_e
*/
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
+ blkio_cgroup_set_owner(new_page, current->mm);
err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
if (likely(!err)) {
/*
With this patch, dm-ioband can work with the blkio-cgroup.
Signed-off-by: Hirokazu Takahashi <[email protected]>
Signed-off-by: Ryo Tsuruta <[email protected]>
---
drivers/md/dm-ioband-ctl.c | 201 ++++++++++++++++++++++++++++++++++++++++-
drivers/md/dm-ioband-policy.c | 20 +++-
drivers/md/dm-ioband-rangebw.c | 13 ++
drivers/md/dm-ioband-type.c | 10 --
drivers/md/dm-ioband.h | 14 ++
drivers/md/dm-ioctl.c | 1
include/linux/biotrack.h | 7 +
mm/biotrack.c | 116 +++++++++++++++++++++++
8 files changed, 371 insertions(+), 11 deletions(-)
Index: linux-2.6.31-rc3-mm1/include/linux/biotrack.h
===================================================================
--- linux-2.6.31-rc3-mm1.orig/include/linux/biotrack.h
+++ linux-2.6.31-rc3-mm1/include/linux/biotrack.h
@@ -9,6 +9,7 @@
struct io_context;
struct block_device;
+struct ioband_cgroup_ops;
struct blkio_cgroup {
struct cgroup_subsys_state css;
@@ -48,6 +49,12 @@ extern void blkio_cgroup_copy_owner(stru
extern struct io_context *get_blkio_cgroup_iocontext(struct bio *bio);
extern unsigned long get_blkio_cgroup_id(struct bio *bio);
extern struct cgroup *blkio_cgroup_lookup(unsigned long id);
+extern int blkio_cgroup_register_ioband(const struct ioband_cgroup_ops *ops);
+
+static inline int blkio_cgroup_unregister_ioband(void)
+{
+ return blkio_cgroup_register_ioband(NULL);
+}
#else /* !CONFIG_CGROUP_BLKIO */
Index: linux-2.6.31-rc3-mm1/mm/biotrack.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/mm/biotrack.c
+++ linux-2.6.31-rc3-mm1/mm/biotrack.c
@@ -20,6 +20,9 @@
#include <linux/blkdev.h>
#include <linux/biotrack.h>
#include <linux/mm_inline.h>
+#include <linux/seq_file.h>
+#include <linux/dm-ioctl.h>
+#include <../drivers/md/dm-ioband.h>
/*
* The block I/O tracking mechanism is implemented on the cgroup memory
@@ -46,6 +49,8 @@ static struct io_context default_blkio_i
static struct blkio_cgroup default_blkio_cgroup = {
.io_context = &default_blkio_io_context,
};
+static DEFINE_MUTEX(ioband_ops_lock);
+static const struct ioband_cgroup_ops *ioband_ops = NULL;
/**
* blkio_cgroup_set_owner() - set the owner ID of a page.
@@ -181,6 +186,14 @@ blkio_cgroup_create(struct cgroup_subsys
static void blkio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct blkio_cgroup *biog = cgroup_blkio(cgrp);
+ int id;
+
+ mutex_lock(&ioband_ops_lock);
+ if (ioband_ops) {
+ id = css_id(&biog->css);
+ ioband_ops->remove_group(id);
+ }
+ mutex_unlock(&ioband_ops_lock);
put_io_context(biog->io_context);
free_css_id(&blkio_cgroup_subsys, &biog->css);
@@ -253,9 +266,28 @@ struct cgroup *blkio_cgroup_lookup(unsig
return NULL;
return css->cgroup;
}
+
+/**
+ * blkio_cgroup_register_ioband() - register ioband
+ * @p: a pointer to struct ioband_cgroup_ops
+ *
+ * Calling with NULL means unregistration.
+ * Returns 0 on success.
+ */
+int blkio_cgroup_register_ioband(const struct ioband_cgroup_ops *p)
+{
+ if (blkio_cgroup_disabled())
+ return -1;
+
+ mutex_lock(&ioband_ops_lock);
+ ioband_ops = p;
+ mutex_unlock(&ioband_ops_lock);
+ return 0;
+}
EXPORT_SYMBOL(get_blkio_cgroup_id);
EXPORT_SYMBOL(get_blkio_cgroup_iocontext);
EXPORT_SYMBOL(blkio_cgroup_lookup);
+EXPORT_SYMBOL(blkio_cgroup_register_ioband);
/* Read the ID of the specified blkio cgroup. */
static u64 blkio_id_read(struct cgroup *cgrp, struct cftype *cft)
@@ -265,11 +297,95 @@ static u64 blkio_id_read(struct cgroup *
return (u64)css_id(&biog->css);
}
+/* Show all ioband devices and their settings. */
+static int blkio_devs_read(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *m)
+{
+ mutex_lock(&ioband_ops_lock);
+ if (ioband_ops)
+ ioband_ops->show_device(m);
+ mutex_unlock(&ioband_ops_lock);
+ return 0;
+}
+
+/* Configure ioband devices specified by an ioband device ID */
+static int blkio_devs_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ char **argv;
+ int argc, r = 0;
+
+ if (cgrp != cgrp->top_cgroup)
+ return -EACCES;
+
+ argv = argv_split(GFP_KERNEL, buffer, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ mutex_lock(&ioband_ops_lock);
+ if (ioband_ops)
+ r = ioband_ops->config_device(argc, argv);
+ mutex_unlock(&ioband_ops_lock);
+
+ argv_free(argv);
+ return r;
+}
+
+/* Show the settings of the specified blkio cgroup. */
+static int blkio_settings_read(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct blkio_cgroup *biog;
+ int id;
+
+ mutex_lock(&ioband_ops_lock);
+ if (ioband_ops) {
+ biog = cgroup_blkio(cgrp);
+ id = css_id(&biog->css);
+ ioband_ops->show_group(m, biog, id);
+ }
+ mutex_unlock(&ioband_ops_lock);
+ return 0;
+}
+
+/* Configure the specified blkio cgroup. */
+static int blkio_settings_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ struct blkio_cgroup *biog;
+ char **argv;
+ int argc, id, r = 0;
+
+ argv = argv_split(GFP_KERNEL, buffer, &argc);
+ if (!argv)
+ return -ENOMEM;
+
+ mutex_lock(&ioband_ops_lock);
+ if (ioband_ops) {
+ biog = cgroup_blkio(cgrp);
+ id = css_id(&biog->css);
+ r = ioband_ops->config_group(argc, argv, biog, id);
+ }
+ mutex_unlock(&ioband_ops_lock);
+ argv_free(argv);
+ return r;
+}
+
static struct cftype blkio_files[] = {
{
.name = "id",
.read_u64 = blkio_id_read,
},
+ {
+ .name = "devices",
+ .read_seq_string = blkio_devs_read,
+ .write_string = blkio_devs_write,
+ },
+ {
+ .name = "settings",
+ .read_seq_string = blkio_settings_read,
+ .write_string = blkio_settings_write,
+ },
};
static int blkio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
Index: linux-2.6.31-rc3-mm1/drivers/md/dm-ioctl.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/drivers/md/dm-ioctl.c
+++ linux-2.6.31-rc3-mm1/drivers/md/dm-ioctl.c
@@ -1601,3 +1601,4 @@ out:
return r;
}
+EXPORT_SYMBOL(dm_copy_name_and_uuid);
Index: linux-2.6.31-rc3-mm1/drivers/md/dm-ioband-policy.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/drivers/md/dm-ioband-policy.c
+++ linux-2.6.31-rc3-mm1/drivers/md/dm-ioband-policy.c
@@ -8,6 +8,7 @@
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/rbtree.h>
+#include <linux/seq_file.h>
#include "dm.h"
#include "dm-ioband.h"
@@ -276,7 +277,7 @@ static int policy_weight_param(struct io
if (value)
err = strict_strtol(value, 0, &val);
- if (!strcmp(cmd, "weight")) {
+ if (!cmd || !strcmp(cmd, "weight")) {
if (!value)
set_weight(gp, DEFAULT_WEIGHT);
else if (!err && 0 < val && val <= SHORT_MAX)
@@ -341,6 +342,19 @@ static void policy_weight_show(struct io
*szp = sz;
}
+static void policy_weight_show_device(struct seq_file *m,
+ struct ioband_device *dp)
+{
+ seq_printf(m, " token=%d carryover=%d",
+ dp->g_token_bucket, dp->g_carryover);
+}
+
+static void policy_weight_show_group(struct seq_file *m,
+ struct ioband_group *gp)
+{
+ seq_printf(m, " weight=%d", gp->c_weight);
+}
+
/*
* <Method> <description>
* g_can_submit : To determine whether a given group has the right to
@@ -369,6 +383,8 @@ static void policy_weight_show(struct io
* Return 1 if a given group can't receive any more BIOs,
* otherwise return 0.
* g_show : Show the configuration.
+ * g_show_device : Show the configuration of the specified ioband device.
* g_show_group : Show the configuration of the specified ioband group.
*/
static int policy_weight_init(struct ioband_device *dp, int argc, char **argv)
{
@@ -391,6 +407,8 @@ static int policy_weight_init(struct iob
dp->g_set_param = policy_weight_param;
dp->g_should_block = is_queue_full;
dp->g_show = policy_weight_show;
+ dp->g_show_device = policy_weight_show_device;
+ dp->g_show_group = policy_weight_show_group;
dp->g_epoch = 0;
dp->g_weight_total = 0;
Index: linux-2.6.31-rc3-mm1/drivers/md/dm-ioband-rangebw.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/drivers/md/dm-ioband-rangebw.c
+++ linux-2.6.31-rc3-mm1/drivers/md/dm-ioband-rangebw.c
@@ -25,6 +25,7 @@
#include <linux/random.h>
#include <linux/time.h>
#include <linux/timer.h>
+#include <linux/seq_file.h>
#include "dm.h"
#include "md.h"
#include "dm-ioband.h"
@@ -459,7 +460,7 @@ static int policy_range_bw_param(struct
err++;
}
- if (!strcmp(cmd, "range-bw")) {
+ if (!cmd || !strcmp(cmd, "range-bw")) {
if (!err && 0 <= min_val &&
min_val <= (INT_MAX / 2) && 0 <= max_val &&
max_val <= (INT_MAX / 2) && min_val <= max_val)
@@ -547,6 +548,12 @@ static void policy_range_bw_show(struct
*szp = sz;
}
+static void policy_range_bw_show_group(struct seq_file *m,
+ struct ioband_group *gp)
+{
+ seq_printf(m, " range-bw=%d:%d", gp->c_min_bw, gp->c_max_bw);
+}
+
static int range_bw_prepare_token(struct ioband_group *gp,
struct bio *bio, int flag)
{
@@ -633,6 +640,8 @@ void range_bw_timeover(unsigned long gp)
* Return 1 if a given group can't receive any more BIOs,
* otherwise return 0.
* g_show : Show the configuration.
+ * g_show_device : Show the configuration of the specified ioband device.
* g_show_group : Show the configuration of the specified ioband group.
*/
int policy_range_bw_init(struct ioband_device *dp, int argc, char **argv)
@@ -656,6 +665,8 @@ int policy_range_bw_init(struct ioband_d
dp->g_set_param = policy_range_bw_param;
dp->g_should_block = range_bw_queue_full;
dp->g_show = policy_range_bw_show;
+ dp->g_show_device = NULL;
+ dp->g_show_group = policy_range_bw_show_group;
dp->g_min_bw_total = 0;
dp->g_running_gp = NULL;
Index: linux-2.6.31-rc3-mm1/drivers/md/dm-ioband-ctl.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/drivers/md/dm-ioband-ctl.c
+++ linux-2.6.31-rc3-mm1/drivers/md/dm-ioband-ctl.c
@@ -13,6 +13,8 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/rbtree.h>
+#include <linux/biotrack.h>
+#include <linux/dm-ioctl.h>
#include "dm.h"
#include "md.h"
#include "dm-ioband.h"
@@ -109,6 +111,7 @@ static struct ioband_device *alloc_ioban
INIT_DELAYED_WORK(&new_dp->g_conductor, ioband_conduct);
INIT_LIST_HEAD(&new_dp->g_groups);
INIT_LIST_HEAD(&new_dp->g_list);
+ INIT_LIST_HEAD(&new_dp->g_heads);
spin_lock_init(&new_dp->g_lock);
bio_list_init(&new_dp->g_urgent_bios);
new_dp->g_io_throttle = io_throttle;
@@ -241,6 +244,7 @@ static int ioband_group_init(struct ioba
int r;
INIT_LIST_HEAD(&gp->c_list);
+ INIT_LIST_HEAD(&gp->c_heads);
bio_list_init(&gp->c_blocked_bios);
bio_list_init(&gp->c_prio_bios);
gp->c_id = id; /* should be verified */
@@ -271,7 +275,8 @@ static int ioband_group_init(struct ioba
ioband_group_add_node(&head->c_group_root, gp);
gp->c_dev = head->c_dev;
gp->c_target = head->c_target;
- }
+ } else
+ list_add_tail(&gp->c_heads, &dp->g_heads);
spin_unlock_irqrestore(&dp->g_lock, flags);
return 0;
@@ -285,6 +290,8 @@ static void ioband_group_release(struct
list_del(&gp->c_list);
if (head)
rb_erase(&gp->c_group_node, &head->c_group_root);
+ else
+ list_del(&gp->c_heads);
dp->g_group_dtr(gp);
kfree(gp);
}
@@ -1297,6 +1304,191 @@ static struct target_type ioband_target
.iterate_devices = ioband_iterate_devices,
};
+#ifdef CONFIG_CGROUP_BLKIO
+/* Copy the device-mapper name of the ioband group's device into "name". */
+static void ioband_copy_name(struct ioband_group *gp, char *name)
+{
+ struct mapped_device *md;
+
+ md = dm_table_get_md(gp->c_target->table);
+ dm_copy_name_and_uuid(md, name, NULL);
+ dm_put(md);
+}
+
+/* Show all ioband devices and their settings. */
+static void ioband_cgroup_show_device(struct seq_file *m)
+{
+ struct ioband_device *dp;
+ struct ioband_group *gp;
+ char name[DM_NAME_LEN];
+
+ mutex_lock(&ioband_lock);
+ list_for_each_entry(dp, &ioband_device_list, g_list) {
+ seq_printf(m, "%s policy=%s io_throttle=%d io_limit=%d",
+ dp->g_name, dp->g_policy->p_name,
+ dp->g_io_throttle, dp->g_io_limit);
+ if (dp->g_show_device)
+ dp->g_show_device(m, dp);
+ seq_putc(m, '\n');
+
+ list_for_each_entry(gp, &dp->g_heads, c_heads) {
+ if (strcmp(gp->c_type->t_name, "cgroup"))
+ continue;
+ ioband_copy_name(gp, name);
+ seq_printf(m, " %s\n", name);
+ }
+ }
+ mutex_unlock(&ioband_lock);
+}
+
+/* Configure ioband devices specified by an ioband device ID */
+static int ioband_cgroup_config_device(int argc, char **argv)
+{
+ struct ioband_device *dp;
+ struct ioband_group *gp;
+ char name[DM_NAME_LEN];
+ int r;
+
+ if (argc < 1)
+ return -EINVAL;
+
+ mutex_lock(&ioband_lock);
+ /* look up the ioband device */
+ list_for_each_entry(dp, &ioband_device_list, g_list) {
+ /* assuming argv[0] is a share name */
+ if (!strcmp(dp->g_name, argv[0])) {
+ gp = list_first_entry(&dp->g_heads,
+ struct ioband_group, c_heads);
+ goto found;
+ }
+
+ /* assuming argv[0] is a device name */
+ list_for_each_entry(gp, &dp->g_heads, c_heads) {
+ ioband_copy_name(gp, name);
+ if (!strcmp(name, argv[0]))
+ goto found;
+ }
+ }
+ mutex_unlock(&ioband_lock);
+ return -ENODEV;
+
+found:
+ if (!strcmp(gp->c_type->t_name, "cgroup"))
+ r = __ioband_message(gp->c_target, --argc, &argv[1]);
+ else
+ r = -ENODEV;
+
+ mutex_unlock(&ioband_lock);
+ return r;
+}
+
+/* Show the settings of the specified blkio cgroup. */
+static void ioband_cgroup_show_group(struct seq_file *m,
+ struct blkio_cgroup *biog, int id)
+{
+ struct ioband_device *dp;
+ struct ioband_group *head, *gp;
+ struct cgroup *cgrp = biog->css.cgroup;
+ char name[DM_NAME_LEN];
+
+ mutex_lock(&ioband_lock);
+ list_for_each_entry(dp, &ioband_device_list, g_list) {
+ list_for_each_entry(head, &dp->g_heads, c_heads) {
+ if (strcmp(head->c_type->t_name, "cgroup"))
+ continue;
+
+ if (cgrp == cgrp->top_cgroup)
+ gp = head;
+ else {
+ gp = ioband_group_find(head, id);
+ if (!gp)
+ continue;
+ }
+
+ ioband_copy_name(head, name);
+ seq_puts(m, name);
+ if (dp->g_show_group)
+ dp->g_show_group(m, gp);
+ seq_putc(m, '\n');
+ }
+ }
+ mutex_unlock(&ioband_lock);
+}
+
+/* Configure the specified blkio cgroup. */
+static int ioband_cgroup_config_group(int argc, char **argv,
+ struct blkio_cgroup *biog, int id)
+{
+ struct ioband_device *dp;
+ struct ioband_group *head, *gp;
+ struct cgroup *cgrp = biog->css.cgroup;
+ char name[DM_NAME_LEN];
+ int r;
+
+ if (argc != 1 && argc != 2)
+ return -EINVAL;
+
+ mutex_lock(&ioband_lock);
+ list_for_each_entry(dp, &ioband_device_list, g_list) {
+ list_for_each_entry(head, &dp->g_heads, c_heads) {
+ if (strcmp(head->c_type->t_name, "cgroup"))
+ continue;
+ ioband_copy_name(head, name);
+ if (!strcmp(name, argv[0]))
+ goto found;
+ }
+ }
+ mutex_unlock(&ioband_lock);
+ return -ENODEV;
+
+found:
+ if (argc == 1) {
+ if (cgrp == cgrp->top_cgroup)
+ r = -EINVAL;
+ else
+ r = ioband_group_detach(head, id);
+ } else {
+ if (cgrp == cgrp->top_cgroup)
+ gp = head;
+ else
+ gp = ioband_group_find(head, id);
+
+ if (!gp)
+ r = ioband_group_attach(head, id, argv[1]);
+ else
+ r = gp->c_banddev->g_set_param(gp, NULL, argv[1]);
+ }
+
+ mutex_unlock(&ioband_lock);
+ return r;
+}
+
+/* Remove the specified blkio cgroup. */
+static void ioband_cgroup_remove_group(int id)
+{
+ struct ioband_device *dp;
+ struct ioband_group *head;
+
+ mutex_lock(&ioband_lock);
+ list_for_each_entry(dp, &ioband_device_list, g_list) {
+ list_for_each_entry(head, &dp->g_heads, c_heads) {
+ if (strcmp(head->c_type->t_name, "cgroup"))
+ continue;
+ ioband_group_detach(head, id);
+ }
+ }
+ mutex_unlock(&ioband_lock);
+}
+
+static const struct ioband_cgroup_ops ioband_ops = {
+ .show_device = ioband_cgroup_show_device,
+ .config_device = ioband_cgroup_config_device,
+ .show_group = ioband_cgroup_show_group,
+ .config_group = ioband_cgroup_config_group,
+ .remove_group = ioband_cgroup_remove_group,
+};
+#endif
+
static int __init dm_ioband_init(void)
{
int r;
@@ -1304,11 +1496,18 @@ static int __init dm_ioband_init(void)
r = dm_register_target(&ioband_target);
if (r < 0)
DMERR("register failed %d", r);
+#ifdef CONFIG_CGROUP_BLKIO
+ else
+ r = blkio_cgroup_register_ioband(&ioband_ops);
+#endif
return r;
}
static void __exit dm_ioband_exit(void)
{
+#ifdef CONFIG_CGROUP_BLKIO
+ blkio_cgroup_unregister_ioband();
+#endif
dm_unregister_target(&ioband_target);
}
Index: linux-2.6.31-rc3-mm1/drivers/md/dm-ioband.h
===================================================================
--- linux-2.6.31-rc3-mm1.orig/drivers/md/dm-ioband.h
+++ linux-2.6.31-rc3-mm1/drivers/md/dm-ioband.h
@@ -44,6 +44,7 @@ struct ioband_device {
int g_ref;
struct list_head g_list;
+ struct list_head g_heads;
int g_flags;
char g_name[IOBAND_NAME_MAX + 1];
const struct ioband_policy_type *g_policy;
@@ -59,6 +60,8 @@ struct ioband_device {
int (*g_set_param) (struct ioband_group *, const char *, const char *);
int (*g_should_block) (struct ioband_group *);
void (*g_show) (struct ioband_group *, int *, char *, unsigned);
+ void (*g_show_device) (struct seq_file *, struct ioband_device *);
+ void (*g_show_group) (struct seq_file *, struct ioband_group *);
/* members for weight balancing policy */
int g_epoch;
@@ -104,6 +107,7 @@ struct ioband_group_stat {
struct ioband_group {
struct list_head c_list;
+ struct list_head c_heads;
struct ioband_device *c_banddev;
struct dm_dev *c_dev;
struct dm_target *c_target;
@@ -150,6 +154,16 @@ struct ioband_group {
};
+struct blkio_cgroup;
+
+struct ioband_cgroup_ops {
+ void (*show_device)(struct seq_file *);
+ int (*config_device)(int, char **);
+ void (*show_group)(struct seq_file *, struct blkio_cgroup *, int);
+ int (*config_group)(int, char **, struct blkio_cgroup *, int);
+ void (*remove_group)(int);
+};
+
#define IOBAND_URGENT 1
#define DEV_BIO_BLOCKED 1
Index: linux-2.6.31-rc3-mm1/drivers/md/dm-ioband-type.c
===================================================================
--- linux-2.6.31-rc3-mm1.orig/drivers/md/dm-ioband-type.c
+++ linux-2.6.31-rc3-mm1/drivers/md/dm-ioband-type.c
@@ -6,6 +6,7 @@
* This file is released under the GPL.
*/
#include <linux/bio.h>
+#include <linux/biotrack.h>
#include "dm.h"
#include "dm-ioband.h"
@@ -52,14 +53,7 @@ static int ioband_node(struct bio *bio)
static int ioband_cgroup(struct bio *bio)
{
- /*
- * This function should return the ID of the cgroup which
- * issued "bio". The ID of the cgroup which the current
- * process belongs to won't be suitable ID for this purpose,
- * since some BIOs will be handled by kernel threads like aio
- * or pdflush on behalf of the process requesting the BIOs.
- */
- return 0; /* not implemented yet */
+ return get_blkio_cgroup_id(bio);
}
const struct ioband_group_type dm_ioband_group_type[] = {