Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755257AbZDNUWg (ORCPT ); Tue, 14 Apr 2009 16:22:36 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1753907AbZDNUVh (ORCPT ); Tue, 14 Apr 2009 16:21:37 -0400 Received: from mail-bw0-f169.google.com ([209.85.218.169]:41193 "EHLO mail-bw0-f169.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753626AbZDNUVe (ORCPT ); Tue, 14 Apr 2009 16:21:34 -0400 DomainKey-Signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=from:to:cc:subject:date:message-id:x-mailer:in-reply-to:references; b=FwWoyeQarG5HVQcZ1yDQgRmXoUK047hP/2Lrx2NtHCWv1zlSQuLxfmCkiZhq6bX9t1 xNX2n/N8i5dNAdq8UDdjexcOurDKIV8XJnOFaASn54DTZE5ArdOXlNcgnWxTv2vQr2AX E9RrHGZzU4SZknv1UDLQFnpUXRgr9c/5tAB2Q= From: Andrea Righi To: Paul Menage Cc: Balbir Singh , Gui Jianfeng , KAMEZAWA Hiroyuki , agk@sourceware.org, akpm@linux-foundation.org, axboe@kernel.dk, baramsori72@gmail.com, Carl Henrik Lunde , dave@linux.vnet.ibm.com, Divyesh Shah , eric.rannaud@gmail.com, fernando@oss.ntt.co.jp, Hirokazu Takahashi , Li Zefan , matt@bluehost.com, dradford@bluehost.com, ngupta@google.com, randy.dunlap@oracle.com, roberto@unbit.it, Ryo Tsuruta , Satoshi UCHIDA , subrata@linux.vnet.ibm.com, yoshikawa.takuya@oss.ntt.co.jp, containers@lists.linux-foundation.org, linux-kernel@vger.kernel.org Subject: [PATCH 3/9] bio-cgroup controller Date: Tue, 14 Apr 2009 22:21:14 +0200 Message-Id: <1239740480-28125-4-git-send-email-righi.andrea@gmail.com> X-Mailer: git-send-email 1.5.6.3 In-Reply-To: <1239740480-28125-1-git-send-email-righi.andrea@gmail.com> References: <1239740480-28125-1-git-send-email-righi.andrea@gmail.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 27561 Lines: 955 From: Ryo Tsuruta From: Ryo Tsuruta With writeback IO processed asynchronously by kernel threads (pdflush) the real writes to the underlying block devices can occur in a different IO context respect to the task that originally generated the dirty pages involved in the IO operation. The controller bio-cgroup is used by io-throttle to track writeback IO and for properly apply throttling. Also apply a patch by Gui Jianfeng to announce tasks moving in bio-cgroup groups. See also: http://people.valinux.co.jp/~ryov/bio-cgroup Signed-off-by: Gui Jianfeng Signed-off-by: Ryo Tsuruta Signed-off-by: Hirokazu Takahashi --- block/blk-ioc.c | 30 ++-- fs/buffer.c | 2 + fs/direct-io.c | 2 + include/linux/biotrack.h | 95 +++++++++++ include/linux/cgroup_subsys.h | 6 + include/linux/iocontext.h | 1 + include/linux/memcontrol.h | 6 + include/linux/mmzone.h | 4 +- include/linux/page_cgroup.h | 13 ++- init/Kconfig | 15 ++ mm/Makefile | 4 +- mm/biotrack.c | 349 +++++++++++++++++++++++++++++++++++++++++ mm/bounce.c | 2 + mm/filemap.c | 2 + mm/memcontrol.c | 5 + mm/memory.c | 5 + mm/page-writeback.c | 2 + mm/page_cgroup.c | 17 ++- mm/swap_state.c | 2 + 19 files changed, 536 insertions(+), 26 deletions(-) create mode 100644 include/linux/biotrack.h create mode 100644 mm/biotrack.c diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 012f065..ef8cac0 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -84,24 +84,28 @@ void exit_io_context(void) } } +void init_io_context(struct io_context *ioc) +{ + atomic_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); + spin_lock_init(&ioc->lock); + ioc->ioprio_changed = 0; + ioc->ioprio = 0; + ioc->last_waited = jiffies; /* doesn't matter... */ + ioc->nr_batch_requests = 0; /* because this is 0 */ + ioc->aic = NULL; + INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); + INIT_HLIST_HEAD(&ioc->cic_list); + ioc->ioc_data = NULL; +} + struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ret; ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); - if (ret) { - atomic_set(&ret->refcount, 1); - atomic_set(&ret->nr_tasks, 1); - spin_lock_init(&ret->lock); - ret->ioprio_changed = 0; - ret->ioprio = 0; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; - INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); - INIT_HLIST_HEAD(&ret->cic_list); - ret->ioc_data = NULL; - } + if (ret) + init_io_context(ret); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 13edf7a..bc72150 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -655,6 +656,7 @@ static void __set_page_dirty(struct page *page, if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); + bio_cgroup_reset_owner_pagedirty(page, current->mm); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/fs/direct-io.c b/fs/direct-io.c index da258e7..ec42362 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -799,6 +800,7 @@ static int do_direct_IO(struct dio *dio) ret = PTR_ERR(page); goto out; } + bio_cgroup_reset_owner(page, current->mm); while (block_in_page < blocks_per_page) { unsigned offset_in_page = block_in_page << blkbits; diff --git a/include/linux/biotrack.h b/include/linux/biotrack.h new file mode 100644 index 0000000..25b8810 --- /dev/null +++ b/include/linux/biotrack.h @@ -0,0 +1,95 @@ +#include +#include +#include + +#ifndef _LINUX_BIOTRACK_H +#define _LINUX_BIOTRACK_H + +#ifdef CONFIG_CGROUP_BIO + +struct tsk_move_msg { + int old_id; + int new_id; + struct task_struct *tsk; +}; + +extern int register_biocgroup_notifier(struct notifier_block *nb); +extern int unregister_biocgroup_notifier(struct notifier_block *nb); + +struct io_context; +struct block_device; + +struct bio_cgroup { + struct cgroup_subsys_state css; + int id; + struct io_context *io_context; /* default io_context */ +/* struct radix_tree_root io_context_root; per device io_context */ +}; + +static inline void __init_bio_page_cgroup(struct page_cgroup *pc) +{ + pc->bio_cgroup_id = 0; +} + +extern struct cgroup *get_cgroup_from_page(struct page *page); +extern void put_cgroup_from_page(struct page *page); +extern struct cgroup *bio_id_to_cgroup(int id); + +static inline int bio_cgroup_disabled(void) +{ + return bio_cgroup_subsys.disabled; +} + +extern void bio_cgroup_set_owner(struct page *page, struct mm_struct *mm); +extern void bio_cgroup_reset_owner(struct page *page, struct mm_struct *mm); +extern void bio_cgroup_reset_owner_pagedirty(struct page *page, + struct mm_struct *mm); +extern void bio_cgroup_copy_owner(struct page *page, struct page *opage); + +extern struct io_context *get_bio_cgroup_iocontext(struct bio *bio); +extern int get_bio_cgroup_id(struct bio *bio); + +#else /* CONFIG_CGROUP_BIO */ + +struct bio_cgroup; + +static inline void __init_bio_page_cgroup(struct page_cgroup *pc) +{ +} + +static inline int bio_cgroup_disabled(void) +{ + return 1; +} + +static inline void bio_cgroup_set_owner(struct page *page, struct mm_struct *mm) +{ +} + +static inline void bio_cgroup_reset_owner(struct page *page, + struct mm_struct *mm) +{ +} + +static inline void bio_cgroup_reset_owner_pagedirty(struct page *page, + struct mm_struct *mm) +{ +} + +static inline void bio_cgroup_copy_owner(struct page *page, struct page *opage) +{ +} + +static inline struct io_context *get_bio_cgroup_iocontext(struct bio *bio) +{ + return NULL; +} + +static inline int get_bio_cgroup_id(struct bio *bio) +{ + return 0; +} + +#endif /* CONFIG_CGROUP_BIO */ + +#endif /* _LINUX_BIOTRACK_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c8d31b..5df23f8 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -43,6 +43,12 @@ SUBSYS(mem_cgroup) /* */ +#ifdef CONFIG_CGROUP_BIO +SUBSYS(bio_cgroup) +#endif + +/* */ + #ifdef CONFIG_CGROUP_DEVICE SUBSYS(devices) #endif diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 08b987b..be37c27 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -104,6 +104,7 @@ int put_io_context(struct io_context *ioc); void exit_io_context(void); struct io_context *get_io_context(gfp_t gfp_flags, int node); struct io_context *alloc_io_context(gfp_t gfp_flags, int node); +void init_io_context(struct io_context *ioc); void copy_io_context(struct io_context **pdst, struct io_context **psrc); #else static inline void exit_io_context(void) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 18146c9..f3e0e64 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -37,6 +37,8 @@ struct mm_struct; * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) */ +extern void __init_mem_page_cgroup(struct page_cgroup *pc); + extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ @@ -120,6 +122,10 @@ extern bool mem_cgroup_oom_called(struct task_struct *task); #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; +static inline void __init_mem_page_cgroup(struct page_cgroup *pc) +{ +} + static inline int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 186ec6a..47a6f55 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -607,7 +607,7 @@ typedef struct pglist_data { int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_CGROUP_PAGE struct page_cgroup *node_page_cgroup; #endif #endif @@ -958,7 +958,7 @@ struct mem_section { /* See declaration of similar field in struct zone */ unsigned long *pageblock_flags; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_CGROUP_PAGE /* * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use * section. (see memcontrol.h/page_cgroup.h about this.) diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 7339c7b..a7249bb 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -1,7 +1,7 @@ #ifndef __LINUX_PAGE_CGROUP_H #define __LINUX_PAGE_CGROUP_H -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_CGROUP_PAGE #include /* * Page Cgroup can be considered as an extended mem_map. @@ -12,9 +12,16 @@ */ struct page_cgroup { unsigned long flags; - struct mem_cgroup *mem_cgroup; struct page *page; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + struct mem_cgroup *mem_cgroup; +#endif +#ifdef CONFIG_CGROUP_BIO + int bio_cgroup_id; +#endif +#if defined(CONFIG_CGROUP_MEM_RES_CTLR) || defined(CONFIG_CGROUP_BIO) struct list_head lru; /* per cgroup LRU list */ +#endif }; void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); @@ -71,7 +78,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) bit_spin_unlock(PCG_LOCK, &pc->flags); } -#else /* CONFIG_CGROUP_MEM_RES_CTLR */ +#else /* CONFIG_CGROUP_PAGE */ struct page_cgroup; static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) diff --git a/init/Kconfig b/init/Kconfig index 7be4d38..8f7b23c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -606,8 +606,23 @@ config CGROUP_MEM_RES_CTLR_SWAP Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. +config CGROUP_BIO + bool "Block I/O cgroup subsystem" + depends on CGROUPS && BLOCK + select MM_OWNER + help + Provides a Resource Controller which enables to track the onwner + of every Block I/O requests. + The information this subsystem provides can be used from any + kind of module such as dm-ioband device mapper modules or + the cfq-scheduler. + endif # CGROUPS +config CGROUP_PAGE + def_bool y + depends on CGROUP_MEM_RES_CTLR || CGROUP_BIO + config MM_OWNER bool diff --git a/mm/Makefile b/mm/Makefile index ec73c68..a78a437 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,4 +37,6 @@ else obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o +obj-$(CONFIG_CGROUP_PAGE) += page_cgroup.o +obj-$(CONFIG_CGROUP_BIO) += biotrack.o diff --git a/mm/biotrack.c b/mm/biotrack.c new file mode 100644 index 0000000..d3a35f1 --- /dev/null +++ b/mm/biotrack.c @@ -0,0 +1,349 @@ +/* biotrack.c - Block I/O Tracking + * + * Copyright (C) VA Linux Systems Japan, 2008 + * Developed by Hirokazu Takahashi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +#define MOVETASK 0 +static BLOCKING_NOTIFIER_HEAD(biocgroup_chain); + +int register_biocgroup_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&biocgroup_chain, nb); +} +EXPORT_SYMBOL(register_biocgroup_notifier); + +int unregister_biocgroup_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&biocgroup_chain, nb); +} +EXPORT_SYMBOL(unregister_biocgroup_notifier); + +/* + * The block I/O tracking mechanism is implemented on the cgroup memory + * controller framework. It helps to find the the owner of an I/O request + * because every I/O request has a target page and the owner of the page + * can be easily determined on the framework. + */ + +/* Return the bio_cgroup that associates with a cgroup. */ +static inline struct bio_cgroup *cgroup_bio(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, bio_cgroup_subsys_id), + struct bio_cgroup, css); +} + +/* Return the bio_cgroup that associates with a process. */ +static inline struct bio_cgroup *bio_cgroup_from_task(struct task_struct *p) +{ + return container_of(task_subsys_state(p, bio_cgroup_subsys_id), + struct bio_cgroup, css); +} + +static struct idr bio_cgroup_id; +static DEFINE_SPINLOCK(bio_cgroup_idr_lock); +static struct io_context default_bio_io_context; +static struct bio_cgroup default_bio_cgroup = { + .id = 0, + .io_context = &default_bio_io_context, +}; + +/* + * This function is used to make a given page have the bio-cgroup id of + * the owner of this page. + */ +void bio_cgroup_set_owner(struct page *page, struct mm_struct *mm) +{ + struct bio_cgroup *biog; + struct page_cgroup *pc; + + if (bio_cgroup_disabled()) + return; + pc = lookup_page_cgroup(page); + if (unlikely(!pc)) + return; + + pc->bio_cgroup_id = 0; /* 0: default bio_cgroup id */ + if (!mm) + return; + /* + * Locking "pc" isn't necessary here since the current process is + * the only one that can access the members related to bio_cgroup. + */ + rcu_read_lock(); + biog = bio_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!biog)) + goto out; + /* + * css_get(&bio->css) isn't called to increment the reference + * count of this bio_cgroup "biog" so pc->bio_cgroup_id might turn + * invalid even if this page is still active. + * This approach is chosen to minimize the overhead. + */ + pc->bio_cgroup_id = biog->id; +out: + rcu_read_unlock(); +} + +/* + * Change the owner of a given page if necessary. + */ +void bio_cgroup_reset_owner(struct page *page, struct mm_struct *mm) +{ + /* + * A little trick: + * Just call bio_cgroup_set_owner() for pages which are already + * active since the bio_cgroup_id member of page_cgroup can be + * updated without any locks. This is because an integer type of + * variable can be set a new value at once on modern cpus. + */ + bio_cgroup_set_owner(page, mm); +} + +/* + * Change the owner of a given page. This function is only effective for + * pages in the pagecache. + */ +void bio_cgroup_reset_owner_pagedirty(struct page *page, struct mm_struct *mm) +{ + if (PageSwapCache(page) || PageAnon(page)) + return; + if (current->flags & PF_MEMALLOC) + return; + + bio_cgroup_reset_owner(page, mm); +} + +/* + * Assign "page" the same owner as "opage." + */ +void bio_cgroup_copy_owner(struct page *npage, struct page *opage) +{ + struct page_cgroup *npc, *opc; + + if (bio_cgroup_disabled()) + return; + npc = lookup_page_cgroup(npage); + if (unlikely(!npc)) + return; + opc = lookup_page_cgroup(opage); + if (unlikely(!opc)) + return; + + /* + * Do this without any locks. The reason is the same as + * bio_cgroup_reset_owner(). + */ + npc->bio_cgroup_id = opc->bio_cgroup_id; +} + +/* Create a new bio-cgroup. */ +static struct cgroup_subsys_state * +bio_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct bio_cgroup *biog; + struct io_context *ioc; + int ret; + + if (!cgrp->parent) { + biog = &default_bio_cgroup; + init_io_context(biog->io_context); + /* Increment the referrence count not to be released ever. */ + atomic_inc(&biog->io_context->refcount); + idr_init(&bio_cgroup_id); + return &biog->css; + } + + biog = kzalloc(sizeof(*biog), GFP_KERNEL); + ioc = alloc_io_context(GFP_KERNEL, -1); + if (!ioc || !biog) { + ret = -ENOMEM; + goto out_err; + } + biog->io_context = ioc; +retry: + if (!idr_pre_get(&bio_cgroup_id, GFP_KERNEL)) { + ret = -EAGAIN; + goto out_err; + } + spin_lock_irq(&bio_cgroup_idr_lock); + ret = idr_get_new_above(&bio_cgroup_id, (void *)biog, 1, &biog->id); + spin_unlock_irq(&bio_cgroup_idr_lock); + if (ret == -EAGAIN) + goto retry; + else if (ret) + goto out_err; + + return &biog->css; +out_err: + kfree(biog); + if (ioc) + put_io_context(ioc); + return ERR_PTR(ret); +} + +/* Delete the bio-cgroup. */ +static void bio_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct bio_cgroup *biog = cgroup_bio(cgrp); + + put_io_context(biog->io_context); + + spin_lock_irq(&bio_cgroup_idr_lock); + idr_remove(&bio_cgroup_id, biog->id); + spin_unlock_irq(&bio_cgroup_idr_lock); + + kfree(biog); +} + +static struct bio_cgroup *find_bio_cgroup(int id) +{ + struct bio_cgroup *biog; + spin_lock_irq(&bio_cgroup_idr_lock); + /* + * It might fail to find A bio-group associated with "id" since it + * is allowed to remove the bio-cgroup even when some of I/O requests + * this group issued haven't completed yet. + */ + biog = (struct bio_cgroup *)idr_find(&bio_cgroup_id, id); + spin_unlock_irq(&bio_cgroup_idr_lock); + return biog; +} + +struct cgroup *bio_id_to_cgroup(int id) +{ + struct bio_cgroup *biog; + + biog = find_bio_cgroup(id); + if (biog) + return biog->css.cgroup; + + return NULL; +} + +struct cgroup *get_cgroup_from_page(struct page *page) +{ + struct page_cgroup *pc; + struct bio_cgroup *biog; + struct cgroup *cgrp = NULL; + + pc = lookup_page_cgroup(page); + if (!pc) + return NULL; + lock_page_cgroup(pc); + biog = find_bio_cgroup(pc->bio_cgroup_id); + if (biog) { + css_get(&biog->css); + cgrp = biog->css.cgroup; + } + unlock_page_cgroup(pc); + return cgrp; +} + +void put_cgroup_from_page(struct page *page) +{ + struct bio_cgroup *biog; + struct page_cgroup *pc; + + pc = lookup_page_cgroup(page); + if (!pc) + return; + lock_page_cgroup(pc); + biog = find_bio_cgroup(pc->bio_cgroup_id); + if (biog) + css_put(&biog->css); + unlock_page_cgroup(pc); +} + +/* Determine the bio-cgroup id of a given bio. */ +int get_bio_cgroup_id(struct bio *bio) +{ + struct page_cgroup *pc; + struct page *page = bio_iovec_idx(bio, 0)->bv_page; + int id = 0; + + pc = lookup_page_cgroup(page); + if (pc) + id = pc->bio_cgroup_id; + return id; +} +EXPORT_SYMBOL(get_bio_cgroup_id); + +/* Determine the iocontext of the bio-cgroup that issued a given bio. */ +struct io_context *get_bio_cgroup_iocontext(struct bio *bio) +{ + struct bio_cgroup *biog = NULL; + struct io_context *ioc; + int id = 0; + + id = get_bio_cgroup_id(bio); + if (id) + biog = find_bio_cgroup(id); + if (!biog) + biog = &default_bio_cgroup; + ioc = biog->io_context; /* default io_context for this cgroup */ + atomic_inc(&ioc->refcount); + return ioc; +} +EXPORT_SYMBOL(get_bio_cgroup_iocontext); + +static u64 bio_id_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct bio_cgroup *biog = cgroup_bio(cgrp); + return (u64) biog->id; +} + + +static struct cftype bio_files[] = { + { + .name = "id", + .read_u64 = bio_id_read, + }, +}; + +static int bio_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, bio_files, ARRAY_SIZE(bio_files)); +} + +static void bio_cgroup_attach(struct cgroup_subsys *ss, + struct cgroup *cont, struct cgroup *oldcont, + struct task_struct *tsk) +{ + struct tsk_move_msg tmm; + struct bio_cgroup *old_biog, *new_biog; + + old_biog = cgroup_bio(oldcont); + new_biog = cgroup_bio(cont); + tmm.old_id = old_biog->id; + tmm.new_id = new_biog->id; + tmm.tsk = tsk; + blocking_notifier_call_chain(&biocgroup_chain, MOVETASK, &tmm); +} + +struct cgroup_subsys bio_cgroup_subsys = { + .name = "bio", + .create = bio_cgroup_create, + .destroy = bio_cgroup_destroy, + .populate = bio_cgroup_populate, + .attach = bio_cgroup_attach, + .subsys_id = bio_cgroup_subsys_id, +}; + diff --git a/mm/bounce.c b/mm/bounce.c index e590272..1a01905 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -212,6 +213,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, to->bv_len = from->bv_len; to->bv_offset = from->bv_offset; inc_zone_page_state(to->bv_page, NR_BOUNCE); + bio_cgroup_copy_owner(to->bv_page, page); if (rw == WRITE) { char *vto, *vfrom; diff --git a/mm/filemap.c b/mm/filemap.c index 8bd4980..1ab32a2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include #include /* for BUG_ON(!in_atomic()) only */ #include +#include #include /* for page_is_file_cache() */ #include "internal.h" @@ -463,6 +464,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, gfp_mask & GFP_RECLAIM_MASK); if (error) goto out; + bio_cgroup_set_owner(page, current->mm); error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e44fb0f..c25eb63 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2524,6 +2524,11 @@ struct cgroup_subsys mem_cgroup_subsys = { .use_id = 1, }; +void __meminit __init_mem_page_cgroup(struct page_cgroup *pc) +{ + pc->mem_cgroup = NULL; +} + #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP static int __init disable_swap_account(char *s) diff --git a/mm/memory.c b/mm/memory.c index cf6873e..7779e12 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -2052,6 +2053,7 @@ gotten: * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); + bio_cgroup_set_owner(new_page, mm); page_add_new_anon_rmap(new_page, vma, address); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); @@ -2497,6 +2499,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + bio_cgroup_reset_owner(page, mm); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); @@ -2559,6 +2562,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; inc_mm_counter(mm, anon_rss); + bio_cgroup_set_owner(page, mm); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2711,6 +2715,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter(mm, anon_rss); + bio_cgroup_set_owner(page, mm); page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 30351f0..1379eb0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1243,6 +1244,7 @@ int __set_page_dirty_nobuffers(struct page *page) BUG_ON(mapping2 != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); + bio_cgroup_reset_owner_pagedirty(page, current->mm); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 791905c..f692ee2 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -9,13 +9,16 @@ #include #include #include +#include +#include static void __meminit __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) { pc->flags = 0; - pc->mem_cgroup = NULL; pc->page = pfn_to_page(pfn); + __init_mem_page_cgroup(pc); + __init_bio_page_cgroup(pc); INIT_LIST_HEAD(&pc->lru); } static unsigned long total_usage; @@ -74,7 +77,7 @@ void __init page_cgroup_init(void) int nid, fail; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled() && bio_cgroup_disabled()) return; for_each_online_node(nid) { @@ -83,12 +86,12 @@ void __init page_cgroup_init(void) goto fail; } printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try cgroup_disable=memory option if you" + printk(KERN_INFO "please try cgroup_disable=memory,bio option if you" " don't want\n"); return; fail: printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); - printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); + printk(KERN_CRIT "please try cgroup_disable=memory,bio boot options\n"); panic("Out of memory"); } @@ -248,7 +251,7 @@ void __init page_cgroup_init(void) unsigned long pfn; int fail = 0; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled() && bio_cgroup_disabled()) return; for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { @@ -263,8 +266,8 @@ void __init page_cgroup_init(void) hotplug_memory_notifier(page_cgroup_callback, 0); } printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try cgroup_disable=memory option if you don't" - " want\n"); + printk(KERN_INFO + "try cgroup_disable=memory,bio option if you don't want\n"); } void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) diff --git a/mm/swap_state.c b/mm/swap_state.c index 3ecea98..c7ad256 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -308,6 +309,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ __set_page_locked(new_page); SetPageSwapBacked(new_page); + bio_cgroup_set_owner(new_page, current->mm); err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { /* -- 1.5.6.3 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/