Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756664AbYGJXtC (ORCPT ); Thu, 10 Jul 2008 19:49:02 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1751442AbYGJXsy (ORCPT ); Thu, 10 Jul 2008 19:48:54 -0400 Received: from fgwmail7.fujitsu.co.jp ([192.51.44.37]:37122 "EHLO fgwmail7.fujitsu.co.jp" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751318AbYGJXsx (ORCPT ); Thu, 10 Jul 2008 19:48:53 -0400 Date: Fri, 11 Jul 2008 08:54:49 +0900 From: KAMEZAWA Hiroyuki To: yamamoto@valinux.co.jp (YAMAMOTO Takashi) Cc: linux-kernel@vger.kernel.org, containers@lists.linux-foundation.org, a.p.zijlstra@chello.nl, menage@google.com Subject: Re: [PATCH][RFC] dirty balancing for cgroups Message-Id: <20080711085449.ba7d14dd.kamezawa.hiroyu@jp.fujitsu.com> In-Reply-To: <20080709060034.0CB2D5A29@siro.lan> References: <20080709060034.0CB2D5A29@siro.lan> Organization: Fujitsu X-Mailer: Sylpheed 2.4.2 (GTK+ 2.10.11; i686-pc-mingw32) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 9751 Lines: 372 On Wed, 9 Jul 2008 15:00:34 +0900 (JST) yamamoto@valinux.co.jp (YAMAMOTO Takashi) wrote: > hi, > > the following patch is a simple implementation of > dirty balancing for cgroups. any comments? > > it depends on the following fix: > http://lkml.org/lkml/2008/7/8/428 > A few comments ;) - This looks simple, but could you merge this into the memory resource controller? (If it conflicts, I'll queue it on my stack.) - Do you have any numbers? Or how can we test that this works well? - Please CC linux-mm. 
Thanks, -Kame > YAMAMOTO Takashi > > > Signed-off-by: YAMAMOTO Takashi > --- > > diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h > index 23c02e2..f5453cc 100644 > --- a/include/linux/cgroup_subsys.h > +++ b/include/linux/cgroup_subsys.h > @@ -52,3 +52,9 @@ SUBSYS(memrlimit_cgroup) > #endif > > /* */ > + > +#ifdef CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR > +SUBSYS(memdirtylimit_cgroup) > +#endif > + > +/* */ > diff --git a/include/linux/memdirtylimitcgroup.h b/include/linux/memdirtylimitcgroup.h > new file mode 100644 > index 0000000..667d312 > --- /dev/null > +++ b/include/linux/memdirtylimitcgroup.h > @@ -0,0 +1,47 @@ > + > +/* > + * memdirtylimitcgroup.h COPYRIGHT FUJITSU LIMITED 2008 > + * > + * Author: yamamoto@valinux.co.jp > + */ > + > +struct task_struct; > + > +#if defined(CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR) > + > +void memdirtylimitcgroup_dirty_inc(struct task_struct *); > +void memdirtylimitcgroup_dirty_limit(struct task_struct *, long *); > +void memdirtylimitcgroup_change_shift(int); > +void memdirtylimitcgroup_init(int); > + > +#else /* defined(CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR) */ > + > +static inline void > +memdirtylimitcgroup_dirty_inc(struct task_struct *t) > +{ > + > + /* nothing */ > +} > + > +static inline void > +memdirtylimitcgroup_dirty_limit(struct task_struct *t, long *dirtyp) > +{ > + > + /* nothing */ > +} > + > +static inline void > +memdirtylimitcgroup_change_shift(int shift) > +{ > + > + /* nothing */ > +} > + > +static inline void > +memdirtylimitcgroup_init(int shift) > +{ > + > + /* nothing */ > +} > + > +#endif /* defined(CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR) */ > diff --git a/init/Kconfig b/init/Kconfig > index 162d462..985bac8 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -418,6 +418,12 @@ config CGROUP_MEMRLIMIT_CTLR > memory RSS and Page Cache control. Virtual address space control > is provided by this controller. 
> > +config CGROUP_MEMDIRTYLIMIT_CTLR > + bool "Memory Dirty Limit Controller for Control Groups" > + depends on CGROUPS && RESOURCE_COUNTERS > + help > + XXX TBD > + > config SYSFS_DEPRECATED > bool > > diff --git a/mm/Makefile b/mm/Makefile > index f54232d..8603d19 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -35,4 +35,5 @@ obj-$(CONFIG_SMP) += allocpercpu.o > obj-$(CONFIG_QUICKLIST) += quicklist.o > obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o > obj-$(CONFIG_CGROUP_MEMRLIMIT_CTLR) += memrlimitcgroup.o > +obj-$(CONFIG_CGROUP_MEMDIRTYLIMIT_CTLR) += memdirtylimitcgroup.o > obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o > diff --git a/mm/memdirtylimitcgroup.c b/mm/memdirtylimitcgroup.c > new file mode 100644 > index 0000000..b70b33d > --- /dev/null > +++ b/mm/memdirtylimitcgroup.c > @@ -0,0 +1,179 @@ > + > +/* > + * memdirtylimitcgroup.c COPYRIGHT FUJITSU LIMITED 2008 > + * > + * Author: yamamoto@valinux.co.jp > + */ > + > +#include > +#include > +#include > +#include > +#include > + > +#include > + > +static struct prop_descriptor vm_cgroup_dirties; > + > +struct memdirtylimit_cgroup { > + struct cgroup_subsys_state dlcg_css; > + spinlock_t dlcg_lock; > + struct prop_local_single dlcg_dirties; > +}; > + > +static struct cgroup_subsys_state * > +task_to_css(struct task_struct *task) > +{ > + > + return task_subsys_state(task, memdirtylimit_cgroup_subsys_id); > +} > + > +static struct memdirtylimit_cgroup * > +css_to_dlcg(struct cgroup_subsys_state *css) > +{ > + > + return container_of(css, struct memdirtylimit_cgroup, dlcg_css); > +} > + > +static struct cgroup_subsys_state * > +cg_to_css(struct cgroup *cg) > +{ > + > + return cgroup_subsys_state(cg, memdirtylimit_cgroup_subsys_id); > +} > + > +static struct memdirtylimit_cgroup * > +cg_to_dlcg(struct cgroup *cg) > +{ > + > + return css_to_dlcg(cg_to_css(cg)); > +} > + > +/* ---------------------------------------- */ > + > +static void > +getfraction(struct memdirtylimit_cgroup *dlcg, long *numeratorp, > 
+ long *denominatorp) > +{ > + > + spin_lock(&dlcg->dlcg_lock); > + prop_fraction_single(&vm_cgroup_dirties, &dlcg->dlcg_dirties, > + numeratorp, denominatorp); > + spin_unlock(&dlcg->dlcg_lock); > +} > + > +/* ---------------------------------------- */ > + > +void > +memdirtylimitcgroup_dirty_inc(struct task_struct *t) > +{ > + struct memdirtylimit_cgroup *dlcg; > + > + rcu_read_lock(); > + dlcg = css_to_dlcg(task_to_css(t)); > + spin_lock(&dlcg->dlcg_lock); > + prop_inc_single(&vm_cgroup_dirties, &dlcg->dlcg_dirties); > + spin_unlock(&dlcg->dlcg_lock); > + rcu_read_unlock(); > +} > + > +void > +memdirtylimitcgroup_dirty_limit(struct task_struct *t, long *dirtyp) > +{ > + struct memdirtylimit_cgroup *dlcg; > + unsigned long dirty = *dirtyp; > + uint64_t tmp; > + long numerator; > + long denominator; > + > + BUG_ON(*dirtyp < 0); > + > + rcu_read_lock(); > + dlcg = css_to_dlcg(task_to_css(t)); > + getfraction(dlcg, &numerator, &denominator); > + rcu_read_unlock(); > + > + tmp = (uint64_t)(dirty >> 1) * numerator; > + do_div(tmp, denominator); > + *dirtyp = dirty - (unsigned long)tmp; > +} > + > +void > +memdirtylimitcgroup_change_shift(int shift) > +{ > + > + prop_change_shift(&vm_cgroup_dirties, shift); > +} > + > +void > +memdirtylimitcgroup_init(int shift) > +{ > + > + prop_descriptor_init(&vm_cgroup_dirties, shift); > +} > + > +/* ---------------------------------------- */ > + > +static u64 > +memdirtylimit_cgroup_read_fraction(struct cgroup *cg, struct cftype *cft) > +{ > + struct memdirtylimit_cgroup *dlcg; > + uint64_t result; > + long numerator; > + long denominator; > + > + dlcg = cg_to_dlcg(cg); > + getfraction(dlcg, &numerator, &denominator); > + result = (uint64_t)100 * numerator; > + do_div(result, denominator); > + return result; > +} > + > +static const struct cftype files[] = { > + { > + .name = "fraction", > + .read_u64 = memdirtylimit_cgroup_read_fraction, > + }, > +}; > + > +static int > +memdirtylimit_cgroup_populate(struct cgroup_subsys *ss, 
struct cgroup *cg) > +{ > + > + return cgroup_add_files(cg, ss, files, ARRAY_SIZE(files)); > +} > + > +static struct cgroup_subsys_state * > +memdirtylimit_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cg) > +{ > + struct memdirtylimit_cgroup *dlcg; > + int error; > + > + dlcg = kzalloc(sizeof(*dlcg), GFP_KERNEL); > + if (dlcg == NULL) > + return ERR_PTR(-ENOMEM); > + error = prop_local_init_single(&dlcg->dlcg_dirties); > + if (error != 0) { > + kfree(dlcg); > + return ERR_PTR(error); > + } > + spin_lock_init(&dlcg->dlcg_lock); > + return &dlcg->dlcg_css; > +} > + > +static void > +memdirtylimit_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cg) > +{ > + struct memdirtylimit_cgroup *dlcg = cg_to_dlcg(cg); > + > + prop_local_destroy_single(&dlcg->dlcg_dirties); > + kfree(dlcg); > +} > + > +struct cgroup_subsys memdirtylimit_cgroup_subsys = { > + .name = "memdirtylimit", > + .subsys_id = memdirtylimit_cgroup_subsys_id, > + .create = memdirtylimit_cgroup_create, > + .destroy = memdirtylimit_cgroup_destroy, > + .populate = memdirtylimit_cgroup_populate, > +}; > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index e6fa69e..f971532 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -34,6 +34,7 @@ > #include > #include > #include > +#include > > /* > * The maximum number of pages to writeout in a single bdflush/kupdate > @@ -152,6 +153,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, > int shift = calc_period_shift(); > prop_change_shift(&vm_completions, shift); > prop_change_shift(&vm_dirties, shift); > + memdirtylimitcgroup_change_shift(shift); > } > return ret; > } > @@ -393,6 +395,8 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, > if (bdi) { > u64 bdi_dirty; > long numerator, denominator; > + long task_dirty; > + long cgroup_dirty; > > /* > * Calculate this BDI's share of the dirty ratio. 
> @@ -408,7 +412,11 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, > > *pbdi_dirty = bdi_dirty; > clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); > - task_dirty_limit(current, pbdi_dirty); > + task_dirty = *pbdi_dirty; > + task_dirty_limit(current, &task_dirty); > + cgroup_dirty = *pbdi_dirty; > + memdirtylimitcgroup_dirty_limit(current, &cgroup_dirty); > + *pbdi_dirty = min(task_dirty, cgroup_dirty); > } > } > > @@ -842,6 +850,7 @@ void __init page_writeback_init(void) > shift = calc_period_shift(); > prop_descriptor_init(&vm_completions, shift); > prop_descriptor_init(&vm_dirties, shift); > + memdirtylimitcgroup_init(shift); > } > > /** > @@ -1105,6 +1114,7 @@ int __set_page_dirty_nobuffers(struct page *page) > } > > task_dirty_inc(current); > + memdirtylimitcgroup_dirty_inc(current); > > return 1; > } > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/