2010-02-26 22:53:01

by Andrea Righi

Subject: [PATCH -mmotm 0/2] memcg: per cgroup dirty limit (v2)

Control the maximum number of dirty pages a cgroup can have at any given time.

The per-cgroup dirty limit caps the amount of dirty (hard to reclaim) page cache
that any single cgroup may use. With multiple cgroup writers, no cgroup can
consume more than its designated share of dirty pages, and each one is forced
to perform write-out once it crosses that limit.

The overall design is the following:

- account dirty pages per cgroup
- limit the number of dirty pages via memory.dirty_ratio / memory.dirty_bytes
and memory.dirty_background_ratio / memory.dirty_background_bytes in
cgroupfs
- start to write-out (background or actively) when the cgroup limits are
exceeded

This feature is meant to work closely with any underlying IO controller
implementation, so that the VM layer can stop accumulating dirty pages and
enforce a write-out before any single cgroup consumes the global share of
dirty pages.
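
As a rough illustration of how the new knobs are meant to be used (illustration
only: the /cgroups mount point, the "foo" group and the small C helper below are
assumptions of this example, not part of the patches), a group could be confined
like this from userspace:

/* Sketch: set memory.dirty_ratio=10 and memory.dirty_background_ratio=5
 * for an already-created cgroup "foo" mounted under /cgroups. */
#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* start active write-out when 10% of the cgroup memory is dirty */
	if (write_knob("/cgroups/foo/memory.dirty_ratio", "10"))
		return 1;
	/* start background write-out at 5% */
	if (write_knob("/cgroups/foo/memory.dirty_background_ratio", "5"))
		return 1;
	return 0;
}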

Changelog (v1 -> v2)
~~~~~~~~~~~~~~~~~~~~~~
* rebased to -mmotm
* properly handle hierarchical accounting
* added per-cgroup interfaces mirroring the system-wide ones to set dirty limits
(memory.dirty_ratio / memory.dirty_bytes, memory.dirty_background_ratio, memory.dirty_background_bytes)
* other minor fixes and improvements based on the feedback received

TODO:
- handle the migration of tasks across different cgroups (maybe by adding
DIRTY/WRITEBACK/UNSTABLE flags to struct page_cgroup; see the sketch below)
- provide appropriate documentation (in Documentation/cgroups/memory.txt)
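
For the first TODO item, one possible direction (purely a sketch; the PCG_*
names below are hypothetical and not part of this series) would be to mark the
dirty/writeback/unstable state in the per-page page_cgroup flags, so that the
accounting can be transferred when a task, and its pages, move between cgroups:

/* Hypothetical additions to the page_cgroup flag bits (sketch only). */
enum {
	PCG_DIRTY,		/* page accounted as dirty in its memcg */
	PCG_WRITEBACK,		/* page accounted as under writeback */
	PCG_UNSTABLE_NFS,	/* page accounted as NFS unstable */
};

/*
 * On migration, the charge mover could then walk the pages being moved and,
 * for each flag that is set, subtract the statistic from the old memcg and
 * add it to the new one.
 */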


2010-02-26 22:53:15

by Andrea Righi

Subject: [PATCH -mmotm 1/2] memcg: dirty pages accounting and limiting infrastructure

Infrastructure to account dirty pages per cgroup and add dirty limit
interfaces in the cgroupfs:

- Active write-out: memory.dirty_ratio, memory.dirty_bytes
- Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes

Signed-off-by: Andrea Righi <[email protected]>
---
include/linux/memcontrol.h | 74 +++++++++-
mm/memcontrol.c | 354 ++++++++++++++++++++++++++++++++++++++++----
2 files changed, 399 insertions(+), 29 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1f9b119..e6af95c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,6 +25,41 @@ struct page_cgroup;
struct page;
struct mm_struct;

+/* Cgroup memory statistics items exported to the kernel */
+enum mem_cgroup_page_stat_item {
+ MEMCG_NR_DIRTYABLE_PAGES,
+ MEMCG_NR_RECLAIM_PAGES,
+ MEMCG_NR_WRITEBACK,
+ MEMCG_NR_DIRTY_WRITEBACK_PAGES,
+};
+
+/*
+ * Statistics for memory cgroup.
+ */
+enum mem_cgroup_stat_index {
+ /*
+ * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+ */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
+ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
+ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
+ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
+ MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
+ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+ MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
+ used by soft limit implementation */
+ MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
+ used by threshold implementation */
+ MEM_CGROUP_STAT_FILE_DIRTY, /* # of dirty pages in page cache */
+ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
+ MEM_CGROUP_STAT_WRITEBACK_TEMP, /* # of pages under writeback using
+ temporary buffers */
+ MEM_CGROUP_STAT_UNSTABLE_NFS, /* # of NFS unstable pages */
+
+ MEM_CGROUP_STAT_NSTATS,
+};
+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
* All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -117,6 +152,13 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
extern int do_swap_account;
#endif

+extern long mem_cgroup_dirty_ratio(void);
+extern unsigned long mem_cgroup_dirty_bytes(void);
+extern long mem_cgroup_dirty_background_ratio(void);
+extern unsigned long mem_cgroup_dirty_background_bytes(void);
+
+extern s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item);
+
static inline bool mem_cgroup_disabled(void)
{
if (mem_cgroup_subsys.disabled)
@@ -125,7 +167,8 @@ static inline bool mem_cgroup_disabled(void)
}

extern bool mem_cgroup_oom_called(struct task_struct *task);
-void mem_cgroup_update_file_mapped(struct page *page, int val);
+void mem_cgroup_update_stat(struct page *page,
+ enum mem_cgroup_stat_index idx, int val);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
int zid);
@@ -300,8 +343,8 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
}

-static inline void mem_cgroup_update_file_mapped(struct page *page,
- int val)
+static inline void mem_cgroup_update_stat(struct page *page,
+ enum mem_cgroup_stat_index idx, int val)
{
}

@@ -312,6 +355,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return 0;
}

+static inline long mem_cgroup_dirty_ratio(void)
+{
+ return vm_dirty_ratio;
+}
+
+static inline unsigned long mem_cgroup_dirty_bytes(void)
+{
+ return vm_dirty_bytes;
+}
+
+static inline long mem_cgroup_dirty_background_ratio(void)
+{
+ return dirty_background_ratio;
+}
+
+static inline unsigned long mem_cgroup_dirty_background_bytes(void)
+{
+ return dirty_background_bytes;
+}
+
+static inline s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
+{
+ return -ENOMEM;
+}
+
#endif /* CONFIG_CGROUP_MEM_CONT */

#endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a443c30..56f3204 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,31 +66,16 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
#define SOFTLIMIT_EVENTS_THRESH (1000)
#define THRESHOLDS_EVENTS_THRESH (100)

-/*
- * Statistics for memory cgroup.
- */
-enum mem_cgroup_stat_index {
- /*
- * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
- */
- MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
- MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
- MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
- MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
- MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
- MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
- MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
- used by soft limit implementation */
- MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
- used by threshold implementation */
-
- MEM_CGROUP_STAT_NSTATS,
-};
-
struct mem_cgroup_stat_cpu {
s64 count[MEM_CGROUP_STAT_NSTATS];
};

+/* Per cgroup page statistics */
+struct mem_cgroup_page_stat {
+ enum mem_cgroup_page_stat_item item;
+ s64 value;
+};
+
/*
* per-zone information in memory controller.
*/
@@ -205,6 +190,14 @@ struct mem_cgroup {

unsigned int swappiness;

+ /* control memory cgroup dirty pages */
+ long dirty_ratio;
+ unsigned long dirty_bytes;
+
+ /* control background writeback (via writeback threads) */
+ long dirty_background_ratio;
+ unsigned long dirty_background_bytes;
+
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;

@@ -1021,6 +1014,169 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}

+enum mem_cgroup_dirty_param {
+ MEM_CGROUP_DIRTY_RATIO,
+ MEM_CGROUP_DIRTY_BYTES,
+ MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
+ MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
+};
+
+static unsigned long get_dirty_param(struct mem_cgroup *memcg,
+ enum mem_cgroup_dirty_param idx)
+{
+ unsigned long ret;
+
+ spin_lock(&memcg->reclaim_param_lock);
+ switch (idx) {
+ case MEM_CGROUP_DIRTY_RATIO:
+ ret = memcg->dirty_ratio;
+ break;
+ case MEM_CGROUP_DIRTY_BYTES:
+ ret = memcg->dirty_bytes;
+ break;
+ case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
+ ret = memcg->dirty_background_ratio;
+ break;
+ case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
+ ret = memcg->dirty_background_bytes;
+ break;
+ default:
+ VM_BUG_ON(1);
+ }
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ return ret;
+}
+
+long mem_cgroup_dirty_ratio(void)
+{
+ struct mem_cgroup *memcg;
+ long ret = vm_dirty_ratio;
+
+ if (mem_cgroup_disabled())
+ goto out;
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (likely(memcg))
+ ret = get_dirty_param(memcg, MEM_CGROUP_DIRTY_RATIO);
+ rcu_read_unlock();
+out:
+ return ret;
+}
+
+unsigned long mem_cgroup_dirty_bytes(void)
+{
+ struct mem_cgroup *memcg;
+ unsigned long ret = vm_dirty_bytes;
+
+ if (mem_cgroup_disabled())
+ goto out;
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (likely(memcg))
+ ret = get_dirty_param(memcg, MEM_CGROUP_DIRTY_BYTES);
+ rcu_read_unlock();
+out:
+ return ret;
+}
+
+long mem_cgroup_dirty_background_ratio(void)
+{
+ struct mem_cgroup *memcg;
+ long ret = dirty_background_ratio;
+
+ if (mem_cgroup_disabled())
+ goto out;
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (likely(memcg))
+ ret = get_dirty_param(memcg, MEM_CGROUP_DIRTY_BACKGROUND_RATIO);
+ rcu_read_unlock();
+out:
+ return ret;
+}
+
+unsigned long mem_cgroup_dirty_background_bytes(void)
+{
+ struct mem_cgroup *memcg;
+ unsigned long ret = dirty_background_bytes;
+
+ if (mem_cgroup_disabled())
+ goto out;
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (likely(memcg))
+ ret = get_dirty_param(memcg, MEM_CGROUP_DIRTY_BACKGROUND_BYTES);
+ rcu_read_unlock();
+out:
+ return ret;
+}
+
+static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_page_stat_item item)
+{
+ s64 ret;
+
+ switch (item) {
+ case MEMCG_NR_DIRTYABLE_PAGES:
+ ret = res_counter_read_u64(&memcg->res, RES_LIMIT) -
+ res_counter_read_u64(&memcg->res, RES_USAGE);
+ /* Translate free memory in pages */
+ ret >>= PAGE_SHIFT;
+ ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
+ mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
+ mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON) +
+ mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
+ break;
+ case MEMCG_NR_RECLAIM_PAGES:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
+ mem_cgroup_read_stat(memcg,
+ MEM_CGROUP_STAT_UNSTABLE_NFS);
+ break;
+ case MEMCG_NR_WRITEBACK:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ break;
+ case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
+ ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
+ mem_cgroup_read_stat(memcg,
+ MEM_CGROUP_STAT_UNSTABLE_NFS);
+ break;
+ default:
+ ret = 0;
+ WARN_ON_ONCE(1);
+ }
+ return ret;
+}
+
+static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
+{
+ struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
+
+ stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
+ return 0;
+}
+
+s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
+{
+ struct mem_cgroup_page_stat stat = {};
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (memcg) {
+ /*
+ * Recursively evaluate page statistics against all cgroups
+ * under the hierarchy tree
+ */
+ stat.item = item;
+ mem_cgroup_walk_tree(memcg, &stat, mem_cgroup_page_stat_cb);
+ } else
+ stat.value = -ENOMEM;
+ rcu_read_unlock();
+
+ return stat.value;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -1263,10 +1419,10 @@ static void record_last_oom(struct mem_cgroup *mem)
}

/*
- * Currently used to update mapped file statistics, but the routine can be
- * generalized to update other statistics as well.
+ * Generalized routine to update memory cgroup statistics.
*/
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+void mem_cgroup_update_stat(struct page *page,
+ enum mem_cgroup_stat_index idx, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
@@ -1286,7 +1442,8 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
/*
* Preemption is already disabled. We can use __this_cpu_xxx
*/
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
+ VM_BUG_ON(idx >= MEM_CGROUP_STAT_NSTATS);
+ __this_cpu_add(mem->stat->count[idx], val);

done:
unlock_page_cgroup(pc);
@@ -3033,6 +3190,10 @@ enum {
MCS_PGPGIN,
MCS_PGPGOUT,
MCS_SWAP,
+ MCS_FILE_DIRTY,
+ MCS_WRITEBACK,
+ MCS_WRITEBACK_TEMP,
+ MCS_UNSTABLE_NFS,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
@@ -3055,6 +3216,10 @@ struct {
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
{"swap", "total_swap"},
+ {"filedirty", "dirty_pages"},
+ {"writeback", "writeback_pages"},
+ {"writeback_tmp", "writeback_temp_pages"},
+ {"nfs", "nfs_unstable"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
@@ -3083,6 +3248,14 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
+ s->stat[MCS_FILE_DIRTY] += val;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
+ s->stat[MCS_WRITEBACK] += val;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
+ s->stat[MCS_WRITEBACK_TEMP] += val;
+ val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
+ s->stat[MCS_UNSTABLE_NFS] += val;

/* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3467,6 +3640,100 @@ unlock:
return ret;
}

+static u64 mem_cgroup_dirty_ratio_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ return get_dirty_param(memcg, MEM_CGROUP_DIRTY_RATIO);
+}
+
+static int
+mem_cgroup_dirty_ratio_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ if ((cgrp->parent == NULL) || (val > 100))
+ return -EINVAL;
+
+ spin_lock(&memcg->reclaim_param_lock);
+ memcg->dirty_ratio = val;
+ memcg->dirty_bytes = 0;
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ return 0;
+}
+
+static u64 mem_cgroup_dirty_bytes_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ return get_dirty_param(memcg, MEM_CGROUP_DIRTY_BYTES);
+}
+
+static int
+mem_cgroup_dirty_bytes_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ if (cgrp->parent == NULL)
+ return -EINVAL;
+
+ spin_lock(&memcg->reclaim_param_lock);
+ memcg->dirty_ratio = 0;
+ memcg->dirty_bytes = val;
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ return 0;
+}
+
+static u64
+mem_cgroup_dirty_background_ratio_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ return get_dirty_param(memcg, MEM_CGROUP_DIRTY_BACKGROUND_RATIO);
+}
+
+static int mem_cgroup_dirty_background_ratio_write(struct cgroup *cgrp,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ if ((cgrp->parent == NULL) || (val > 100))
+ return -EINVAL;
+
+ spin_lock(&memcg->reclaim_param_lock);
+ memcg->dirty_background_ratio = val;
+ memcg->dirty_background_bytes = 0;
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ return 0;
+}
+
+static u64
+mem_cgroup_dirty_background_bytes_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ return get_dirty_param(memcg, MEM_CGROUP_DIRTY_BACKGROUND_BYTES);
+}
+
+static int mem_cgroup_dirty_background_bytes_write(struct cgroup *cgrp,
+ struct cftype *cft, u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ if (cgrp->parent == NULL)
+ return -EINVAL;
+
+ spin_lock(&memcg->reclaim_param_lock);
+ memcg->dirty_background_ratio = 0;
+ memcg->dirty_background_bytes = val;
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ return 0;
+}
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -3518,6 +3785,26 @@ static struct cftype mem_cgroup_files[] = {
.write_u64 = mem_cgroup_swappiness_write,
},
{
+ .name = "dirty_ratio",
+ .read_u64 = mem_cgroup_dirty_ratio_read,
+ .write_u64 = mem_cgroup_dirty_ratio_write,
+ },
+ {
+ .name = "dirty_bytes",
+ .read_u64 = mem_cgroup_dirty_bytes_read,
+ .write_u64 = mem_cgroup_dirty_bytes_write,
+ },
+ {
+ .name = "dirty_background_ratio",
+ .read_u64 = mem_cgroup_dirty_background_ratio_read,
+ .write_u64 = mem_cgroup_dirty_background_ratio_write,
+ },
+ {
+ .name = "dirty_background_bytes",
+ .read_u64 = mem_cgroup_dirty_background_bytes_read,
+ .write_u64 = mem_cgroup_dirty_background_bytes_write,
+ },
+ {
.name = "move_charge_at_immigrate",
.read_u64 = mem_cgroup_move_charge_read,
.write_u64 = mem_cgroup_move_charge_write,
@@ -3776,8 +4063,23 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
mem->last_scanned_child = 0;
spin_lock_init(&mem->reclaim_param_lock);

- if (parent)
+ if (parent) {
mem->swappiness = get_swappiness(parent);
+
+ mem->dirty_ratio = get_dirty_param(parent,
+ MEM_CGROUP_DIRTY_RATIO);
+ mem->dirty_bytes = get_dirty_param(parent,
+ MEM_CGROUP_DIRTY_BYTES);
+ mem->dirty_background_ratio = get_dirty_param(parent,
+ MEM_CGROUP_DIRTY_BACKGROUND_RATIO);
+ mem->dirty_background_bytes = get_dirty_param(parent,
+ MEM_CGROUP_DIRTY_BACKGROUND_BYTES);
+ } else {
+ mem->dirty_ratio = vm_dirty_ratio;
+ mem->dirty_bytes = vm_dirty_bytes;
+ mem->dirty_background_ratio = vm_dirty_ratio;
+ mem->dirty_background_bytes = vm_dirty_bytes;
+ }
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
--
1.6.3.3

2010-02-26 22:53:13

by Andrea Righi

Subject: [PATCH -mmotm 2/2] memcg: dirty pages instrumentation

Apply the cgroup dirty pages accounting and limiting infrastructure to
the relevant kernel functions.

Signed-off-by: Andrea Righi <[email protected]>
---
fs/fuse/file.c | 5 +++
fs/nfs/write.c | 4 ++
fs/nilfs2/segment.c | 10 +++++-
mm/filemap.c | 1 +
mm/page-writeback.c | 84 ++++++++++++++++++++++++++++++++------------------
mm/rmap.c | 4 +-
mm/truncate.c | 2 +
7 files changed, 76 insertions(+), 34 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e13..dbbdd53 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -11,6 +11,7 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/kernel.h>
+#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/module.h>

@@ -1129,6 +1130,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)

list_del(&req->writepages_entry);
dec_bdi_stat(bdi, BDI_WRITEBACK);
+ mem_cgroup_update_stat(req->pages[0],
+ MEM_CGROUP_STAT_WRITEBACK_TEMP, -1);
dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
bdi_writeout_inc(bdi);
wake_up(&fi->page_waitq);
@@ -1240,6 +1243,8 @@ static int fuse_writepage_locked(struct page *page)
req->inode = inode;

inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+ mem_cgroup_update_stat(tmp_page,
+ MEM_CGROUP_STAT_WRITEBACK_TEMP, 1);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
end_page_writeback(page);

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b753242..7316f7a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req)
req->wb_index,
NFS_PAGE_TAG_COMMIT);
spin_unlock(&inode->i_lock);
+ mem_cgroup_update_stat(req->wb_page, MEM_CGROUP_STAT_UNSTABLE_NFS, 1);
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)
struct page *page = req->wb_page;

if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
+ mem_cgroup_update_stat(page, MEM_CGROUP_STAT_UNSTABLE_NFS, -1);
dec_zone_page_state(page, NR_UNSTABLE_NFS);
dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE);
return 1;
@@ -1273,6 +1275,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_mark_request_commit(req);
+ mem_cgroup_update_stat(req->wb_page,
+ MEM_CGROUP_STAT_UNSTABLE_NFS, -1);
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
BDI_UNSTABLE);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ada2f1b..aef6d13 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1660,8 +1660,11 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
} while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
kunmap_atomic(kaddr, KM_USER0);

- if (!TestSetPageWriteback(clone_page))
+ if (!TestSetPageWriteback(clone_page)) {
+ mem_cgroup_update_stat(clone_page,
+ MEM_CGROUP_STAT_WRITEBACK, 1);
inc_zone_page_state(clone_page, NR_WRITEBACK);
+ }
unlock_page(clone_page);

return 0;
@@ -1783,8 +1786,11 @@ static void __nilfs_end_page_io(struct page *page, int err)
}

if (buffer_nilfs_allocated(page_buffers(page))) {
- if (TestClearPageWriteback(page))
+ if (TestClearPageWriteback(page)) {
+ mem_cgroup_update_stat(page,
+ MEM_CGROUP_STAT_WRITEBACK, -1);
dec_zone_page_state(page, NR_WRITEBACK);
+ }
} else
end_page_writeback(page);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index fe09e51..f85acae 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
* having removed the page entirely.
*/
if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, -1);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5a0f8f3..d83f41c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -137,13 +137,14 @@ static struct prop_descriptor vm_dirties;
*/
static int calc_period_shift(void)
{
- unsigned long dirty_total;
+ unsigned long dirty_total, dirty_bytes;

- if (vm_dirty_bytes)
- dirty_total = vm_dirty_bytes / PAGE_SIZE;
+ dirty_bytes = mem_cgroup_dirty_bytes();
+ if (dirty_bytes)
+ dirty_total = dirty_bytes / PAGE_SIZE;
else
- dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
- 100;
+ dirty_total = (mem_cgroup_dirty_ratio() *
+ determine_dirtyable_memory()) / 100;
return 2 + ilog2(dirty_total - 1);
}

@@ -408,14 +409,16 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
*/
unsigned long determine_dirtyable_memory(void)
{
- unsigned long x;
-
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ unsigned long memory;
+ s64 memcg_memory;

+ memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
if (!vm_highmem_is_dirtyable)
- x -= highmem_dirtyable_memory(x);
-
- return x + 1; /* Ensure that we never return 0 */
+ memory -= highmem_dirtyable_memory(memory);
+ memcg_memory = mem_cgroup_page_stat(MEMCG_NR_DIRTYABLE_PAGES);
+ if (memcg_memory < 0)
+ return memory + 1;
+ return min((unsigned long)memcg_memory, memory + 1);
}

void
@@ -423,26 +426,28 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
{
unsigned long background;
- unsigned long dirty;
+ unsigned long dirty, dirty_bytes, dirty_background;
unsigned long available_memory = determine_dirtyable_memory();
struct task_struct *tsk;

- if (vm_dirty_bytes)
- dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ dirty_bytes = mem_cgroup_dirty_bytes();
+ if (dirty_bytes)
+ dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
else {
int dirty_ratio;

- dirty_ratio = vm_dirty_ratio;
+ dirty_ratio = mem_cgroup_dirty_ratio();
if (dirty_ratio < 5)
dirty_ratio = 5;
dirty = (dirty_ratio * available_memory) / 100;
}

- if (dirty_background_bytes)
- background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ dirty_background = mem_cgroup_dirty_background_bytes();
+ if (dirty_background)
+ background = DIV_ROUND_UP(dirty_background, PAGE_SIZE);
else
- background = (dirty_background_ratio * available_memory) / 100;
-
+ background = (mem_cgroup_dirty_background_ratio() *
+ available_memory) / 100;
if (background >= dirty)
background = dirty / 2;
tsk = current;
@@ -508,9 +513,13 @@ static void balance_dirty_pages(struct address_space *mapping,
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);

- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
+ nr_writeback = mem_cgroup_page_stat(MEMCG_NR_WRITEBACK);
+ if ((nr_reclaimable < 0) || (nr_writeback < 0)) {
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- nr_writeback = global_page_state(NR_WRITEBACK);
+ nr_writeback = global_page_state(NR_WRITEBACK);
+ }

bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
if (bdi_cap_account_unstable(bdi)) {
@@ -611,10 +620,12 @@ static void balance_dirty_pages(struct address_space *mapping,
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
+ nr_reclaimable = mem_cgroup_page_stat(MEMCG_NR_RECLAIM_PAGES);
+ if (nr_reclaimable < 0)
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
if ((laptop_mode && pages_written) ||
- (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS))
- > background_thresh)))
+ (!laptop_mode && (nr_reclaimable > background_thresh)))
bdi_start_writeback(bdi, NULL, 0);
}

@@ -678,6 +689,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
unsigned long dirty_thresh;

for ( ; ; ) {
+ unsigned long dirty;
+
get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);

/*
@@ -686,10 +699,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
*/
dirty_thresh += dirty_thresh / 10; /* wheeee... */

- if (global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK) <= dirty_thresh)
- break;
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+ dirty = mem_cgroup_page_stat(MEMCG_NR_DIRTY_WRITEBACK_PAGES);
+ if (dirty < 0)
+ dirty = global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_WRITEBACK);
+ if (dirty <= dirty_thresh)
+ break;
+ congestion_wait(BLK_RW_ASYNC, HZ/10);

/*
* The caller might hold locks which can prevent IO completion
@@ -1096,6 +1113,7 @@ int __set_page_dirty_no_writeback(struct page *page)
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
if (mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, 1);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
task_dirty_inc(current);
@@ -1297,6 +1315,8 @@ int clear_page_dirty_for_io(struct page *page)
* for more comments.
*/
if (TestClearPageDirty(page)) {
+ mem_cgroup_update_stat(page,
+ MEM_CGROUP_STAT_FILE_DIRTY, -1);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info,
BDI_DIRTY);
@@ -1332,8 +1352,10 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
- if (ret)
+ if (ret) {
+ mem_cgroup_update_stat(page, MEM_CGROUP_STAT_WRITEBACK, -1);
dec_zone_page_state(page, NR_WRITEBACK);
+ }
return ret;
}

@@ -1363,8 +1385,10 @@ int test_set_page_writeback(struct page *page)
} else {
ret = TestSetPageWriteback(page);
}
- if (!ret)
+ if (!ret) {
+ mem_cgroup_update_stat(page, MEM_CGROUP_STAT_WRITEBACK, 1);
inc_zone_page_state(page, NR_WRITEBACK);
+ }
return ret;

}
diff --git a/mm/rmap.c b/mm/rmap.c
index 4d2fb93..8d74335 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -832,7 +832,7 @@ void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, 1);
+ mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, 1);
}
}

@@ -864,7 +864,7 @@ void page_remove_rmap(struct page *page)
__dec_zone_page_state(page, NR_ANON_PAGES);
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, -1);
+ mem_cgroup_update_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -1);
}
/*
* It would be tidy to reset the PageAnon mapping here,
diff --git a/mm/truncate.c b/mm/truncate.c
index 2466e0c..5f437e7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -73,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
if (TestClearPageDirty(page)) {
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_update_stat(page,
+ MEM_CGROUP_STAT_FILE_DIRTY, -1);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info,
BDI_DIRTY);
--
1.6.3.3

2010-03-01 01:12:46

by Kamezawa Hiroyuki

Subject: Re: [PATCH -mmotm 1/2] memcg: dirty pages accounting and limiting infrastructure

On Fri, 26 Feb 2010 23:52:30 +0100
Andrea Righi <[email protected]> wrote:

> Infrastructure to account dirty pages per cgroup and add dirty limit
> interfaces in the cgroupfs:
>
> - Active write-out: memory.dirty_ratio, memory.dirty_bytes
> - Background write-out: memory.dirty_background_ratio, memory.dirty_background_bytes
>
> Signed-off-by: Andrea Righi <[email protected]>

I think this is nice, but I have some nitpicks.


> ---
> include/linux/memcontrol.h | 74 +++++++++-
> mm/memcontrol.c | 354 ++++++++++++++++++++++++++++++++++++++++----
> 2 files changed, 399 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 1f9b119..e6af95c 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -25,6 +25,41 @@ struct page_cgroup;
> struct page;
> struct mm_struct;
>
> +/* Cgroup memory statistics items exported to the kernel */
> +enum mem_cgroup_page_stat_item {
> + MEMCG_NR_DIRTYABLE_PAGES,
> + MEMCG_NR_RECLAIM_PAGES,
> + MEMCG_NR_WRITEBACK,
> + MEMCG_NR_DIRTY_WRITEBACK_PAGES,
> +};
> +
> +/*
> + * Statistics for memory cgroup.
> + */
> +enum mem_cgroup_stat_index {
> + /*
> + * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> + */
> + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
> + MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
> + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
> + MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
> + MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
> + MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
> + MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> + MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
> + used by soft limit implementation */
> + MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
> + used by threshold implementation */
> + MEM_CGROUP_STAT_FILE_DIRTY, /* # of dirty pages in page cache */
> + MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
> + MEM_CGROUP_STAT_WRITEBACK_TEMP, /* # of pages under writeback using
> + temporary buffers */
> + MEM_CGROUP_STAT_UNSTABLE_NFS, /* # of NFS unstable pages */
> +
> + MEM_CGROUP_STAT_NSTATS,
> +};
> +
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> /*
> * All "charge" functions with gfp_mask should use GFP_KERNEL or
> @@ -117,6 +152,13 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
> extern int do_swap_account;
> #endif
>
> +extern long mem_cgroup_dirty_ratio(void);
> +extern unsigned long mem_cgroup_dirty_bytes(void);
> +extern long mem_cgroup_dirty_background_ratio(void);
> +extern unsigned long mem_cgroup_dirty_background_bytes(void);
> +
> +extern s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item);
> +
> static inline bool mem_cgroup_disabled(void)
> {
> if (mem_cgroup_subsys.disabled)
> @@ -125,7 +167,8 @@ static inline bool mem_cgroup_disabled(void)
> }
>
> extern bool mem_cgroup_oom_called(struct task_struct *task);
> -void mem_cgroup_update_file_mapped(struct page *page, int val);
> +void mem_cgroup_update_stat(struct page *page,
> + enum mem_cgroup_stat_index idx, int val);
> unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> gfp_t gfp_mask, int nid,
> int zid);
> @@ -300,8 +343,8 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
> {
> }
>
> -static inline void mem_cgroup_update_file_mapped(struct page *page,
> - int val)
> +static inline void mem_cgroup_update_stat(struct page *page,
> + enum mem_cgroup_stat_index idx, int val)
> {
> }
>
> @@ -312,6 +355,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> return 0;
> }
>
> +static inline long mem_cgroup_dirty_ratio(void)
> +{
> + return vm_dirty_ratio;
> +}
> +
> +static inline unsigned long mem_cgroup_dirty_bytes(void)
> +{
> + return vm_dirty_bytes;
> +}
> +
> +static inline long mem_cgroup_dirty_background_ratio(void)
> +{
> + return dirty_background_ratio;
> +}
> +
> +static inline unsigned long mem_cgroup_dirty_background_bytes(void)
> +{
> + return dirty_background_bytes;
> +}
> +
> +static inline s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
> +{
> + return -ENOMEM;
> +}
> +
> #endif /* CONFIG_CGROUP_MEM_CONT */
>
> #endif /* _LINUX_MEMCONTROL_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a443c30..56f3204 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -66,31 +66,16 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
> #define SOFTLIMIT_EVENTS_THRESH (1000)
> #define THRESHOLDS_EVENTS_THRESH (100)
>
> -/*
> - * Statistics for memory cgroup.
> - */
> -enum mem_cgroup_stat_index {
> - /*
> - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
> - */
> - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
> - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
> - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
> - MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
> - MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
> - MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> - MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
> - used by soft limit implementation */
> - MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
> - used by threshold implementation */
> -
> - MEM_CGROUP_STAT_NSTATS,
> -};
> -
> struct mem_cgroup_stat_cpu {
> s64 count[MEM_CGROUP_STAT_NSTATS];
> };
>
> +/* Per cgroup page statistics */
> +struct mem_cgroup_page_stat {
> + enum mem_cgroup_page_stat_item item;
> + s64 value;
> +};
> +
> /*
> * per-zone information in memory controller.
> */
> @@ -205,6 +190,14 @@ struct mem_cgroup {
>
> unsigned int swappiness;
>
> + /* control memory cgroup dirty pages */
> + long dirty_ratio;
> + unsigned long dirty_bytes;
> +
> + /* control background writeback (via writeback threads) */
> + long dirty_background_ratio;
> + unsigned long dirty_background_bytes;
> +
> /* set when res.limit == memsw.limit */
> bool memsw_is_minimum;
>
> @@ -1021,6 +1014,169 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
> return swappiness;
> }
>
> +enum mem_cgroup_dirty_param {
> + MEM_CGROUP_DIRTY_RATIO,
> + MEM_CGROUP_DIRTY_BYTES,
> + MEM_CGROUP_DIRTY_BACKGROUND_RATIO,
> + MEM_CGROUP_DIRTY_BACKGROUND_BYTES,
> +};
> +
> +static unsigned long get_dirty_param(struct mem_cgroup *memcg,
> + enum mem_cgroup_dirty_param idx)
> +{
> + unsigned long ret;
> +
> + spin_lock(&memcg->reclaim_param_lock);

Do we need the lock here?


> + switch (idx) {
> + case MEM_CGROUP_DIRTY_RATIO:
> + ret = memcg->dirty_ratio;
> + break;
> + case MEM_CGROUP_DIRTY_BYTES:
> + ret = memcg->dirty_bytes;
> + break;
> + case MEM_CGROUP_DIRTY_BACKGROUND_RATIO:
> + ret = memcg->dirty_background_ratio;
> + break;
> + case MEM_CGROUP_DIRTY_BACKGROUND_BYTES:
> + ret = memcg->dirty_background_bytes;
> + break;
> + default:
> + VM_BUG_ON(1);
> + }
> + spin_unlock(&memcg->reclaim_param_lock);
> +
> + return ret;
> +}
> +
> +long mem_cgroup_dirty_ratio(void)
> +{
> + struct mem_cgroup *memcg;
> + long ret = vm_dirty_ratio;
> +
> + if (mem_cgroup_disabled())
> + goto out;
> + rcu_read_lock();
> + memcg = mem_cgroup_from_task(current);

Please add an explanatory comment here, something like:

/*
* It's possible that "current" may be moved to another cgroup while
* we access it. But a precise check is meaningless because the task
* can be moved right after our access, and writeback tends to take a long time.
* At least, "memcg" will not be freed under rcu_read_lock().
*/


> + if (likely(memcg))
> + ret = get_dirty_param(memcg, MEM_CGROUP_DIRTY_RATIO);
> + rcu_read_unlock();
> +out:
> + return ret;
> +}
> +
> +unsigned long mem_cgroup_dirty_bytes(void)
> +{
> + struct mem_cgroup *memcg;
> + unsigned long ret = vm_dirty_bytes;
> +
> + if (mem_cgroup_disabled())
> + goto out;
> + rcu_read_lock();
> + memcg = mem_cgroup_from_task(current);
> + if (likely(memcg))
> + ret = get_dirty_param(memcg, MEM_CGROUP_DIRTY_BYTES);
> + rcu_read_unlock();
> +out:
> + return ret;
> +}
> +
> +long mem_cgroup_dirty_background_ratio(void)
> +{
> + struct mem_cgroup *memcg;
> + long ret = dirty_background_ratio;
> +
> + if (mem_cgroup_disabled())
> + goto out;
> + rcu_read_lock();
> + memcg = mem_cgroup_from_task(current);
> + if (likely(memcg))
> + ret = get_dirty_param(memcg, MEM_CGROUP_DIRTY_BACKGROUND_RATIO);
> + rcu_read_unlock();
> +out:
> + return ret;
> +}
> +
> +unsigned long mem_cgroup_dirty_background_bytes(void)
> +{
> + struct mem_cgroup *memcg;
> + unsigned long ret = dirty_background_bytes;
> +
> + if (mem_cgroup_disabled())
> + goto out;
> + rcu_read_lock();
> + memcg = mem_cgroup_from_task(current);
> + if (likely(memcg))
> + ret = get_dirty_param(memcg, MEM_CGROUP_DIRTY_BACKGROUND_BYTES);
> + rcu_read_unlock();
> +out:
> + return ret;
> +}
> +
Hmm, how about

memcg->dirty_params[XXXX]
accessed as memcg->dirty_params[MEM_CGROUP_DIRTY_BACKGROUND_BYTES]?

Then we don't need to have four functions with the same implementation.
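
A minimal sketch of that idea (illustration only; MEM_CGROUP_DIRTY_NPARAMS and
global_dirty_param() are hypothetical names, not code from this series):

/* one array in struct mem_cgroup instead of four scalar fields:
 *	unsigned long dirty_params[MEM_CGROUP_DIRTY_NPARAMS];
 */
static unsigned long get_dirty_param(struct mem_cgroup *memcg,
				     enum mem_cgroup_dirty_param idx)
{
	VM_BUG_ON(idx >= MEM_CGROUP_DIRTY_NPARAMS);
	return memcg->dirty_params[idx];
}

/* ...and one exported accessor replacing the four ratio/bytes helpers: */
unsigned long mem_cgroup_dirty_param(enum mem_cgroup_dirty_param idx)
{
	struct mem_cgroup *memcg;
	unsigned long ret = global_dirty_param(idx);	/* vm_dirty_ratio etc. */

	if (mem_cgroup_disabled())
		return ret;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(current);
	if (likely(memcg))
		ret = get_dirty_param(memcg, idx);
	rcu_read_unlock();

	return ret;
}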


> +static s64 mem_cgroup_get_local_page_stat(struct mem_cgroup *memcg,
> + enum mem_cgroup_page_stat_item item)
> +{
> + s64 ret;
> +
> + switch (item) {
> + case MEMCG_NR_DIRTYABLE_PAGES:
> + ret = res_counter_read_u64(&memcg->res, RES_LIMIT) -
> + res_counter_read_u64(&memcg->res, RES_USAGE);
> + /* Translate free memory in pages */
> + ret >>= PAGE_SHIFT;
> + ret += mem_cgroup_read_stat(memcg, LRU_ACTIVE_ANON) +
> + mem_cgroup_read_stat(memcg, LRU_ACTIVE_FILE) +
> + mem_cgroup_read_stat(memcg, LRU_INACTIVE_ANON) +
> + mem_cgroup_read_stat(memcg, LRU_INACTIVE_FILE);
> + break;
Hmm, is this correct in the swapless case?


> + case MEMCG_NR_RECLAIM_PAGES:
> + ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_DIRTY) +
> + mem_cgroup_read_stat(memcg,
> + MEM_CGROUP_STAT_UNSTABLE_NFS);
> + break;
> + case MEMCG_NR_WRITEBACK:
> + ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
> + break;
> + case MEMCG_NR_DIRTY_WRITEBACK_PAGES:
> + ret = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) +
> + mem_cgroup_read_stat(memcg,
> + MEM_CGROUP_STAT_UNSTABLE_NFS);
> + break;
> + default:
> + ret = 0;
> + WARN_ON_ONCE(1);
> + }
> + return ret;
> +}
> +
> +static int mem_cgroup_page_stat_cb(struct mem_cgroup *mem, void *data)
> +{
> + struct mem_cgroup_page_stat *stat = (struct mem_cgroup_page_stat *)data;
> +
> + stat->value += mem_cgroup_get_local_page_stat(mem, stat->item);
> + return 0;
> +}
> +
> +s64 mem_cgroup_page_stat(enum mem_cgroup_page_stat_item item)
> +{
> + struct mem_cgroup_page_stat stat = {};
> + struct mem_cgroup *memcg;
> +
> + rcu_read_lock();
> + memcg = mem_cgroup_from_task(current);
> + if (memcg) {
> + /*
> + * Recursively evaluate page statistics against all cgroups
> + * under the hierarchy tree
> + */
> + stat.item = item;
> + mem_cgroup_walk_tree(memcg, &stat, mem_cgroup_page_stat_cb);
> + } else
> + stat.value = -ENOMEM;
> + rcu_read_unlock();
> +
> + return stat.value;
> +}
> +
> static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
> {
> int *val = data;
> @@ -1263,10 +1419,10 @@ static void record_last_oom(struct mem_cgroup *mem)
> }
>
> /*
> - * Currently used to update mapped file statistics, but the routine can be
> - * generalized to update other statistics as well.
> + * Generalized routine to update memory cgroup statistics.
> */
> -void mem_cgroup_update_file_mapped(struct page *page, int val)
> +void mem_cgroup_update_stat(struct page *page,
> + enum mem_cgroup_stat_index idx, int val)
> {
> struct mem_cgroup *mem;
> struct page_cgroup *pc;
> @@ -1286,7 +1442,8 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
> /*
> * Preemption is already disabled. We can use __this_cpu_xxx
> */
> - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
> + VM_BUG_ON(idx >= MEM_CGROUP_STAT_NSTATS);
> + __this_cpu_add(mem->stat->count[idx], val);
>
> done:
> unlock_page_cgroup(pc);
> @@ -3033,6 +3190,10 @@ enum {
> MCS_PGPGIN,
> MCS_PGPGOUT,
> MCS_SWAP,
> + MCS_FILE_DIRTY,
> + MCS_WRITEBACK,
> + MCS_WRITEBACK_TEMP,
> + MCS_UNSTABLE_NFS,
> MCS_INACTIVE_ANON,
> MCS_ACTIVE_ANON,
> MCS_INACTIVE_FILE,
> @@ -3055,6 +3216,10 @@ struct {
> {"pgpgin", "total_pgpgin"},
> {"pgpgout", "total_pgpgout"},
> {"swap", "total_swap"},
> + {"filedirty", "dirty_pages"},
> + {"writeback", "writeback_pages"},
> + {"writeback_tmp", "writeback_temp_pages"},
> + {"nfs", "nfs_unstable"},
> {"inactive_anon", "total_inactive_anon"},
> {"active_anon", "total_active_anon"},
> {"inactive_file", "total_inactive_file"},
> @@ -3083,6 +3248,14 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
> s->stat[MCS_SWAP] += val * PAGE_SIZE;
> }
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_DIRTY);
> + s->stat[MCS_FILE_DIRTY] += val;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK);
> + s->stat[MCS_WRITEBACK] += val;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_WRITEBACK_TEMP);
> + s->stat[MCS_WRITEBACK_TEMP] += val;
> + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_UNSTABLE_NFS);
> + s->stat[MCS_UNSTABLE_NFS] += val;
>
> /* per zone stat */
> val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
> @@ -3467,6 +3640,100 @@ unlock:
> return ret;
> }
>
> +static u64 mem_cgroup_dirty_ratio_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> + return get_dirty_param(memcg, MEM_CGROUP_DIRTY_RATIO);
> +}
> +
> +static int
> +mem_cgroup_dirty_ratio_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> + if ((cgrp->parent == NULL) || (val > 100))
> + return -EINVAL;
> +
> + spin_lock(&memcg->reclaim_param_lock);
> + memcg->dirty_ratio = val;
> + memcg->dirty_bytes = 0;
> + spin_unlock(&memcg->reclaim_param_lock);
> +
> + return 0;
> +}
> +
> +static u64 mem_cgroup_dirty_bytes_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> + return get_dirty_param(memcg, MEM_CGROUP_DIRTY_BYTES);
> +}
> +
> +static int
> +mem_cgroup_dirty_bytes_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> + if (cgrp->parent == NULL)
> + return -EINVAL;
> +
> + spin_lock(&memcg->reclaim_param_lock);
> + memcg->dirty_ratio = 0;
> + memcg->dirty_bytes = val;
> + spin_unlock(&memcg->reclaim_param_lock);
> +
> + return 0;
> +}
> +
> +static u64
> +mem_cgroup_dirty_background_ratio_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> + return get_dirty_param(memcg, MEM_CGROUP_DIRTY_BACKGROUND_RATIO);
> +}
> +
> +static int mem_cgroup_dirty_background_ratio_write(struct cgroup *cgrp,
> + struct cftype *cft, u64 val)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> + if ((cgrp->parent == NULL) || (val > 100))
> + return -EINVAL;
> +
> + spin_lock(&memcg->reclaim_param_lock);
> + memcg->dirty_background_ratio = val;
> + memcg->dirty_background_bytes = 0;
> + spin_unlock(&memcg->reclaim_param_lock);
> +
> + return 0;
> +}
> +
> +static u64
> +mem_cgroup_dirty_background_bytes_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> + return get_dirty_param(memcg, MEM_CGROUP_DIRTY_BACKGROUND_BYTES);
> +}
> +
> +static int mem_cgroup_dirty_background_bytes_write(struct cgroup *cgrp,
> + struct cftype *cft, u64 val)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> +
> + if (cgrp->parent == NULL)
> + return -EINVAL;
> +
> + spin_lock(&memcg->reclaim_param_lock);
> + memcg->dirty_background_ratio = 0;
> + memcg->dirty_background_bytes = val;
> + spin_unlock(&memcg->reclaim_param_lock);
> +
> + return 0;
> +}
> +
> static struct cftype mem_cgroup_files[] = {
> {
> .name = "usage_in_bytes",
> @@ -3518,6 +3785,26 @@ static struct cftype mem_cgroup_files[] = {
> .write_u64 = mem_cgroup_swappiness_write,
> },
> {
> + .name = "dirty_ratio",
> + .read_u64 = mem_cgroup_dirty_ratio_read,
> + .write_u64 = mem_cgroup_dirty_ratio_write,
> + },
> + {
> + .name = "dirty_bytes",
> + .read_u64 = mem_cgroup_dirty_bytes_read,
> + .write_u64 = mem_cgroup_dirty_bytes_write,
> + },
> + {
> + .name = "dirty_background_ratio",
> + .read_u64 = mem_cgroup_dirty_background_ratio_read,
> + .write_u64 = mem_cgroup_dirty_background_ratio_write,
> + },
> + {
> + .name = "dirty_background_bytes",
> + .read_u64 = mem_cgroup_dirty_background_bytes_read,
> + .write_u64 = mem_cgroup_dirty_background_bytes_write,
> + },
> + {
> .name = "move_charge_at_immigrate",
> .read_u64 = mem_cgroup_move_charge_read,
> .write_u64 = mem_cgroup_move_charge_write,
> @@ -3776,8 +4063,23 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> mem->last_scanned_child = 0;
> spin_lock_init(&mem->reclaim_param_lock);
>
> - if (parent)
> + if (parent) {
> mem->swappiness = get_swappiness(parent);
> +
> + mem->dirty_ratio = get_dirty_param(parent,
> + MEM_CGROUP_DIRTY_RATIO);
> + mem->dirty_bytes = get_dirty_param(parent,
> + MEM_CGROUP_DIRTY_BYTES);
> + mem->dirty_background_ratio = get_dirty_param(parent,
> + MEM_CGROUP_DIRTY_BACKGROUND_RATIO);
> + mem->dirty_background_bytes = get_dirty_param(parent,
> + MEM_CGROUP_DIRTY_BACKGROUND_BYTES);
> + } else {
> + mem->dirty_ratio = vm_dirty_ratio;
> + mem->dirty_bytes = vm_dirty_bytes;
> + mem->dirty_background_ratio = vm_dirty_ratio;
> + mem->dirty_background_bytes = vm_dirty_bytes;

Shouldn't these be dirty_background_ratio / dirty_background_bytes?

Thanks,
-Kame

> + }
> atomic_set(&mem->refcnt, 1);
> mem->move_charge_at_immigrate = 0;
> mutex_init(&mem->thresholds_lock);
> --
> 1.6.3.3
>
>