Hi, this version removes virt-array and uses a simple id <-> memcg table.
and removed RFC.
This set has 2+1 purposes.
1. re-design struct page_cgroup and make room for a blockio-cgroup ID.
2. implement quick updating method for memcg's file stat.
3. optionally? use spin_lock instead of bit_spinlock.
Plans after this.
1. check influence of Mel's new writeback method.
I think we'll see OOM easier. IIUC, memory cgroup needs a thread like kswapd
to do background writeback or low-high watermark.
(By this, we can control priority of background writeout thread priority
by CFS. This is very good.)
2. implementing dirty_ratio.
Now, Greg Thelen is working on it. One of the biggest problems of the previous trial
was the update cost of status. I think this patch set can reduce it.
3. record blockio cgroup's ID.
Ikeda posted one. IIUC, it requires some consideration on (swapin)readahead
for assigning IDs. But it seemed to be good in general.
Importance is in this order in my mind. But all of the above can be done in parallel.
Beyond that, some people have problems with file-cache-control. If it needs to use
account migration, we have to take care of races.
Thanks,
-Kame
From: KAMEZAWA Hiroyuki <[email protected]>
Now, memory cgroup has an ID per cgroup and makes use of it at
- hierarchy walk,
- swap recording.
This patch is for making more use of it. The final purpose is
to replace page_cgroup->mem_cgroup's pointer to an unsigned short.
This patch caches a pointer of memcg in an array. By this, we
don't have to call css_lookup() which requires radix-hash walk.
This saves some amount of overhead when looking up a memcg via its id.
It's called in a very fast path and needs to be as quick as possible.
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
init/Kconfig | 11 +++++++++++
mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++--------------
2 files changed, 43 insertions(+), 14 deletions(-)
Index: mmotm-0727/mm/memcontrol.c
===================================================================
--- mmotm-0727.orig/mm/memcontrol.c
+++ mmotm-0727/mm/memcontrol.c
@@ -292,6 +292,27 @@ static bool move_file(void)
&mc.to->move_charge_at_immigrate);
}
+atomic_t mem_cgroup_num;
+struct mem_cgroup *mem_cgroups[CONFIG_MEM_CGROUP_MAX_GROUPS] __read_mostly;
+
+static struct mem_cgroup* id_to_memcg(unsigned short id)
+{
+ /*
+ * This array is set to NULL when mem_cgroup is freed.
+ * IOW, there are no more references && rcu_synchronized().
+ * This lookup-caching is safe.
+ */
+ if (unlikely(!mem_cgroups[id])) {
+ struct cgroup_subsys_state *css;
+ rcu_read_lock();
+ css = css_lookup(&mem_cgroup_subsys, id);
+ if (!css)
+ return NULL;
+ rcu_read_unlock();
+ mem_cgroups[id] = container_of(css, struct mem_cgroup, css);
+ }
+ return mem_cgroups[id];
+}
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
@@ -1824,18 +1845,7 @@ static void mem_cgroup_cancel_charge(str
* it's concern. (dropping refcnt from swap can be called against removed
* memcg.)
*/
-static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
-{
- struct cgroup_subsys_state *css;
- /* ID 0 is unused ID */
- if (!id)
- return NULL;
- css = css_lookup(&mem_cgroup_subsys, id);
- if (!css)
- return NULL;
- return container_of(css, struct mem_cgroup, css);
-}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
@@ -1856,7 +1866,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
ent.val = page_private(page);
id = lookup_swap_cgroup(ent);
rcu_read_lock();
- mem = mem_cgroup_lookup(id);
+ mem = id_to_memcg(id);
if (mem && !css_tryget(&mem->css))
mem = NULL;
rcu_read_unlock();
@@ -2208,7 +2218,7 @@ __mem_cgroup_commit_charge_swapin(struct
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = id_to_memcg(id);
if (memcg) {
/*
* This recorded memcg can be obsolete one. So, avoid
@@ -2472,7 +2482,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
id = swap_cgroup_record(ent, 0);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = id_to_memcg(id);
if (memcg) {
/*
* We uncharge this because swap is freed.
@@ -3988,6 +3998,10 @@ static struct mem_cgroup *mem_cgroup_all
struct mem_cgroup *mem;
int size = sizeof(struct mem_cgroup);
+ /* 0 is unused */
+ if (atomic_read(&mem_cgroup_num) == CONFIG_MEM_CGROUP_MAX_GROUPS-1)
+ return NULL;
+
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
mem = kmalloc(size, GFP_KERNEL);
@@ -4025,7 +4039,10 @@ static void __mem_cgroup_free(struct mem
int node;
mem_cgroup_remove_from_trees(mem);
+ /* No more lookup against this ID */
+ mem_cgroups[css_id(&mem->css)] = NULL;
free_css_id(&mem_cgroup_subsys, &mem->css);
+ atomic_dec(&mem_cgroup_num);
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
@@ -4162,6 +4179,7 @@ mem_cgroup_create(struct cgroup_subsys *
atomic_set(&mem->refcnt, 1);
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
+ atomic_inc(&mem_cgroup_num);
return &mem->css;
free_out:
__mem_cgroup_free(mem);
Index: mmotm-0727/init/Kconfig
===================================================================
--- mmotm-0727.orig/init/Kconfig
+++ mmotm-0727/init/Kconfig
@@ -594,6 +594,17 @@ config CGROUP_MEM_RES_CTLR_SWAP
Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
size is 4096bytes, 512k per 1Gbytes of swap.
+config MEM_CGROUP_MAX_GROUPS
+ int "Maximum number of memory cgroups on a system"
+ range 1 65535
+ default 8192 if 64BIT
+ default 2048 if 32BIT
+ help
+ Memory cgroup has limitation of the number of groups created.
+ Please select your favorite value. The more you allow, the more
+ memory will be consumed. This consumes vmalloc() area, so,
+ this should be small on 32bit arch.
+
menuconfig CGROUP_SCHED
bool "Group CPU scheduler"
depends on EXPERIMENTAL && CGROUPS
From: KAMEZAWA Hiroyuki <[email protected]>
Now, addresses of memory cgroups can be calculated from their IDs without complex lookups.
This patch replaces pc->mem_cgroup, changing it from a pointer to an unsigned short.
On 64bit architectures, this gives us 6 more bytes of room per page_cgroup.
Use 2bytes for blkio-cgroup's page tracking. More 4bytes will be used for
some light-weight concurrent access.
We may be able to move this id onto the flags field but ...go step by step.
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/page_cgroup.h | 3 ++-
mm/memcontrol.c | 40 +++++++++++++++++++++++++---------------
mm/page_cgroup.c | 2 +-
3 files changed, 28 insertions(+), 17 deletions(-)
Index: mmotm-0727/include/linux/page_cgroup.h
===================================================================
--- mmotm-0727.orig/include/linux/page_cgroup.h
+++ mmotm-0727/include/linux/page_cgroup.h
@@ -12,7 +12,8 @@
*/
struct page_cgroup {
unsigned long flags;
- struct mem_cgroup *mem_cgroup;
+ unsigned short mem_cgroup; /* ID of assigned memory cgroup */
+ unsigned short blk_cgroup; /* Not Used..but will be. */
struct page *page;
struct list_head lru; /* per cgroup LRU list */
};
Index: mmotm-0727/mm/page_cgroup.c
===================================================================
--- mmotm-0727.orig/mm/page_cgroup.c
+++ mmotm-0727/mm/page_cgroup.c
@@ -15,7 +15,7 @@ static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
{
pc->flags = 0;
- pc->mem_cgroup = NULL;
+ pc->mem_cgroup = 0;
pc->page = pfn_to_page(pfn);
INIT_LIST_HEAD(&pc->lru);
}
Index: mmotm-0727/mm/memcontrol.c
===================================================================
--- mmotm-0727.orig/mm/memcontrol.c
+++ mmotm-0727/mm/memcontrol.c
@@ -376,7 +376,7 @@ struct cgroup_subsys_state *mem_cgroup_c
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
- struct mem_cgroup *mem = pc->mem_cgroup;
+ struct mem_cgroup *mem = id_to_memcg(pc->mem_cgroup);
int nid = page_cgroup_nid(pc);
int zid = page_cgroup_zid(pc);
@@ -581,7 +581,11 @@ static void mem_cgroup_charge_statistics
bool charge)
{
int val = (charge) ? 1 : -1;
-
+ if (pc->mem_cgroup == 0) {
+ show_stack(NULL, NULL);
+ printk("charge to 0\n");
+ while(1);
+ }
preempt_disable();
if (PageCgroupCache(pc))
@@ -718,6 +722,11 @@ static inline bool mem_cgroup_is_root(st
return (mem == root_mem_cgroup);
}
+static inline bool mem_cgroup_is_rootid(unsigned short id)
+{
+ return (id == 1);
+}
+
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
@@ -750,7 +759,7 @@ void mem_cgroup_del_lru_list(struct page
*/
mz = page_cgroup_zoneinfo(pc);
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
- if (mem_cgroup_is_root(pc->mem_cgroup))
+ if (mem_cgroup_is_rootid(pc->mem_cgroup))
return;
VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
@@ -777,7 +786,7 @@ void mem_cgroup_rotate_lru_list(struct p
*/
smp_rmb();
/* unused or root page is not rotated. */
- if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+ if (!PageCgroupUsed(pc) || mem_cgroup_is_rootid(pc->mem_cgroup))
return;
mz = page_cgroup_zoneinfo(pc);
list_move(&pc->lru, &mz->lists[lru]);
@@ -803,7 +812,7 @@ void mem_cgroup_add_lru_list(struct page
mz = page_cgroup_zoneinfo(pc);
MEM_CGROUP_ZSTAT(mz, lru) += 1;
SetPageCgroupAcctLRU(pc);
- if (mem_cgroup_is_root(pc->mem_cgroup))
+ if (mem_cgroup_is_rootid(pc->mem_cgroup))
return;
list_add(&pc->lru, &mz->lists[lru]);
}
@@ -1471,7 +1480,7 @@ void mem_cgroup_update_file_mapped(struc
return;
lock_page_cgroup(pc);
- mem = pc->mem_cgroup;
+ mem = id_to_memcg(pc->mem_cgroup);
if (!mem || !PageCgroupUsed(pc))
goto done;
@@ -1859,7 +1868,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
- mem = pc->mem_cgroup;
+ mem = id_to_memcg(pc->mem_cgroup);
if (mem && !css_tryget(&mem->css))
mem = NULL;
} else if (PageSwapCache(page)) {
@@ -1895,7 +1904,7 @@ static void __mem_cgroup_commit_charge(s
return;
}
- pc->mem_cgroup = mem;
+ pc->mem_cgroup = css_id(&mem->css);
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
* Especially when a page_cgroup is taken from a page, pc->mem_cgroup
@@ -1953,7 +1962,7 @@ static void __mem_cgroup_move_account(st
VM_BUG_ON(PageLRU(pc->page));
VM_BUG_ON(!PageCgroupLocked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
- VM_BUG_ON(pc->mem_cgroup != from);
+ VM_BUG_ON(id_to_memcg(pc->mem_cgroup) != from);
if (PageCgroupFileMapped(pc)) {
/* Update mapped_file data for mem_cgroup */
@@ -1968,7 +1977,7 @@ static void __mem_cgroup_move_account(st
mem_cgroup_cancel_charge(from);
/* caller should have done css_get */
- pc->mem_cgroup = to;
+ pc->mem_cgroup = css_id(&to->css);
mem_cgroup_charge_statistics(to, pc, true);
/*
* We charges against "to" which may not have any tasks. Then, "to"
@@ -1988,7 +1997,7 @@ static int mem_cgroup_move_account(struc
{
int ret = -EINVAL;
lock_page_cgroup(pc);
- if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+ if (PageCgroupUsed(pc) && id_to_memcg(pc->mem_cgroup) == from) {
__mem_cgroup_move_account(pc, from, to, uncharge);
ret = 0;
}
@@ -2327,9 +2336,9 @@ __mem_cgroup_uncharge_common(struct page
lock_page_cgroup(pc);
- mem = pc->mem_cgroup;
+ mem = id_to_memcg(pc->mem_cgroup);
- if (!PageCgroupUsed(pc))
+ if (!mem || !PageCgroupUsed(pc))
goto unlock_out;
switch (ctype) {
@@ -2572,7 +2581,7 @@ int mem_cgroup_prepare_migration(struct
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
- mem = pc->mem_cgroup;
+ mem = id_to_memcg(pc->mem_cgroup);
css_get(&mem->css);
/*
* At migrating an anonymous page, its mapcount goes down
@@ -4396,7 +4405,8 @@ static int is_target_pte_for_mc(struct v
* mem_cgroup_move_account() checks the pc is valid or not under
* the lock.
*/
- if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+ if (PageCgroupUsed(pc) &&
+ id_to_memcg(pc->mem_cgroup) == mc.from) {
ret = MC_TARGET_PAGE;
if (target)
target->page = page;
From: KAMEZAWA Hiroyuki <[email protected]>
At accounting file events per memory cgroup, we need to find memory cgroup
via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup().
But, considering the context which page-cgroup for files are accessed,
we can use an alternative light-weight mutual exclusion in most cases.
At handling file-caches, the only race we have to take care of is "moving"
account, IOW, overwriting page_cgroup->mem_cgroup. Because file status
update is done while the page-cache is in stable state, we don't have to
take care of race with charge/uncharge.
Unlike charge/uncharge, "move" does not happen very frequently. It happens only at
rmdir() and task-moving (with special settings).
This patch adds a race-checker for file-cache-status accounting v.s. account
moving. The new per-cpu-per-memcg counter MEM_CGROUP_ON_MOVE is added.
The routine for account move
1. Increment it before start moving
2. Call synchronize_rcu()
3. Decrement it after the end of moving.
By this, the file-status-counting routine can check whether it needs to call
lock_page_cgroup(). In most cases, it doesn't need to call it.
Changelog: 20100729
- replaced __this_cpu_xxx() with this_cpu_xxx
(because we don't call spinlock)
- added VM_BUG_ON().
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 63 insertions(+), 9 deletions(-)
Index: mmotm-0727/mm/memcontrol.c
===================================================================
--- mmotm-0727.orig/mm/memcontrol.c
+++ mmotm-0727/mm/memcontrol.c
@@ -88,6 +88,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
+ MEM_CGROUP_ON_MOVE, /* A check for locking move account/status */
MEM_CGROUP_STAT_NSTATS,
};
@@ -1075,7 +1076,49 @@ static unsigned int get_swappiness(struc
return swappiness;
}
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+ int cpu;
+ /* for fast checking in mem_cgroup_update_file_stat() etc..*/
+ spin_lock(&mc.lock);
+ for_each_possible_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+ spin_unlock(&mc.lock);
+
+ synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+ int cpu;
+
+ if (!mem)
+ return;
+ /* for fast checking in mem_cgroup_update_file_stat() etc..*/
+ spin_lock(&mc.lock);
+ for_each_possible_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+ spin_unlock(&mc.lock);
+}
+
+/*
+ * mem_cgroup_is_moved -- checking a cgroup is mc.from target or not.
+ * used for avoiding race.
+ * mem_cgroup_under_move -- checking a cgroup is mc.from or mc.to or
+ * under hierarchy of them. used for waiting at
+ * memory pressure.
+ * Result of is_moved can be trusted until the end of rcu_read_unlock().
+ * The caller must do
+ * rcu_read_lock();
+ * result = mem_cgroup_is_moved();
+ * .....make use of result here....
+ * rcu_read_unlock();
+ */
+static bool mem_cgroup_is_moved(struct mem_cgroup *mem)
+{
+ VM_BUG_ON(!rcu_read_lock_held());
+ return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
@@ -1474,29 +1517,36 @@ void mem_cgroup_update_file_mapped(struc
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
+ bool need_lock = false;
pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;
-
- lock_page_cgroup(pc);
+ rcu_read_lock();
mem = id_to_memcg(pc->mem_cgroup);
+ if (!mem)
+ goto done;
+ need_lock = mem_cgroup_is_moved(mem);
+ if (need_lock) {
+ /* need to serialize with move_account */
+ lock_page_cgroup(pc);
+ mem = id_to_memcg(pc->mem_cgroup);
+ }
if (!mem || !PageCgroupUsed(pc))
goto done;
- /*
- * Preemption is already disabled. We can use __this_cpu_xxx
- */
if (val > 0) {
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
SetPageCgroupFileMapped(pc);
} else {
- __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
ClearPageCgroupFileMapped(pc);
}
done:
- unlock_page_cgroup(pc);
+ if (need_lock)
+ unlock_page_cgroup(pc);
+ rcu_read_unlock();
}
/*
@@ -3035,6 +3085,7 @@ move_account:
lru_add_drain_all();
drain_all_stock_sync();
ret = 0;
+ mem_cgroup_start_move(mem);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
enum lru_list l;
@@ -3048,6 +3099,7 @@ move_account:
if (ret)
break;
}
+ mem_cgroup_end_move(mem);
memcg_oom_recover(mem);
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
@@ -4515,6 +4567,7 @@ static void mem_cgroup_clear_mc(void)
mc.to = NULL;
mc.moving_task = NULL;
spin_unlock(&mc.lock);
+ mem_cgroup_end_move(from);
memcg_oom_recover(from);
memcg_oom_recover(to);
wake_up_all(&mc.waitq);
@@ -4545,6 +4598,7 @@ static int mem_cgroup_can_attach(struct
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
From: KAMEZAWA Hiroyuki <[email protected]>
Preparing for adding new statuses around file caches (dirty, writeback, etc.).
Using a unified macro and more generic names.
All counters will have the same rule for updating.
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/memcontrol.h | 4 ++++
include/linux/page_cgroup.h | 21 +++++++++++++++------
mm/memcontrol.c | 35 +++++++++++++++++++++++------------
3 files changed, 42 insertions(+), 18 deletions(-)
Index: mmotm-0727/include/linux/memcontrol.h
===================================================================
--- mmotm-0727.orig/include/linux/memcontrol.h
+++ mmotm-0727/include/linux/memcontrol.h
@@ -121,6 +121,10 @@ static inline bool mem_cgroup_disabled(v
return false;
}
+enum {
+ __MEMCG_FILE_MAPPED,
+ NR_MEMCG_FILE_STAT
+};
void mem_cgroup_update_file_mapped(struct page *page, int val);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
Index: mmotm-0727/mm/memcontrol.c
===================================================================
--- mmotm-0727.orig/mm/memcontrol.c
+++ mmotm-0727/mm/memcontrol.c
@@ -83,16 +83,20 @@ enum mem_cgroup_stat_index {
*/
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
- MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
MEM_CGROUP_ON_MOVE, /* A check for locking move account/status */
-
+ MEM_CGROUP_FSTAT_BASE,
+ MEM_CGROUP_FSTAT_END
+ = MEM_CGROUP_FSTAT_BASE + NR_MEMCG_FILE_STAT,
MEM_CGROUP_STAT_NSTATS,
};
+#define MEM_CGROUP_STAT_FILE_MAPPED\
+ (MEM_CGROUP_FSTAT_BASE + __MEMCG_FILE_MAPPED)
+
struct mem_cgroup_stat_cpu {
s64 count[MEM_CGROUP_STAT_NSTATS];
};
@@ -334,7 +338,6 @@ enum charge_type {
/* only for here (for easy reading.) */
#define PCGF_CACHE (1UL << PCG_CACHE)
#define PCGF_USED (1UL << PCG_USED)
-#define PCGF_LOCK (1UL << PCG_LOCK)
/* Not used, but added here for completeness */
#define PCGF_ACCT (1UL << PCG_ACCT)
@@ -1513,7 +1516,7 @@ bool mem_cgroup_handle_oom(struct mem_cg
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
*/
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
@@ -1536,11 +1539,11 @@ void mem_cgroup_update_file_mapped(struc
goto done;
if (val > 0) {
- this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- SetPageCgroupFileMapped(pc);
+ this_cpu_inc(mem->stat->count[MEM_CGROUP_FSTAT_BASE + idx]);
+ SetPCGFileFlag(pc, idx);
} else {
- this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- ClearPageCgroupFileMapped(pc);
+ this_cpu_dec(mem->stat->count[MEM_CGROUP_FSTAT_BASE + idx]);
+ ClearPCGFileFlag(pc, idx);
}
done:
@@ -1549,6 +1552,11 @@ done:
rcu_read_unlock();
}
+void mem_cgroup_update_file_mapped(struct page *page, int val)
+{
+ return mem_cgroup_update_file_stat(page, __MEMCG_FILE_MAPPED, val);
+}
+
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
@@ -2008,17 +2016,20 @@ static void __mem_cgroup_commit_charge(s
static void __mem_cgroup_move_account(struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
{
+ int i;
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
- VM_BUG_ON(!PageCgroupLocked(pc));
+ VM_BUG_ON(!page_cgroup_locked(pc));
VM_BUG_ON(!PageCgroupUsed(pc));
VM_BUG_ON(id_to_memcg(pc->mem_cgroup) != from);
- if (PageCgroupFileMapped(pc)) {
+ for (i = 0; i < NR_MEMCG_FILE_STAT; ++i) {
+ if (!TestPCGFileFlag(pc, i))
+ continue;
/* Update mapped_file data for mem_cgroup */
preempt_disable();
- __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+ __this_cpu_dec(from->stat->count[MEM_CGROUP_FSTAT_BASE + i]);
+ __this_cpu_inc(to->stat->count[MEM_CGROUP_FSTAT_BASE + i]);
preempt_enable();
}
mem_cgroup_charge_statistics(from, pc, false);
Index: mmotm-0727/include/linux/page_cgroup.h
===================================================================
--- mmotm-0727.orig/include/linux/page_cgroup.h
+++ mmotm-0727/include/linux/page_cgroup.h
@@ -40,8 +40,8 @@ enum {
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
PCG_ACCT_LRU, /* page has been accounted for */
- PCG_FILE_MAPPED, /* page is accounted as "mapped" */
PCG_MIGRATION, /* under page migration */
+ PCG_FILE_FLAGS, /* see memcontrol.h */
};
#define TESTPCGFLAG(uname, lname) \
@@ -76,11 +76,6 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
TESTPCGFLAG(AcctLRU, ACCT_LRU)
TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
-
-SETPCGFLAG(FileMapped, FILE_MAPPED)
-CLEARPCGFLAG(FileMapped, FILE_MAPPED)
-TESTPCGFLAG(FileMapped, FILE_MAPPED)
-
SETPCGFLAG(Migration, MIGRATION)
CLEARPCGFLAG(Migration, MIGRATION)
TESTPCGFLAG(Migration, MIGRATION)
@@ -105,6 +100,20 @@ static inline void unlock_page_cgroup(st
bit_spin_unlock(PCG_LOCK, &pc->flags);
}
+static inline void SetPCGFileFlag(struct page_cgroup *pc, int idx)
+{
+ set_bit(PCG_FILE_FLAGS + idx, &pc->flags);
+}
+
+static inline void ClearPCGFileFlag(struct page_cgroup *pc, int idx)
+{
+ clear_bit(PCG_FILE_FLAGS + idx, &pc->flags);
+}
+static inline bool TestPCGFileFlag(struct page_cgroup *pc, int idx)
+{
+ return test_bit(PCG_FILE_FLAGS + idx, &pc->flags);
+}
+
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;
From: KAMEZAWA Hiroyuki <[email protected]>
This patch replaces bit_spinlock with spinlock. In general,
spinlock has better functionality than bit_spin_lock and we should use
it if we have room for it. On 64bit architectures, we have an extra 4 bytes.
Let's use it.
expected effects:
- use better codes.
- ticket lock on x86-64
 - para-virtualization aware lock
etc..
Changelog: 20100729
- fixed page_cgroup_locked().
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
--
---
include/linux/page_cgroup.h | 35 +++++++++++++++++++++++++++++++++--
mm/page_cgroup.c | 3 +++
2 files changed, 36 insertions(+), 2 deletions(-)
Index: mmotm-0727/include/linux/page_cgroup.h
===================================================================
--- mmotm-0727.orig/include/linux/page_cgroup.h
+++ mmotm-0727/include/linux/page_cgroup.h
@@ -10,8 +10,14 @@
* All page cgroups are allocated at boot or memory hotplug event,
* then the page cgroup for pfn always exists.
*/
+#ifdef CONFIG_64BIT
+#define PCG_HAS_SPINLOCK
+#endif
struct page_cgroup {
unsigned long flags;
+#ifdef PCG_HAS_SPINLOCK
+ spinlock_t lock;
+#endif
unsigned short mem_cgroup; /* ID of assigned memory cgroup */
unsigned short blk_cgroup; /* Not Used..but will be. */
struct page *page;
@@ -36,7 +42,9 @@ struct page_cgroup *lookup_page_cgroup(s
enum {
/* flags for mem_cgroup */
- PCG_LOCK, /* page cgroup is locked */
+#ifndef PCG_HAS_SPINLOCK
+ PCG_LOCK, /* page cgroup is locked (see below also.)*/
+#endif
PCG_CACHE, /* charged as cache */
PCG_USED, /* this object is in use. */
PCG_ACCT_LRU, /* page has been accounted for */
@@ -60,7 +68,7 @@ static inline void ClearPageCgroup##unam
static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \
{ return test_and_clear_bit(PCG_##lname, &pc->flags); }
-TESTPCGFLAG(Locked, LOCK)
+
/* Cache flag is set only once (at allocation) */
TESTPCGFLAG(Cache, CACHE)
@@ -90,6 +98,22 @@ static inline enum zone_type page_cgroup
return page_zonenum(pc->page);
}
+#ifdef PCG_HAS_SPINLOCK
+static inline void lock_page_cgroup(struct page_cgroup *pc)
+{
+ spin_lock(&pc->lock);
+}
+static inline void unlock_page_cgroup(struct page_cgroup *pc)
+{
+ spin_unlock(&pc->lock);
+}
+
+static inline bool page_cgroup_locked(struct page_cgroup *pc)
+{
+ return spin_is_locked(&pc->lock);
+}
+
+#else
static inline void lock_page_cgroup(struct page_cgroup *pc)
{
bit_spin_lock(PCG_LOCK, &pc->flags);
@@ -100,6 +124,13 @@ static inline void unlock_page_cgroup(st
bit_spin_unlock(PCG_LOCK, &pc->flags);
}
+static inline bool page_cgroup_locked(struct page_cgroup *pc)
+{
+ return test_bit(PCG_LOCK, &pc->flags);
+}
+
+#endif
+
static inline void SetPCGFileFlag(struct page_cgroup *pc, int idx)
{
set_bit(PCG_FILE_FLAGS + idx, &pc->flags);
Index: mmotm-0727/mm/page_cgroup.c
===================================================================
--- mmotm-0727.orig/mm/page_cgroup.c
+++ mmotm-0727/mm/page_cgroup.c
@@ -18,6 +18,9 @@ __init_page_cgroup(struct page_cgroup *p
pc->mem_cgroup = 0;
pc->page = pfn_to_page(pfn);
INIT_LIST_HEAD(&pc->lru);
+#ifdef PCG_HAS_SPINLOCK
+ spin_lock_init(&pc->lock);
+#endif
}
static unsigned long total_usage;
KAMEZAWA Hiroyuki <[email protected]> writes:
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Now, memory cgroup has an ID per cgroup and make use of it at
> - hierarchy walk,
> - swap recording.
>
> This patch is for making more use of it. The final purpose is
> to replace page_cgroup->mem_cgroup's pointer to an unsigned short.
>
> This patch caches a pointer of memcg in an array. By this, we
> don't have to call css_lookup() which requires radix-hash walk.
> This saves some amount of memory footprint at lookup memcg via id.
> It's called in very fast path and need to be quick AMAP.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> init/Kconfig | 11 +++++++++++
> mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++--------------
> 2 files changed, 43 insertions(+), 14 deletions(-)
>
> Index: mmotm-0727/mm/memcontrol.c
> ===================================================================
> --- mmotm-0727.orig/mm/memcontrol.c
> +++ mmotm-0727/mm/memcontrol.c
> @@ -292,6 +292,27 @@ static bool move_file(void)
> &mc.to->move_charge_at_immigrate);
> }
>
> +atomic_t mem_cgroup_num;
Maybe static and init?
+ static atomic_t mem_cgroup_num = ATOMIC_INIT(0);
> +struct mem_cgroup *mem_cgroups[CONFIG_MEM_CGROUP_MAX_GROUPS] __read_mostly;
Make this static?
Because value [0] is reserved, maybe this should be:
+struct mem_cgroup *mem_cgroups[CONFIG_MEM_CGROUP_MAX_GROUPS+1] __read_mostly;
> +
> +static struct mem_cgroup* id_to_memcg(unsigned short id)
> +{
> + /*
> + * This array is set to NULL when mem_cgroup is freed.
> + * IOW, there are no more references && rcu_synchronized().
> + * This lookup-caching is safe.
> + */
> + if (unlikely(!mem_cgroups[id])) {
> + struct cgroup_subsys_state *css;
> + rcu_read_lock();
> + css = css_lookup(&mem_cgroup_subsys, id);
> + if (!css)
> + return NULL;
> + rcu_read_unlock();
I think we should move rcu_read_unlock() above to just before "if
(!css)" to unlock rcu when returning NULL.
> + mem_cgroups[id] = container_of(css, struct mem_cgroup, css);
> + }
> + return mem_cgroups[id];
> +}
> /*
> * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
> * limit reclaim to prevent infinite loops, if they ever occur.
> @@ -1824,18 +1845,7 @@ static void mem_cgroup_cancel_charge(str
> * it's concern. (dropping refcnt from swap can be called against removed
> * memcg.)
> */
> -static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
> -{
> - struct cgroup_subsys_state *css;
>
> - /* ID 0 is unused ID */
> - if (!id)
> - return NULL;
> - css = css_lookup(&mem_cgroup_subsys, id);
> - if (!css)
> - return NULL;
> - return container_of(css, struct mem_cgroup, css);
> -}
>
> struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
> {
> @@ -1856,7 +1866,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
> ent.val = page_private(page);
> id = lookup_swap_cgroup(ent);
> rcu_read_lock();
> - mem = mem_cgroup_lookup(id);
> + mem = id_to_memcg(id);
> if (mem && !css_tryget(&mem->css))
> mem = NULL;
> rcu_read_unlock();
> @@ -2208,7 +2218,7 @@ __mem_cgroup_commit_charge_swapin(struct
>
> id = swap_cgroup_record(ent, 0);
> rcu_read_lock();
> - memcg = mem_cgroup_lookup(id);
> + memcg = id_to_memcg(id);
> if (memcg) {
> /*
> * This recorded memcg can be obsolete one. So, avoid
> @@ -2472,7 +2482,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
>
> id = swap_cgroup_record(ent, 0);
> rcu_read_lock();
> - memcg = mem_cgroup_lookup(id);
> + memcg = id_to_memcg(id);
> if (memcg) {
> /*
> * We uncharge this because swap is freed.
> @@ -3988,6 +3998,10 @@ static struct mem_cgroup *mem_cgroup_all
> struct mem_cgroup *mem;
> int size = sizeof(struct mem_cgroup);
>
> + /* 0 is unused */
> + if (atomic_read(&mem_cgroup_num) == CONFIG_MEM_CGROUP_MAX_GROUPS-1)
> + return NULL;
> +
> /* Can be very big if MAX_NUMNODES is very big */
> if (size < PAGE_SIZE)
> mem = kmalloc(size, GFP_KERNEL);
> @@ -4025,7 +4039,10 @@ static void __mem_cgroup_free(struct mem
> int node;
>
> mem_cgroup_remove_from_trees(mem);
> + /* No more lookup against this ID */
> + mem_cgroups[css_id(&mem->css)] = NULL;
Are css_id() values tightly packed? If there are 4 memcg allocated, are
we guaranteed that all of their id's have value 1..4?
> free_css_id(&mem_cgroup_subsys, &mem->css);
> + atomic_dec(&mem_cgroup_num);
>
> for_each_node_state(node, N_POSSIBLE)
> free_mem_cgroup_per_zone_info(mem, node);
> @@ -4162,6 +4179,7 @@ mem_cgroup_create(struct cgroup_subsys *
> atomic_set(&mem->refcnt, 1);
> mem->move_charge_at_immigrate = 0;
> mutex_init(&mem->thresholds_lock);
> + atomic_inc(&mem_cgroup_num);
> return &mem->css;
> free_out:
> __mem_cgroup_free(mem);
> Index: mmotm-0727/init/Kconfig
> ===================================================================
> --- mmotm-0727.orig/init/Kconfig
> +++ mmotm-0727/init/Kconfig
> @@ -594,6 +594,17 @@ config CGROUP_MEM_RES_CTLR_SWAP
> Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
> size is 4096bytes, 512k per 1Gbytes of swap.
>
> +config MEM_CGROUP_MAX_GROUPS
> + int "Maximum number of memory cgroups on a system"
> + range 1 65535
> + default 8192 if 64BIT
> + default 2048 if 32BIT
> + help
> + Memory cgroup has limitation of the number of groups created.
> + Please select your favorite value. The more you allow, the more
> + memory will be consumed. This consumes vmalloc() area, so,
> + this should be small on 32bit arch.
> +
> menuconfig CGROUP_SCHED
> bool "Group CPU scheduler"
> depends on EXPERIMENTAL && CGROUPS
KAMEZAWA Hiroyuki <[email protected]> writes:
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Now, addresses of memory cgroup can be calculated by their ID without complex.
> This patch relplaces pc->mem_cgroup from a pointer to a unsigned short.
> On 64bit architecture, this offers us more 6bytes room per page_cgroup.
> Use 2bytes for blkio-cgroup's page tracking. More 4bytes will be used for
> some light-weight concurrent access.
>
> We may able to move this id onto flags field but ...go step by step.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> include/linux/page_cgroup.h | 3 ++-
> mm/memcontrol.c | 40 +++++++++++++++++++++++++---------------
> mm/page_cgroup.c | 2 +-
> 3 files changed, 28 insertions(+), 17 deletions(-)
>
> Index: mmotm-0727/include/linux/page_cgroup.h
> ===================================================================
> --- mmotm-0727.orig/include/linux/page_cgroup.h
> +++ mmotm-0727/include/linux/page_cgroup.h
> @@ -12,7 +12,8 @@
> */
> struct page_cgroup {
> unsigned long flags;
> - struct mem_cgroup *mem_cgroup;
> + unsigned short mem_cgroup; /* ID of assigned memory cgroup */
> + unsigned short blk_cgroup; /* Not Used..but will be. */
> struct page *page;
> struct list_head lru; /* per cgroup LRU list */
> };
> Index: mmotm-0727/mm/page_cgroup.c
> ===================================================================
> --- mmotm-0727.orig/mm/page_cgroup.c
> +++ mmotm-0727/mm/page_cgroup.c
> @@ -15,7 +15,7 @@ static void __meminit
> __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
> {
> pc->flags = 0;
> - pc->mem_cgroup = NULL;
> + pc->mem_cgroup = 0;
> pc->page = pfn_to_page(pfn);
> INIT_LIST_HEAD(&pc->lru);
> }
> Index: mmotm-0727/mm/memcontrol.c
> ===================================================================
> --- mmotm-0727.orig/mm/memcontrol.c
> +++ mmotm-0727/mm/memcontrol.c
> @@ -376,7 +376,7 @@ struct cgroup_subsys_state *mem_cgroup_c
> static struct mem_cgroup_per_zone *
> page_cgroup_zoneinfo(struct page_cgroup *pc)
> {
> - struct mem_cgroup *mem = pc->mem_cgroup;
> + struct mem_cgroup *mem = id_to_memcg(pc->mem_cgroup);
> int nid = page_cgroup_nid(pc);
> int zid = page_cgroup_zid(pc);
>
> @@ -581,7 +581,11 @@ static void mem_cgroup_charge_statistics
> bool charge)
> {
> int val = (charge) ? 1 : -1;
> -
> + if (pc->mem_cgroup == 0) {
> + show_stack(NULL, NULL);
> + printk("charge to 0\n");
> + while(1);
> + }
Why hang the task here? If this is bad then maybe BUG_ON()?
> preempt_disable();
>
> if (PageCgroupCache(pc))
> @@ -718,6 +722,11 @@ static inline bool mem_cgroup_is_root(st
> return (mem == root_mem_cgroup);
> }
>
> +static inline bool mem_cgroup_is_rootid(unsigned short id)
> +{
> + return (id == 1);
> +}
> +
> /*
> * Following LRU functions are allowed to be used without PCG_LOCK.
> * Operations are called by routine of global LRU independently from memcg.
> @@ -750,7 +759,7 @@ void mem_cgroup_del_lru_list(struct page
> */
> mz = page_cgroup_zoneinfo(pc);
> MEM_CGROUP_ZSTAT(mz, lru) -= 1;
> - if (mem_cgroup_is_root(pc->mem_cgroup))
> + if (mem_cgroup_is_rootid(pc->mem_cgroup))
> return;
> VM_BUG_ON(list_empty(&pc->lru));
> list_del_init(&pc->lru);
> @@ -777,7 +786,7 @@ void mem_cgroup_rotate_lru_list(struct p
> */
> smp_rmb();
> /* unused or root page is not rotated. */
> - if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
> + if (!PageCgroupUsed(pc) || mem_cgroup_is_rootid(pc->mem_cgroup))
> return;
> mz = page_cgroup_zoneinfo(pc);
> list_move(&pc->lru, &mz->lists[lru]);
> @@ -803,7 +812,7 @@ void mem_cgroup_add_lru_list(struct page
> mz = page_cgroup_zoneinfo(pc);
> MEM_CGROUP_ZSTAT(mz, lru) += 1;
> SetPageCgroupAcctLRU(pc);
> - if (mem_cgroup_is_root(pc->mem_cgroup))
> + if (mem_cgroup_is_rootid(pc->mem_cgroup))
> return;
> list_add(&pc->lru, &mz->lists[lru]);
> }
> @@ -1471,7 +1480,7 @@ void mem_cgroup_update_file_mapped(struc
> return;
>
> lock_page_cgroup(pc);
> - mem = pc->mem_cgroup;
> + mem = id_to_memcg(pc->mem_cgroup);
> if (!mem || !PageCgroupUsed(pc))
> goto done;
>
> @@ -1859,7 +1868,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
> pc = lookup_page_cgroup(page);
> lock_page_cgroup(pc);
> if (PageCgroupUsed(pc)) {
> - mem = pc->mem_cgroup;
> + mem = id_to_memcg(pc->mem_cgroup);
> if (mem && !css_tryget(&mem->css))
> mem = NULL;
> } else if (PageSwapCache(page)) {
> @@ -1895,7 +1904,7 @@ static void __mem_cgroup_commit_charge(s
> return;
> }
>
> - pc->mem_cgroup = mem;
> + pc->mem_cgroup = css_id(&mem->css);
> /*
> * We access a page_cgroup asynchronously without lock_page_cgroup().
> * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
> @@ -1953,7 +1962,7 @@ static void __mem_cgroup_move_account(st
> VM_BUG_ON(PageLRU(pc->page));
> VM_BUG_ON(!PageCgroupLocked(pc));
> VM_BUG_ON(!PageCgroupUsed(pc));
> - VM_BUG_ON(pc->mem_cgroup != from);
> + VM_BUG_ON(id_to_memcg(pc->mem_cgroup) != from);
>
> if (PageCgroupFileMapped(pc)) {
> /* Update mapped_file data for mem_cgroup */
> @@ -1968,7 +1977,7 @@ static void __mem_cgroup_move_account(st
> mem_cgroup_cancel_charge(from);
>
> /* caller should have done css_get */
> - pc->mem_cgroup = to;
> + pc->mem_cgroup = css_id(&to->css);
> mem_cgroup_charge_statistics(to, pc, true);
> /*
> * We charges against "to" which may not have any tasks. Then, "to"
> @@ -1988,7 +1997,7 @@ static int mem_cgroup_move_account(struc
> {
> int ret = -EINVAL;
> lock_page_cgroup(pc);
> - if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
> + if (PageCgroupUsed(pc) && id_to_memcg(pc->mem_cgroup) == from) {
> __mem_cgroup_move_account(pc, from, to, uncharge);
> ret = 0;
> }
> @@ -2327,9 +2336,9 @@ __mem_cgroup_uncharge_common(struct page
>
> lock_page_cgroup(pc);
>
> - mem = pc->mem_cgroup;
> + mem = id_to_memcg(pc->mem_cgroup);
>
> - if (!PageCgroupUsed(pc))
> + if (!mem || !PageCgroupUsed(pc))
Why add the extra !mem check here?
Can PageCgroupUsed() return true if mem==NULL?
> goto unlock_out;
>
> switch (ctype) {
> @@ -2572,7 +2581,7 @@ int mem_cgroup_prepare_migration(struct
> pc = lookup_page_cgroup(page);
> lock_page_cgroup(pc);
> if (PageCgroupUsed(pc)) {
> - mem = pc->mem_cgroup;
> + mem = id_to_memcg(pc->mem_cgroup);
> css_get(&mem->css);
> /*
> * At migrating an anonymous page, its mapcount goes down
> @@ -4396,7 +4405,8 @@ static int is_target_pte_for_mc(struct v
> * mem_cgroup_move_account() checks the pc is valid or not under
> * the lock.
> */
> - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
> + if (PageCgroupUsed(pc) &&
> + id_to_memcg(pc->mem_cgroup) == mc.from) {
> ret = MC_TARGET_PAGE;
> if (target)
> target->page = page;
On Thu, 29 Jul 2010 11:03:26 -0700
Greg Thelen <[email protected]> wrote:
> KAMEZAWA Hiroyuki <[email protected]> writes:
>
> > From: KAMEZAWA Hiroyuki <[email protected]>
> >
> > Now, memory cgroup has an ID per cgroup and make use of it at
> > - hierarchy walk,
> > - swap recording.
> >
> > This patch is for making more use of it. The final purpose is
> > to replace page_cgroup->mem_cgroup's pointer to an unsigned short.
> >
> > This patch caches a pointer of memcg in an array. By this, we
> > don't have to call css_lookup() which requires radix-hash walk.
> > This saves some amount of memory footprint at lookup memcg via id.
> > It's called in very fast path and need to be quick AMAP.
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > ---
> > init/Kconfig | 11 +++++++++++
> > mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++--------------
> > 2 files changed, 43 insertions(+), 14 deletions(-)
> >
> > Index: mmotm-0727/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-0727.orig/mm/memcontrol.c
> > +++ mmotm-0727/mm/memcontrol.c
> > @@ -292,6 +292,27 @@ static bool move_file(void)
> > &mc.to->move_charge_at_immigrate);
> > }
> >
> > +atomic_t mem_cgroup_num;
>
> Maybe static and init?
> + static atomic_t mem_cgroup_num = ATOMIC_INIT(0);
>
IIUC, "0" is not required to be initialized. But ok, this will be static.
> > +struct mem_cgroup *mem_cgroups[CONFIG_MEM_CGROUP_MAX_GROUPS] __read_mostly;
>
> Make this static?
>
> Because value [0] is reserved, maybe this should be:
> +struct mem_cgroup *mem_cgroups[CONFIG_MEM_CGROUP_MAX_GROUPS+1] __read_mostly;
>
Hmm? I don't like this. I'll note in the CONFIG help text that "0" is unused.
> > +
> > +static struct mem_cgroup* id_to_memcg(unsigned short id)
> > +{
> > + /*
> > + * This array is set to NULL when mem_cgroup is freed.
> > + * IOW, there are no more references && rcu_synchronized().
> > + * This lookup-caching is safe.
> > + */
> > + if (unlikely(!mem_cgroups[id])) {
> > + struct cgroup_subsys_state *css;
> > + rcu_read_lock();
> > + css = css_lookup(&mem_cgroup_subsys, id);
> > + if (!css)
> > + return NULL;
> > + rcu_read_unlock();
>
> I think we should move rcu_read_unlock() above to just before "if
> (!css)" to unlock rcu when returning NULL.
>
yes.
> > + mem_cgroups[id] = container_of(css, struct mem_cgroup, css);
> > + }
> > + return mem_cgroups[id];
> > +}
> > /*
> > * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
> > * limit reclaim to prevent infinite loops, if they ever occur.
> > @@ -1824,18 +1845,7 @@ static void mem_cgroup_cancel_charge(str
> > * it's concern. (dropping refcnt from swap can be called against removed
> > * memcg.)
> > */
> > -static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
> > -{
> > - struct cgroup_subsys_state *css;
> >
> > - /* ID 0 is unused ID */
> > - if (!id)
> > - return NULL;
> > - css = css_lookup(&mem_cgroup_subsys, id);
> > - if (!css)
> > - return NULL;
> > - return container_of(css, struct mem_cgroup, css);
> > -}
> >
> > struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
> > {
> > @@ -1856,7 +1866,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
> > ent.val = page_private(page);
> > id = lookup_swap_cgroup(ent);
> > rcu_read_lock();
> > - mem = mem_cgroup_lookup(id);
> > + mem = id_to_memcg(id);
> > if (mem && !css_tryget(&mem->css))
> > mem = NULL;
> > rcu_read_unlock();
> > @@ -2208,7 +2218,7 @@ __mem_cgroup_commit_charge_swapin(struct
> >
> > id = swap_cgroup_record(ent, 0);
> > rcu_read_lock();
> > - memcg = mem_cgroup_lookup(id);
> > + memcg = id_to_memcg(id);
> > if (memcg) {
> > /*
> > * This recorded memcg can be obsolete one. So, avoid
> > @@ -2472,7 +2482,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
> >
> > id = swap_cgroup_record(ent, 0);
> > rcu_read_lock();
> > - memcg = mem_cgroup_lookup(id);
> > + memcg = id_to_memcg(id);
> > if (memcg) {
> > /*
> > * We uncharge this because swap is freed.
> > @@ -3988,6 +3998,10 @@ static struct mem_cgroup *mem_cgroup_all
> > struct mem_cgroup *mem;
> > int size = sizeof(struct mem_cgroup);
> >
> > + /* 0 is unused */
> > + if (atomic_read(&mem_cgroup_num) == CONFIG_MEM_CGROUP_MAX_GROUPS-1)
> > + return NULL;
> > +
> > /* Can be very big if MAX_NUMNODES is very big */
> > if (size < PAGE_SIZE)
> > mem = kmalloc(size, GFP_KERNEL);
> > @@ -4025,7 +4039,10 @@ static void __mem_cgroup_free(struct mem
> > int node;
> >
> > mem_cgroup_remove_from_trees(mem);
> > + /* No more lookup against this ID */
> > + mem_cgroups[css_id(&mem->css)] = NULL;
>
> Are css_id() values tightly packed? If there are 4 memcg allocated, are
> we guaranteed that all of their id's have value 1..4?
>
No. It can be sparse.
Thanks,
-Kame
On Thu, 29 Jul 2010 11:31:15 -0700
Greg Thelen <[email protected]> wrote:
> KAMEZAWA Hiroyuki <[email protected]> writes:
>
> > From: KAMEZAWA Hiroyuki <[email protected]>
> >
> > Now, addresses of memory cgroup can be calculated by their ID without complex.
> > This patch relplaces pc->mem_cgroup from a pointer to a unsigned short.
> > On 64bit architecture, this offers us more 6bytes room per page_cgroup.
> > Use 2bytes for blkio-cgroup's page tracking. More 4bytes will be used for
> > some light-weight concurrent access.
> >
> > We may able to move this id onto flags field but ...go step by step.
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > ---
> > include/linux/page_cgroup.h | 3 ++-
> > mm/memcontrol.c | 40 +++++++++++++++++++++++++---------------
> > mm/page_cgroup.c | 2 +-
> > 3 files changed, 28 insertions(+), 17 deletions(-)
> >
> > Index: mmotm-0727/include/linux/page_cgroup.h
> > ===================================================================
> > --- mmotm-0727.orig/include/linux/page_cgroup.h
> > +++ mmotm-0727/include/linux/page_cgroup.h
> > @@ -12,7 +12,8 @@
> > */
> > struct page_cgroup {
> > unsigned long flags;
> > - struct mem_cgroup *mem_cgroup;
> > + unsigned short mem_cgroup; /* ID of assigned memory cgroup */
> > + unsigned short blk_cgroup; /* Not Used..but will be. */
> > struct page *page;
> > struct list_head lru; /* per cgroup LRU list */
> > };
> > Index: mmotm-0727/mm/page_cgroup.c
> > ===================================================================
> > --- mmotm-0727.orig/mm/page_cgroup.c
> > +++ mmotm-0727/mm/page_cgroup.c
> > @@ -15,7 +15,7 @@ static void __meminit
> > __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
> > {
> > pc->flags = 0;
> > - pc->mem_cgroup = NULL;
> > + pc->mem_cgroup = 0;
> > pc->page = pfn_to_page(pfn);
> > INIT_LIST_HEAD(&pc->lru);
> > }
> > Index: mmotm-0727/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-0727.orig/mm/memcontrol.c
> > +++ mmotm-0727/mm/memcontrol.c
> > @@ -376,7 +376,7 @@ struct cgroup_subsys_state *mem_cgroup_c
> > static struct mem_cgroup_per_zone *
> > page_cgroup_zoneinfo(struct page_cgroup *pc)
> > {
> > - struct mem_cgroup *mem = pc->mem_cgroup;
> > + struct mem_cgroup *mem = id_to_memcg(pc->mem_cgroup);
> > int nid = page_cgroup_nid(pc);
> > int zid = page_cgroup_zid(pc);
> >
> > @@ -581,7 +581,11 @@ static void mem_cgroup_charge_statistics
> > bool charge)
> > {
> > int val = (charge) ? 1 : -1;
> > -
> > + if (pc->mem_cgroup == 0) {
> > + show_stack(NULL, NULL);
> > + printk("charge to 0\n");
> > + while(1);
> > + }
> Why hang the task here. If this is bad then maybe BUG_ON()?
Ouch, debug code was left in. Will remove it.
> > preempt_disable();
> >
> > if (PageCgroupCache(pc))
> > @@ -718,6 +722,11 @@ static inline bool mem_cgroup_is_root(st
> > return (mem == root_mem_cgroup);
> > }
> >
> > +static inline bool mem_cgroup_is_rootid(unsigned short id)
> > +{
> > + return (id == 1);
> > +}
> > +
> > /*
> > * Following LRU functions are allowed to be used without PCG_LOCK.
> > * Operations are called by routine of global LRU independently from memcg.
> > @@ -750,7 +759,7 @@ void mem_cgroup_del_lru_list(struct page
> > */
> > mz = page_cgroup_zoneinfo(pc);
> > MEM_CGROUP_ZSTAT(mz, lru) -= 1;
> > - if (mem_cgroup_is_root(pc->mem_cgroup))
> > + if (mem_cgroup_is_rootid(pc->mem_cgroup))
> > return;
> > VM_BUG_ON(list_empty(&pc->lru));
> > list_del_init(&pc->lru);
> > @@ -777,7 +786,7 @@ void mem_cgroup_rotate_lru_list(struct p
> > */
> > smp_rmb();
> > /* unused or root page is not rotated. */
> > - if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
> > + if (!PageCgroupUsed(pc) || mem_cgroup_is_rootid(pc->mem_cgroup))
> > return;
> > mz = page_cgroup_zoneinfo(pc);
> > list_move(&pc->lru, &mz->lists[lru]);
> > @@ -803,7 +812,7 @@ void mem_cgroup_add_lru_list(struct page
> > mz = page_cgroup_zoneinfo(pc);
> > MEM_CGROUP_ZSTAT(mz, lru) += 1;
> > SetPageCgroupAcctLRU(pc);
> > - if (mem_cgroup_is_root(pc->mem_cgroup))
> > + if (mem_cgroup_is_rootid(pc->mem_cgroup))
> > return;
> > list_add(&pc->lru, &mz->lists[lru]);
> > }
> > @@ -1471,7 +1480,7 @@ void mem_cgroup_update_file_mapped(struc
> > return;
> >
> > lock_page_cgroup(pc);
> > - mem = pc->mem_cgroup;
> > + mem = id_to_memcg(pc->mem_cgroup);
> > if (!mem || !PageCgroupUsed(pc))
> > goto done;
> >
> > @@ -1859,7 +1868,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
> > pc = lookup_page_cgroup(page);
> > lock_page_cgroup(pc);
> > if (PageCgroupUsed(pc)) {
> > - mem = pc->mem_cgroup;
> > + mem = id_to_memcg(pc->mem_cgroup);
> > if (mem && !css_tryget(&mem->css))
> > mem = NULL;
> > } else if (PageSwapCache(page)) {
> > @@ -1895,7 +1904,7 @@ static void __mem_cgroup_commit_charge(s
> > return;
> > }
> >
> > - pc->mem_cgroup = mem;
> > + pc->mem_cgroup = css_id(&mem->css);
> > /*
> > * We access a page_cgroup asynchronously without lock_page_cgroup().
> > * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
> > @@ -1953,7 +1962,7 @@ static void __mem_cgroup_move_account(st
> > VM_BUG_ON(PageLRU(pc->page));
> > VM_BUG_ON(!PageCgroupLocked(pc));
> > VM_BUG_ON(!PageCgroupUsed(pc));
> > - VM_BUG_ON(pc->mem_cgroup != from);
> > + VM_BUG_ON(id_to_memcg(pc->mem_cgroup) != from);
> >
> > if (PageCgroupFileMapped(pc)) {
> > /* Update mapped_file data for mem_cgroup */
> > @@ -1968,7 +1977,7 @@ static void __mem_cgroup_move_account(st
> > mem_cgroup_cancel_charge(from);
> >
> > /* caller should have done css_get */
> > - pc->mem_cgroup = to;
> > + pc->mem_cgroup = css_id(&to->css);
> > mem_cgroup_charge_statistics(to, pc, true);
> > /*
> > * We charges against "to" which may not have any tasks. Then, "to"
> > @@ -1988,7 +1997,7 @@ static int mem_cgroup_move_account(struc
> > {
> > int ret = -EINVAL;
> > lock_page_cgroup(pc);
> > - if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
> > + if (PageCgroupUsed(pc) && id_to_memcg(pc->mem_cgroup) == from) {
> > __mem_cgroup_move_account(pc, from, to, uncharge);
> > ret = 0;
> > }
> > @@ -2327,9 +2336,9 @@ __mem_cgroup_uncharge_common(struct page
> >
> > lock_page_cgroup(pc);
> >
> > - mem = pc->mem_cgroup;
> > + mem = id_to_memcg(pc->mem_cgroup);
> >
> > - if (!PageCgroupUsed(pc))
> > + if (!mem || !PageCgroupUsed(pc))
> Why add the extra !mem check here?
>
> Can PageCgroupUsed() return true if mem==NULL?
>
mem && PageCgroupUsed() => what we want
mem && !PageCgroupUsed() => can be true
!mem && PageCgroupUsed() => never happens(bug?)
!mem && !PageCgroupUsed() => a clean state.
Ah, yes. Debug code again. Will remove it.
Thanks,
-Kame
On Thu, 29 Jul 2010 18:42:50 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> Hi, this version removes virt-array and use simple id <-> memcg table.
> and removed RFC.
>
Very sorry, I'll post v3. Please ignore this version; it is too low quality.
Thanks,
-Kame