2011-02-03 14:28:00

by Johannes Weiner

Subject: memcg: save 20% of per-page memcg memory overhead

This patch series removes the direct page pointer from struct
page_cgroup, which saves 20% of per-page memcg memory overhead (Fedora
and Ubuntu enable memcg by default, openSUSE apparently too).
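
The 20% follows from the struct layout: flags, the mem_cgroup pointer, the
page pointer and the two list_head words make five words per page_cgroup on
64-bit, and dropping the page pointer leaves four. A minimal userspace
sketch of that arithmetic, with the kernel types stubbed out:

#include <stdio.h>

/* rough model of struct page_cgroup before/after the series; 64-bit assumed */
struct list_head { void *next, *prev; };

struct page_cgroup_old {
	unsigned long flags;
	void *mem_cgroup;
	void *page;			/* the pointer this series removes */
	struct list_head lru;
};

struct page_cgroup_new {
	unsigned long flags;
	void *mem_cgroup;
	struct list_head lru;
};

int main(void)
{
	size_t old = sizeof(struct page_cgroup_old);	/* 40 bytes */
	size_t new = sizeof(struct page_cgroup_new);	/* 32 bytes */

	printf("%zu -> %zu bytes per page, %.0f%% saved\n",
	       old, new, 100.0 * (old - new) / old);
	return 0;
}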

The node id or section number is encoded in the remaining free bits of
pc->flags which allows calculating the corresponding page without the
extra pointer.
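
As a standalone illustration of the encoding (PCG_ARRAYID_SHIFT is a made-up
width here; the real one comes from NODES_SHIFT or SECTIONS_SHIFT, see the
last patch):

#include <assert.h>
#include <stdio.h>

#define BITS_PER_LONG		(sizeof(unsigned long) * 8)
#define PCG_ARRAYID_SHIFT	10	/* illustrative only */
#define PCG_ARRAYID_OFFSET	(BITS_PER_LONG - PCG_ARRAYID_SHIFT)
#define PCG_ARRAYID_MASK	((1UL << PCG_ARRAYID_SHIFT) - 1)

/* stash the node/section number in the upper bits of a flags word */
static void set_array_id(unsigned long *flags, unsigned long id)
{
	*flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_OFFSET);
	*flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_OFFSET;
}

static unsigned long get_array_id(unsigned long flags)
{
	return (flags >> PCG_ARRAYID_OFFSET) & PCG_ARRAYID_MASK;
}

int main(void)
{
	unsigned long flags = 0x5;	/* pretend some low PCG_* bits are set */

	set_array_id(&flags, 3);	/* page sits on node/section 3 */
	assert(get_array_id(flags) == 3);
	/* the low flag bits are untouched */
	assert((flags & ((1UL << PCG_ARRAYID_OFFSET) - 1)) == 0x5);
	printf("flags = %#lx\n", flags);
	return 0;
}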

I ran what I think is a worst-case microbenchmark that just cats a
large sparse file to /dev/null, because it means that walking the LRU
list on behalf of per-cgroup reclaim and looking up pages from
page_cgroups is happening constantly and at a high rate. But it made
no measurable difference. A profile reported a 0.11% share of the new
lookup_cgroup_page() function in this benchmark.

Hannes


2011-02-03 14:27:12

by Johannes Weiner

Subject: [patch 1/5] memcg: no uncharged pages reach page_cgroup_zoneinfo

All callsites check PCG_USED before passing pc->mem_cgroup, so the
latter is never NULL.

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/memcontrol.c | 3 ---
1 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e071d7e..85b4b5a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -370,9 +370,6 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
int nid = page_cgroup_nid(pc);
int zid = page_cgroup_zid(pc);

- if (!mem)
- return NULL;
-
return mem_cgroup_zoneinfo(mem, nid, zid);
}

--
1.7.4

2011-02-03 14:27:17

by Johannes Weiner

Subject: [patch 2/5] memcg: change page_cgroup_zoneinfo signature

Instead of passing a whole struct page_cgroup to this function, let it
take only what it really needs from it: the struct mem_cgroup and the
page.

This has the advantage that reading pc->mem_cgroup is now done at the
same place where the ordering rules for this pointer are enforced and
explained.

It is also in preparation for removing the pc->page backpointer.

Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/page_cgroup.h | 10 ----------
mm/memcontrol.c | 17 ++++++++---------
2 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 6d6cb7a..363bbc8 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -85,16 +85,6 @@ SETPCGFLAG(Migration, MIGRATION)
CLEARPCGFLAG(Migration, MIGRATION)
TESTPCGFLAG(Migration, MIGRATION)

-static inline int page_cgroup_nid(struct page_cgroup *pc)
-{
- return page_to_nid(pc->page);
-}
-
-static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
-{
- return page_zonenum(pc->page);
-}
-
static inline void lock_page_cgroup(struct page_cgroup *pc)
{
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 85b4b5a..77a3f87 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -364,11 +364,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
}

static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
+page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
{
- struct mem_cgroup *mem = pc->mem_cgroup;
- int nid = page_cgroup_nid(pc);
- int zid = page_cgroup_zid(pc);
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);

return mem_cgroup_zoneinfo(mem, nid, zid);
}
@@ -800,7 +799,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
* We don't check PCG_USED bit. It's cleared when the "page" is finally
* removed from global LRU.
*/
- mz = page_cgroup_zoneinfo(pc);
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
/* huge page split is done under lru_lock. so, we have no races. */
MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
if (mem_cgroup_is_root(pc->mem_cgroup))
@@ -830,7 +829,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
smp_rmb();
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
- mz = page_cgroup_zoneinfo(pc);
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
list_move(&pc->lru, &mz->lists[lru]);
}

@@ -847,7 +846,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
return;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
- mz = page_cgroup_zoneinfo(pc);
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
/* huge page split is done under lru_lock. so, we have no races. */
MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
SetPageCgroupAcctLRU(pc);
@@ -1017,7 +1016,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
return NULL;
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
- mz = page_cgroup_zoneinfo(pc);
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
if (!mz)
return NULL;

@@ -2166,7 +2165,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
* We hold lru_lock, then, reduce counter directly.
*/
lru = page_lru(head);
- mz = page_cgroup_zoneinfo(head_pc);
+ mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
}
tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
--
1.7.4

2011-02-03 14:27:18

by Johannes Weiner

Subject: [patch 3/5] memcg: fold __mem_cgroup_move_account into caller

It is one logical function, no need to have it split up.

Also, get rid of some checks from the inner function that ensured the
sanity of the outer function.

Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/page_cgroup.h | 5 ---
mm/memcontrol.c | 66 +++++++++++++++++++------------------------
2 files changed, 29 insertions(+), 42 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 363bbc8..6b63679 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -99,11 +99,6 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
bit_spin_unlock(PCG_LOCK, &pc->flags);
}

-static inline int page_is_cgroup_locked(struct page_cgroup *pc)
-{
- return bit_spin_is_locked(PCG_LOCK, &pc->flags);
-}
-
static inline void move_lock_page_cgroup(struct page_cgroup *pc,
unsigned long *flags)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 77a3f87..5eb0dc2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2174,33 +2174,49 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
#endif

/**
- * __mem_cgroup_move_account - move account of the page
+ * mem_cgroup_move_account - move account of the page
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
* @uncharge: whether we should call uncharge and css_put against @from.
+ * @charge_size: number of bytes to charge (regular or huge page)
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
- * - the pc is locked, used, and ->mem_cgroup points to @from.
+ * - compound_lock is held when charge_size > PAGE_SIZE
*
* This function doesn't do "charge" nor css_get to new cgroup. It should be
* done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is
* true, this function does "uncharge" from old cgroup, but it doesn't if
* @uncharge is false, so a caller should do "uncharge".
*/
-
-static void __mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
- int charge_size)
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+ struct mem_cgroup *from, struct mem_cgroup *to,
+ bool uncharge, int charge_size)
{
int nr_pages = charge_size >> PAGE_SHIFT;
+ unsigned long flags;
+ int ret;

VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
- VM_BUG_ON(!page_is_cgroup_locked(pc));
- VM_BUG_ON(!PageCgroupUsed(pc));
- VM_BUG_ON(pc->mem_cgroup != from);
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ ret = -EBUSY;
+ if (charge_size > PAGE_SIZE && !PageTransHuge(pc->page))
+ goto out;
+
+ lock_page_cgroup(pc);
+
+ ret = -EINVAL;
+ if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
+ goto unlock;
+
+ move_lock_page_cgroup(pc, &flags);

if (PageCgroupFileMapped(pc)) {
/* Update mapped_file data for mem_cgroup */
@@ -2224,40 +2240,16 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
* garanteed that "to" is never removed. So, we don't check rmdir
* status here.
*/
-}
-
-/*
- * check whether the @pc is valid for moving account and call
- * __mem_cgroup_move_account()
- */
-static int mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to,
- bool uncharge, int charge_size)
-{
- int ret = -EINVAL;
- unsigned long flags;
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
- return -EBUSY;
-
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
- move_lock_page_cgroup(pc, &flags);
- __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
- move_unlock_page_cgroup(pc, &flags);
- ret = 0;
- }
+ move_unlock_page_cgroup(pc, &flags);
+ ret = 0;
+unlock:
unlock_page_cgroup(pc);
/*
* check events
*/
memcg_check_events(to, pc->page);
memcg_check_events(from, pc->page);
+out:
return ret;
}

--
1.7.4

2011-02-03 14:27:21

by Johannes Weiner

Subject: [patch 5/5] memcg: remove direct page_cgroup-to-page pointer

In struct page_cgroup, we have a full word for flags but only a few
are reserved. Use the remaining upper bits to encode, depending on
configuration, the node or the section, to enable page_cgroup-to-page
lookups without a direct pointer.

This saves a full word for every page in a system with memory cgroups
enabled.
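
For the !SPARSEMEM case the reverse lookup is plain pointer arithmetic on
the per-node array; a toy userspace model of it (names and sizes are the
sketch's own, not the kernel's):

#include <assert.h>
#include <stdlib.h>

struct page_cgroup { unsigned long flags; };

struct node_data {
	struct page_cgroup *node_page_cgroup;
	unsigned long node_start_pfn;
	unsigned long node_spanned_pages;
};

static struct node_data nodes[2];

/* what lookup_cgroup_page() does, with the nid taken from pc->flags */
static unsigned long lookup_pfn(int nid, struct page_cgroup *pc)
{
	struct node_data *pgdat = &nodes[nid];

	return pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
}

int main(void)
{
	struct page_cgroup *pc;

	nodes[1].node_start_pfn = 0x10000;
	nodes[1].node_spanned_pages = 1024;
	nodes[1].node_page_cgroup =
		calloc(nodes[1].node_spanned_pages, sizeof(struct page_cgroup));

	/* the page_cgroup of pfn 0x10000 + 42, as lookup_page_cgroup() has it */
	pc = nodes[1].node_page_cgroup + 42;
	assert(lookup_pfn(1, pc) == 0x10000 + 42);

	free(nodes[1].node_page_cgroup);
	return 0;
}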

Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/page_cgroup.h | 70 +++++++++++++++++++++++++++---------
kernel/bounds.c | 2 +
mm/memcontrol.c | 6 ++-
mm/page_cgroup.c | 85 +++++++++++++++++++++++++------------------
4 files changed, 108 insertions(+), 55 deletions(-)

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 6b63679..05d8618 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -1,8 +1,26 @@
#ifndef __LINUX_PAGE_CGROUP_H
#define __LINUX_PAGE_CGROUP_H

+enum {
+ /* flags for mem_cgroup */
+ PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */
+ PCG_CACHE, /* charged as cache */
+ PCG_USED, /* this object is in use. */
+ PCG_MIGRATION, /* under page migration */
+ /* flags for mem_cgroup and file and I/O status */
+ PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
+ PCG_FILE_MAPPED, /* page is accounted as "mapped" */
+ /* No lock in page_cgroup */
+ PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
+ __NR_PCG_FLAGS,
+};
+
+#ifndef __GENERATING_BOUNDS_H
+#include <generated/bounds.h>
+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#include <linux/bit_spinlock.h>
+
/*
* Page Cgroup can be considered as an extended mem_map.
* A page_cgroup page is associated with every page descriptor. The
@@ -13,7 +31,6 @@
struct page_cgroup {
unsigned long flags;
struct mem_cgroup *mem_cgroup;
- struct page *page;
struct list_head lru; /* per cgroup LRU list */
};

@@ -32,19 +49,7 @@ static inline void __init page_cgroup_init(void)
#endif

struct page_cgroup *lookup_page_cgroup(struct page *page);
-
-enum {
- /* flags for mem_cgroup */
- PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */
- PCG_CACHE, /* charged as cache */
- PCG_USED, /* this object is in use. */
- PCG_MIGRATION, /* under page migration */
- /* flags for mem_cgroup and file and I/O status */
- PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
- PCG_FILE_MAPPED, /* page is accounted as "mapped" */
- /* No lock in page_cgroup */
- PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
-};
+struct page *lookup_cgroup_page(struct page_cgroup *pc);

#define TESTPCGFLAG(uname, lname) \
static inline int PageCgroup##uname(struct page_cgroup *pc) \
@@ -117,6 +122,34 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
local_irq_restore(*flags);
}

+#ifdef CONFIG_SPARSEMEM
+#define PCG_ARRAYID_SHIFT SECTIONS_SHIFT
+#else
+#define PCG_ARRAYID_SHIFT NODES_SHIFT
+#endif
+
+#if (PCG_ARRAYID_SHIFT > BITS_PER_LONG - NR_PCG_FLAGS)
+#error Not enough space left in pc->flags to store page_cgroup array IDs
+#endif
+
+/* pc->flags: ARRAY-ID | FLAGS */
+
+#define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_SHIFT) - 1)
+
+#define PCG_ARRAYID_OFFSET (sizeof(unsigned long) * 8 - PCG_ARRAYID_SHIFT)
+
+static inline void set_page_cgroup_array_id(struct page_cgroup *pc,
+ unsigned long id)
+{
+ pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_OFFSET);
+ pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_OFFSET;
+}
+
+static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc)
+{
+ return (pc->flags >> PCG_ARRAYID_OFFSET) & PCG_ARRAYID_MASK;
+}
+
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
struct page_cgroup;

@@ -137,7 +170,7 @@ static inline void __init page_cgroup_init_flatmem(void)
{
}

-#endif
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR */

#include <linux/swap.h>

@@ -173,5 +206,8 @@ static inline void swap_cgroup_swapoff(int type)
return;
}

-#endif
-#endif
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */
+
+#endif /* !__GENERATING_BOUNDS_H */
+
+#endif /* __LINUX_PAGE_CGROUP_H */
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f2..0c9b862 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
+#include <linux/page_cgroup.h>

void foo(void)
{
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+ DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
/* End of constants */
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 998da06..4e10f46 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1054,7 +1054,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
if (unlikely(!PageCgroupUsed(pc)))
continue;

- page = pc->page;
+ page = lookup_cgroup_page(pc);
+ VM_BUG_ON(pc != lookup_page_cgroup(page));

if (unlikely(!PageLRU(page)))
continue;
@@ -3296,7 +3297,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
}
spin_unlock_irqrestore(&zone->lru_lock, flags);

- page = pc->page;
+ page = lookup_cgroup_page(pc);
+ VM_BUG_ON(pc != lookup_page_cgroup(page));

ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
if (ret == -ENOMEM)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 59a3cd4..e5f38e8 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,12 +11,11 @@
#include <linux/swapops.h>
#include <linux/kmemleak.h>

-static void __meminit
-__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
+static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
{
pc->flags = 0;
+ set_page_cgroup_array_id(pc, id);
pc->mem_cgroup = NULL;
- pc->page = pfn_to_page(pfn);
INIT_LIST_HEAD(&pc->lru);
}
static unsigned long total_usage;
@@ -43,6 +42,16 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
return base + offset;
}

+struct page *lookup_cgroup_page(struct page_cgroup *pc)
+{
+ unsigned long pfn;
+ pg_data_t *pgdat;
+
+ pgdat = NODE_DATA(page_cgroup_array_id(pc));
+ pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
+ return pfn_to_page(pfn);
+}
+
static int __init alloc_node_page_cgroup(int nid)
{
struct page_cgroup *base, *pc;
@@ -63,7 +72,7 @@ static int __init alloc_node_page_cgroup(int nid)
return -ENOMEM;
for (index = 0; index < nr_pages; index++) {
pc = base + index;
- __init_page_cgroup(pc, start_pfn + index);
+ init_page_cgroup(pc, nid);
}
NODE_DATA(nid)->node_page_cgroup = base;
total_usage += table_size;
@@ -105,46 +114,50 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
return section->page_cgroup + pfn;
}

+struct page *lookup_cgroup_page(struct page_cgroup *pc)
+{
+ struct mem_section *section;
+ unsigned long nr;
+
+ nr = page_cgroup_array_id(pc);
+ section = __nr_to_section(nr);
+ return pfn_to_page(pc - section->page_cgroup);
+}
+
/* __alloc_bootmem...() is protected by !slab_available() */
static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
- struct mem_section *section = __pfn_to_section(pfn);
struct page_cgroup *base, *pc;
+ struct mem_section *section;
unsigned long table_size;
+ unsigned long nr;
int nid, index;

- if (!section->page_cgroup) {
- nid = page_to_nid(pfn_to_page(pfn));
- table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
- VM_BUG_ON(!slab_is_available());
- if (node_state(nid, N_HIGH_MEMORY)) {
- base = kmalloc_node(table_size,
- GFP_KERNEL | __GFP_NOWARN, nid);
- if (!base)
- base = vmalloc_node(table_size, nid);
- } else {
- base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
- if (!base)
- base = vmalloc(table_size);
- }
- /*
- * The value stored in section->page_cgroup is (base - pfn)
- * and it does not point to the memory block allocated above,
- * causing kmemleak false positives.
- */
- kmemleak_not_leak(base);
+ nr = pfn_to_section_nr(pfn);
+ section = __nr_to_section(nr);
+
+ if (section->page_cgroup)
+ return 0;
+
+ nid = page_to_nid(pfn_to_page(pfn));
+ table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+ VM_BUG_ON(!slab_is_available());
+ if (node_state(nid, N_HIGH_MEMORY)) {
+ base = kmalloc_node(table_size,
+ GFP_KERNEL | __GFP_NOWARN, nid);
+ if (!base)
+ base = vmalloc_node(table_size, nid);
} else {
- /*
- * We don't have to allocate page_cgroup again, but
- * address of memmap may be changed. So, we have to initialize
- * again.
- */
- base = section->page_cgroup + pfn;
- table_size = 0;
- /* check address of memmap is changed or not. */
- if (base->page == pfn_to_page(pfn))
- return 0;
+ base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!base)
+ base = vmalloc(table_size);
}
+ /*
+ * The value stored in section->page_cgroup is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);

if (!base) {
printk(KERN_ERR "page cgroup allocation failure\n");
@@ -153,7 +166,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)

for (index = 0; index < PAGES_PER_SECTION; index++) {
pc = base + index;
- __init_page_cgroup(pc, pfn + index);
+ init_page_cgroup(pc, nr);
}

section->page_cgroup = base - pfn;
--
1.7.4

2011-02-03 14:27:16

by Johannes Weiner

Subject: [patch 4/5] memcg: condense page_cgroup-to-page lookup points

The per-cgroup LRU lists string up 'struct page_cgroup's. To get from
those structures to the page they represent, a lookup is required.
Currently, the lookup is done through a direct pointer in struct
page_cgroup, so a lot of functions down the callchain do this lookup
by themselves instead of receiving the page pointer from their
callers.

The next patch removes this pointer, however, and the lookup is no
longer that straight-forward. In preparation for that, this patch
only leaves the non-optional lookups when coming directly from the LRU
list and passes the page down the stack.

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/memcontrol.c | 38 +++++++++++++++++++++++---------------
1 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5eb0dc2..998da06 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1051,9 +1051,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
if (scan >= nr_to_scan)
break;

- page = pc->page;
if (unlikely(!PageCgroupUsed(pc)))
continue;
+
+ page = pc->page;
+
if (unlikely(!PageLRU(page)))
continue;

@@ -2082,6 +2084,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
}

static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+ struct page *page,
struct page_cgroup *pc,
enum charge_type ctype,
int page_size)
@@ -2128,7 +2131,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
*/
- memcg_check_events(mem, pc->page);
+ memcg_check_events(mem, page);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2175,6 +2178,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)

/**
* mem_cgroup_move_account - move account of the page
+ * @page: the page
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
@@ -2190,7 +2194,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
* true, this function does "uncharge" from old cgroup, but it doesn't if
* @uncharge is false, so a caller should do "uncharge".
*/
-static int mem_cgroup_move_account(struct page_cgroup *pc,
+static int mem_cgroup_move_account(struct page *page, struct page_cgroup *pc,
struct mem_cgroup *from, struct mem_cgroup *to,
bool uncharge, int charge_size)
{
@@ -2199,7 +2203,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
int ret;

VM_BUG_ON(from == to);
- VM_BUG_ON(PageLRU(pc->page));
+ VM_BUG_ON(PageLRU(page));
/*
* The page is isolated from LRU. So, collapse function
* will not handle this page. But page splitting can happen.
@@ -2207,7 +2211,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
* hold it.
*/
ret = -EBUSY;
- if (charge_size > PAGE_SIZE && !PageTransHuge(pc->page))
+ if (charge_size > PAGE_SIZE && !PageTransHuge(page))
goto out;

lock_page_cgroup(pc);
@@ -2247,8 +2251,8 @@ unlock:
/*
* check events
*/
- memcg_check_events(to, pc->page);
- memcg_check_events(from, pc->page);
+ memcg_check_events(to, page);
+ memcg_check_events(from, page);
out:
return ret;
}
@@ -2257,11 +2261,11 @@ out:
* move charges to its parent.
*/

-static int mem_cgroup_move_parent(struct page_cgroup *pc,
+static int mem_cgroup_move_parent(struct page *page,
+ struct page_cgroup *pc,
struct mem_cgroup *child,
gfp_t gfp_mask)
{
- struct page *page = pc->page;
struct cgroup *cg = child->css.cgroup;
struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent;
@@ -2291,7 +2295,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
if (page_size > PAGE_SIZE)
flags = compound_lock_irqsave(page);

- ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
+ ret = mem_cgroup_move_account(page, pc, child, parent, true, page_size);
if (ret)
mem_cgroup_cancel_charge(parent, page_size);

@@ -2337,7 +2341,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
if (ret || !mem)
return ret;

- __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
+ __mem_cgroup_commit_charge(mem, page, pc, ctype, page_size);
return 0;
}

@@ -2473,7 +2477,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
- __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
+ __mem_cgroup_commit_charge(ptr, page, pc, ctype, PAGE_SIZE);
mem_cgroup_lru_add_after_commit_swapcache(page);
/*
* Now swap is on-memory. This means this page may be
@@ -2926,7 +2930,7 @@ int mem_cgroup_prepare_migration(struct page *page,
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
+ __mem_cgroup_commit_charge(mem, page, pc, ctype, PAGE_SIZE);
return ret;
}

@@ -3275,6 +3279,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
loop += 256;
busy = NULL;
while (loop--) {
+ struct page *page;
+
ret = 0;
spin_lock_irqsave(&zone->lru_lock, flags);
if (list_empty(list)) {
@@ -3290,7 +3296,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
}
spin_unlock_irqrestore(&zone->lru_lock, flags);

- ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
+ page = pc->page;
+
+ ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
if (ret == -ENOMEM)
break;

@@ -4907,7 +4915,7 @@ retry:
if (isolate_lru_page(page))
goto put;
pc = lookup_page_cgroup(page);
- if (!mem_cgroup_move_account(pc,
+ if (!mem_cgroup_move_account(page, pc,
mc.from, mc.to, false, PAGE_SIZE)) {
mc.precharge--;
/* we uncharge from mc.from later. */
--
1.7.4

2011-02-04 00:07:52

by Kamezawa Hiroyuki

Subject: Re: [patch 1/5] memcg: no uncharged pages reach page_cgroup_zoneinfo

On Thu, 3 Feb 2011 15:26:02 +0100
Johannes Weiner <[email protected]> wrote:

> All callsites check PCG_USED before passing pc->mem_cgroup, so the
> latter is never NULL.
>
> Signed-off-by: Johannes Weiner <[email protected]>

Acked-by: KAMEZAWA Hiroyuki <[email protected]>

I want BUG_ON() here.


> ---
> mm/memcontrol.c | 3 ---
> 1 files changed, 0 insertions(+), 3 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e071d7e..85b4b5a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -370,9 +370,6 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
> int nid = page_cgroup_nid(pc);
> int zid = page_cgroup_zid(pc);
>
> - if (!mem)
> - return NULL;
> -
> return mem_cgroup_zoneinfo(mem, nid, zid);
> }
>
> --
> 1.7.4
>
>

2011-02-04 00:09:14

by Kamezawa Hiroyuki

Subject: Re: [patch 2/5] memcg: change page_cgroup_zoneinfo signature

On Thu, 3 Feb 2011 15:26:03 +0100
Johannes Weiner <[email protected]> wrote:

> Instead of passing a whole struct page_cgroup to this function, let it
> take only what it really needs from it: the struct mem_cgroup and the
> page.
>
> This has the advantage that reading pc->mem_cgroup is now done at the
> same place where the ordering rules for this pointer are enforced and
> explained.
>
> It is also in preparation for removing the pc->page backpointer.
>
> Signed-off-by: Johannes Weiner <[email protected]>

Acked-by: KAMEZAWA Hiroyuki <[email protected]>

2011-02-04 00:13:47

by Kamezawa Hiroyuki

Subject: Re: [patch 3/5] memcg: fold __mem_cgroup_move_account into caller

On Thu, 3 Feb 2011 15:26:04 +0100
Johannes Weiner <[email protected]> wrote:

> It is one logical function, no need to have it split up.
>
> Also, get rid of some checks from the inner function that ensured the
> sanity of the outer function.
>
> Signed-off-by: Johannes Weiner <[email protected]>

I think there was a reason to split them... but it seems I have forgotten it.

The patch seems good.
Acked-by: KAMEZAWA Hiroyuki <[email protected]>

2011-02-04 00:17:08

by Kamezawa Hiroyuki

Subject: Re: [patch 4/5] memcg: condense page_cgroup-to-page lookup points

On Thu, 3 Feb 2011 15:26:05 +0100
Johannes Weiner <[email protected]> wrote:

> The per-cgroup LRU lists string up 'struct page_cgroup's. To get from
> those structures to the page they represent, a lookup is required.
> Currently, the lookup is done through a direct pointer in struct
> page_cgroup, so a lot of functions down the callchain do this lookup
> by themselves instead of receiving the page pointer from their
> callers.
>
> The next patch removes this pointer, however, and the lookup is no
> longer that straight-forward. In preparation for that, this patch
> only leaves the non-optional lookups when coming directly from the LRU
> list and passes the page down the stack.
>
> Signed-off-by: Johannes Weiner <[email protected]>

Maybe good.

Acked-by: KAMEZAWA Hiroyuki <[email protected]>


2011-02-04 00:25:55

by Kamezawa Hiroyuki

Subject: Re: [patch 5/5] memcg: remove direct page_cgroup-to-page pointer

On Thu, 3 Feb 2011 15:26:06 +0100
Johannes Weiner <[email protected]> wrote:

> In struct page_cgroup, we have a full word for flags but only a few
> are reserved. Use the remaining upper bits to encode, depending on
> configuration, the node or the section, to enable page_cgroup-to-page
> lookups without a direct pointer.
>
> This saves a full word for every page in a system with memory cgroups
> enabled.
>
> Signed-off-by: Johannes Weiner <[email protected]>

In general,

Acked-by: KAMEZAWA Hiroyuki <[email protected]>

Thank you. A few questions below.



> ---
> include/linux/page_cgroup.h | 70 +++++++++++++++++++++++++++---------
> kernel/bounds.c | 2 +
> mm/memcontrol.c | 6 ++-
> mm/page_cgroup.c | 85 +++++++++++++++++++++++++------------------
> 4 files changed, 108 insertions(+), 55 deletions(-)
>
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 6b63679..05d8618 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -1,8 +1,26 @@
> #ifndef __LINUX_PAGE_CGROUP_H
> #define __LINUX_PAGE_CGROUP_H
>
> +enum {
> + /* flags for mem_cgroup */
> + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */
> + PCG_CACHE, /* charged as cache */
> + PCG_USED, /* this object is in use. */
> + PCG_MIGRATION, /* under page migration */
> + /* flags for mem_cgroup and file and I/O status */
> + PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
> + PCG_FILE_MAPPED, /* page is accounted as "mapped" */
> + /* No lock in page_cgroup */
> + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
> + __NR_PCG_FLAGS,
> +};
> +
> +#ifndef __GENERATING_BOUNDS_H
> +#include <generated/bounds.h>
> +
> #ifdef CONFIG_CGROUP_MEM_RES_CTLR
> #include <linux/bit_spinlock.h>
> +
> /*
> * Page Cgroup can be considered as an extended mem_map.
> * A page_cgroup page is associated with every page descriptor. The
> @@ -13,7 +31,6 @@
> struct page_cgroup {
> unsigned long flags;
> struct mem_cgroup *mem_cgroup;
> - struct page *page;
> struct list_head lru; /* per cgroup LRU list */
> };
>
> @@ -32,19 +49,7 @@ static inline void __init page_cgroup_init(void)
> #endif
>
> struct page_cgroup *lookup_page_cgroup(struct page *page);
> -
> -enum {
> - /* flags for mem_cgroup */
> - PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */
> - PCG_CACHE, /* charged as cache */
> - PCG_USED, /* this object is in use. */
> - PCG_MIGRATION, /* under page migration */
> - /* flags for mem_cgroup and file and I/O status */
> - PCG_MOVE_LOCK, /* For race between move_account v.s. following bits */
> - PCG_FILE_MAPPED, /* page is accounted as "mapped" */
> - /* No lock in page_cgroup */
> - PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */
> -};
> +struct page *lookup_cgroup_page(struct page_cgroup *pc);
>
> #define TESTPCGFLAG(uname, lname) \
> static inline int PageCgroup##uname(struct page_cgroup *pc) \
> @@ -117,6 +122,34 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
> local_irq_restore(*flags);
> }
>
> +#ifdef CONFIG_SPARSEMEM
> +#define PCG_ARRAYID_SHIFT SECTIONS_SHIFT
> +#else
> +#define PCG_ARRAYID_SHIFT NODES_SHIFT
> +#endif
> +
> +#if (PCG_ARRAYID_SHIFT > BITS_PER_LONG - NR_PCG_FLAGS)
> +#error Not enough space left in pc->flags to store page_cgroup array IDs
> +#endif
> +
> +/* pc->flags: ARRAY-ID | FLAGS */
> +
> +#define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_SHIFT) - 1)
> +
> +#define PCG_ARRAYID_OFFSET (sizeof(unsigned long) * 8 - PCG_ARRAYID_SHIFT)
> +
> +static inline void set_page_cgroup_array_id(struct page_cgroup *pc,
> + unsigned long id)
> +{
> + pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_OFFSET);
> + pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_OFFSET;
> +}
> +
> +static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc)
> +{
> + return (pc->flags >> PCG_ARRAYID_OFFSET) & PCG_ARRAYID_MASK;
> +}
> +

If the function for looking up a page from a page_cgroup isn't inline,
I think these functions should be static in page_cgroup.c



> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> struct page_cgroup;
>
> @@ -137,7 +170,7 @@ static inline void __init page_cgroup_init_flatmem(void)
> {
> }
>
> -#endif
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
>
> #include <linux/swap.h>
>
> @@ -173,5 +206,8 @@ static inline void swap_cgroup_swapoff(int type)
> return;
> }
>
> -#endif
> -#endif
> +#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */
> +
> +#endif /* !__GENERATING_BOUNDS_H */
> +
> +#endif /* __LINUX_PAGE_CGROUP_H */
> diff --git a/kernel/bounds.c b/kernel/bounds.c
> index 98a51f2..0c9b862 100644
> --- a/kernel/bounds.c
> +++ b/kernel/bounds.c
> @@ -9,11 +9,13 @@
> #include <linux/page-flags.h>
> #include <linux/mmzone.h>
> #include <linux/kbuild.h>
> +#include <linux/page_cgroup.h>
>
> void foo(void)
> {
> /* The enum constants to put into include/generated/bounds.h */
> DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
> DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
> + DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
> /* End of constants */
> }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 998da06..4e10f46 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1054,7 +1054,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> if (unlikely(!PageCgroupUsed(pc)))
> continue;
>
> - page = pc->page;
> + page = lookup_cgroup_page(pc);
> + VM_BUG_ON(pc != lookup_page_cgroup(page));

If you're afraid of corruption in the ->flags bits, checking this in page_cgroup.c
is better.


Anyway, seems great.

Thanks,
-Kame

>
> if (unlikely(!PageLRU(page)))
> continue;
> @@ -3296,7 +3297,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
> }
> spin_unlock_irqrestore(&zone->lru_lock, flags);
>
> - page = pc->page;
> + page = lookup_cgroup_page(pc);
> + VM_BUG_ON(pc != lookup_page_cgroup(page));
>
> ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
> if (ret == -ENOMEM)
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 59a3cd4..e5f38e8 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -11,12 +11,11 @@
> #include <linux/swapops.h>
> #include <linux/kmemleak.h>
>
> -static void __meminit
> -__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
> +static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
> {
> pc->flags = 0;
> + set_page_cgroup_array_id(pc, id);
> pc->mem_cgroup = NULL;
> - pc->page = pfn_to_page(pfn);
> INIT_LIST_HEAD(&pc->lru);
> }
> static unsigned long total_usage;
> @@ -43,6 +42,16 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
> return base + offset;
> }
>
> +struct page *lookup_cgroup_page(struct page_cgroup *pc)
> +{
> + unsigned long pfn;
> + pg_data_t *pgdat;
> +
> + pgdat = NODE_DATA(page_cgroup_array_id(pc));
> + pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
> + return pfn_to_page(pfn);
> +}
> +
> static int __init alloc_node_page_cgroup(int nid)
> {
> struct page_cgroup *base, *pc;
> @@ -63,7 +72,7 @@ static int __init alloc_node_page_cgroup(int nid)
> return -ENOMEM;
> for (index = 0; index < nr_pages; index++) {
> pc = base + index;
> - __init_page_cgroup(pc, start_pfn + index);
> + init_page_cgroup(pc, nid);
> }
> NODE_DATA(nid)->node_page_cgroup = base;
> total_usage += table_size;
> @@ -105,46 +114,50 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
> return section->page_cgroup + pfn;
> }
>
> +struct page *lookup_cgroup_page(struct page_cgroup *pc)
> +{
> + struct mem_section *section;
> + unsigned long nr;
> +
> + nr = page_cgroup_array_id(pc);
> + section = __nr_to_section(nr);
> + return pfn_to_page(pc - section->page_cgroup);
> +}
> +
> /* __alloc_bootmem...() is protected by !slab_available() */
> static int __init_refok init_section_page_cgroup(unsigned long pfn)
> {
> - struct mem_section *section = __pfn_to_section(pfn);
> struct page_cgroup *base, *pc;
> + struct mem_section *section;
> unsigned long table_size;
> + unsigned long nr;
> int nid, index;
>
> - if (!section->page_cgroup) {
> - nid = page_to_nid(pfn_to_page(pfn));
> - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> - VM_BUG_ON(!slab_is_available());
> - if (node_state(nid, N_HIGH_MEMORY)) {
> - base = kmalloc_node(table_size,
> - GFP_KERNEL | __GFP_NOWARN, nid);
> - if (!base)
> - base = vmalloc_node(table_size, nid);
> - } else {
> - base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
> - if (!base)
> - base = vmalloc(table_size);
> - }
> - /*
> - * The value stored in section->page_cgroup is (base - pfn)
> - * and it does not point to the memory block allocated above,
> - * causing kmemleak false positives.
> - */
> - kmemleak_not_leak(base);
> + nr = pfn_to_section_nr(pfn);
> + section = __nr_to_section(nr);
> +
> + if (section->page_cgroup)
> + return 0;
> +
> + nid = page_to_nid(pfn_to_page(pfn));
> + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> + VM_BUG_ON(!slab_is_available());
> + if (node_state(nid, N_HIGH_MEMORY)) {
> + base = kmalloc_node(table_size,
> + GFP_KERNEL | __GFP_NOWARN, nid);
> + if (!base)
> + base = vmalloc_node(table_size, nid);
> } else {
> - /*
> - * We don't have to allocate page_cgroup again, but
> - * address of memmap may be changed. So, we have to initialize
> - * again.
> - */
> - base = section->page_cgroup + pfn;
> - table_size = 0;
> - /* check address of memmap is changed or not. */
> - if (base->page == pfn_to_page(pfn))
> - return 0;
> + base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
> + if (!base)
> + base = vmalloc(table_size);
> }
> + /*
> + * The value stored in section->page_cgroup is (base - pfn)
> + * and it does not point to the memory block allocated above,
> + * causing kmemleak false positives.
> + */
> + kmemleak_not_leak(base);
>
> if (!base) {
> printk(KERN_ERR "page cgroup allocation failure\n");
> @@ -153,7 +166,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
>
> for (index = 0; index < PAGES_PER_SECTION; index++) {
> pc = base + index;
> - __init_page_cgroup(pc, pfn + index);
> + init_page_cgroup(pc, nr);
> }
>
> section->page_cgroup = base - pfn;
> --
> 1.7.4
>

2011-02-04 01:02:33

by Daisuke Nishimura

Subject: Re: [patch 3/5] memcg: fold __mem_cgroup_move_account into caller

On Fri, 4 Feb 2011 09:07:38 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:

> On Thu, 3 Feb 2011 15:26:04 +0100
> Johannes Weiner <[email protected]> wrote:
>
> > It is one logical function, no need to have it split up.
> >
> > Also, get rid of some checks from the inner function that ensured the
> > sanity of the outer function.
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
>
> I think there was a reason to split them... but it seems I have forgotten it.
>
IIRC, it's me who split them up in commit 57f9fd7d.

But the purpose of that commit was to clean up move_parent() and move_account()
so that move_account() could be used in move_charge() later.
So, there was no technical reason why I split move_account() and __move_account().
It was just because I liked to make each function do one thing: check validity
and actually move the accounting.

Anyway, I don't have any objection to folding them. page_is_cgroup_locked()
can be removed by this change.

Acked-by: Daisuke Nishimura <[email protected]>

Thanks,
Daisuke Nishimura.

2011-02-04 04:16:07

by Balbir Singh

Subject: Re: [patch 1/5] memcg: no uncharged pages reach page_cgroup_zoneinfo

On Thu, Feb 3, 2011 at 7:56 PM, Johannes Weiner <[email protected]> wrote:
> All callsites check PCG_USED before passing pc->mem_cgroup, so the
> latter is never NULL.
>
> Signed-off-by: Johannes Weiner <[email protected]>

Acked-by: Balbir Singh <[email protected]>

2011-02-04 09:27:07

by Johannes Weiner

Subject: Re: [patch 1/5] memcg: no uncharged pages reach page_cgroup_zoneinfo

On Fri, Feb 04, 2011 at 09:01:45AM +0900, KAMEZAWA Hiroyuki wrote:
> On Thu, 3 Feb 2011 15:26:02 +0100
> Johannes Weiner <[email protected]> wrote:
>
> > All callsites check PCG_USED before passing pc->mem_cgroup, so the
> > latter is never NULL.
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
>
> Acked-by: KAMEZAWA Hiroyuki <[email protected]>

Thank you!

> I want BUG_ON() here.

I thought about it too at first. But look at the callsites, all but
one of them do not even expect this function to return NULL, so if
this condition had ever been true, we would have seen crashes in the
callsites.

The only caller that checks for NULL is
mem_cgroup_get_reclaim_stat_from_page() and I propose to remove that
as well; patch attached.

Do you insist on the BUG_ON?

---
Subject: memcg: page_cgroup_zoneinfo never returns NULL

For a page charged to a memcg, there is always valid memcg per-zone
info.

Signed-off-by: Johannes Weiner <[email protected]>
---
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4a4483d..5f974b3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1017,9 +1017,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
- if (!mz)
- return NULL;
-
return &mz->reclaim_stat;
}

2011-02-04 09:29:40

by Kamezawa Hiroyuki

Subject: Re: [patch 1/5] memcg: no uncharged pages reach page_cgroup_zoneinfo

On Fri, 4 Feb 2011 10:26:50 +0100
Johannes Weiner <[email protected]> wrote:

> On Fri, Feb 04, 2011 at 09:01:45AM +0900, KAMEZAWA Hiroyuki wrote:
> > On Thu, 3 Feb 2011 15:26:02 +0100
> > Johannes Weiner <[email protected]> wrote:
> >
> > > All callsites check PCG_USED before passing pc->mem_cgroup, so the
> > > latter is never NULL.
> > >
> > > Signed-off-by: Johannes Weiner <[email protected]>
> >
> > Acked-by: KAMEZAWA Hiroyuki <[email protected]>
>
> Thank you!
>
> > I want BUG_ON() here.
>
> I thought about it too at first. But look at the callsites, all but
> one of them do not even expect this function to return NULL, so if
> this condition had ever been true, we would have seen crashes in the
> callsites.
>

Hmm ok.
-Kame

> The only caller that checks for NULL is
> mem_cgroup_get_reclaim_stat_from_page() and I propose to remove that
> as well; patch attached.
>
> Do you insist on the BUG_ON?
>
> ---
> Subject: memcg: page_cgroup_zoneinfo never returns NULL
>
> For a page charged to a memcg, there is always valid memcg per-zone
> info.
>
> Signed-off-by: Johannes Weiner <[email protected]>
> ---
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 4a4483d..5f974b3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1017,9 +1017,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
> /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
> smp_rmb();
> mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
> - if (!mz)
> - return NULL;
> -
> return &mz->reclaim_stat;
> }
>
>

2011-02-04 09:51:21

by Johannes Weiner

Subject: Re: [patch 5/5] memcg: remove direct page_cgroup-to-page pointer

On Fri, Feb 04, 2011 at 09:19:49AM +0900, KAMEZAWA Hiroyuki wrote:
> On Thu, 3 Feb 2011 15:26:06 +0100
> Johannes Weiner <[email protected]> wrote:
>
> > In struct page_cgroup, we have a full word for flags but only a few
> > are reserved. Use the remaining upper bits to encode, depending on
> > configuration, the node or the section, to enable page_cgroup-to-page
> > lookups without a direct pointer.
> >
> > This saves a full word for every page in a system with memory cgroups
> > enabled.
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
>
> In general,
>
> Acked-by: KAMEZAWA Hiroyuki <[email protected]>

Thanks!

> A few questions below.

[snip]

> > @@ -117,6 +122,34 @@ static inline void move_unlock_page_cgroup(struct page_cgroup *pc,
> > local_irq_restore(*flags);
> > }
> >
> > +#ifdef CONFIG_SPARSEMEM
> > +#define PCG_ARRAYID_SHIFT SECTIONS_SHIFT
> > +#else
> > +#define PCG_ARRAYID_SHIFT NODES_SHIFT
> > +#endif
> > +
> > +#if (PCG_ARRAYID_SHIFT > BITS_PER_LONG - NR_PCG_FLAGS)
> > +#error Not enough space left in pc->flags to store page_cgroup array IDs
> > +#endif
> > +
> > +/* pc->flags: ARRAY-ID | FLAGS */
> > +
> > +#define PCG_ARRAYID_MASK ((1UL << PCG_ARRAYID_SHIFT) - 1)
> > +
> > +#define PCG_ARRAYID_OFFSET (sizeof(unsigned long) * 8 - PCG_ARRAYID_SHIFT)
> > +
> > +static inline void set_page_cgroup_array_id(struct page_cgroup *pc,
> > + unsigned long id)
> > +{
> > + pc->flags &= ~(PCG_ARRAYID_MASK << PCG_ARRAYID_OFFSET);
> > + pc->flags |= (id & PCG_ARRAYID_MASK) << PCG_ARRAYID_OFFSET;
> > +}
> > +
> > +static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc)
> > +{
> > + return (pc->flags >> PCG_ARRAYID_OFFSET) & PCG_ARRAYID_MASK;
> > +}
> > +
>
> If the function for looking up a page from a page_cgroup isn't inline,
> I think these functions should be static in page_cgroup.c

I stole all of this from mm.h which does the same for page flags. Of
course, in their case, most of them are used in more than one file,
but some of them are not and it still has merit to keep related things
together.

Should I just move the functions? I would like to keep them together
with the _MASK and _OFFSET definitions, so should I move them too?
Suggestion welcome ;)

> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 998da06..4e10f46 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1054,7 +1054,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
> > if (unlikely(!PageCgroupUsed(pc)))
> > continue;
> >
> > - page = pc->page;
> > + page = lookup_cgroup_page(pc);
> > + VM_BUG_ON(pc != lookup_page_cgroup(page));
>
> If you're afraid of corruption in the ->flags bits, checking this in page_cgroup.c
> is better.

I thought I'd keep them visible so we could remove them in a cycle or
two and I am sure they would get lost in mm/page_cgroup.c. Who looks
at this file regularly? :)

But OTOH, they are under CONFIG_DEBUG_VM and corruption does not get
less likely as more code changes around pc->flags.

So I agree, they should be moved to lookup_page_cgroup() and be kept
indefinitely.

Thanks for your review.

Hannes

---
Subject: memcg: remove direct page_cgroup-to-page pointer fix

Move pc -> page linkage checks to the lookup function itself. I no
longer plan on removing them again, so there is no point in keeping
them visible at the callsites.

Signed-off-by: Johannes Weiner <[email protected]>
---

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4e10f46..8438988 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1055,7 +1055,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
continue;

page = lookup_cgroup_page(pc);
- VM_BUG_ON(pc != lookup_page_cgroup(page));

if (unlikely(!PageLRU(page)))
continue;
@@ -3298,7 +3297,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
spin_unlock_irqrestore(&zone->lru_lock, flags);

page = lookup_cgroup_page(pc);
- VM_BUG_ON(pc != lookup_page_cgroup(page));

ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
if (ret == -ENOMEM)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index e5f38e8..6c3f7a6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -45,11 +45,14 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
unsigned long pfn;
+ struct page *page;
pg_data_t *pgdat;

pgdat = NODE_DATA(page_cgroup_array_id(pc));
pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
- return pfn_to_page(pfn);
+ page = pfn_to_page(pfn);
+ VM_BUG_ON(pc != lookup_page_cgroup(page));
+ return page;
}

static int __init alloc_node_page_cgroup(int nid)
@@ -117,11 +120,14 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
struct page *lookup_cgroup_page(struct page_cgroup *pc)
{
struct mem_section *section;
+ struct page *page;
unsigned long nr;

nr = page_cgroup_array_id(pc);
section = __nr_to_section(nr);
- return pfn_to_page(pc - section->page_cgroup);
+ page = pfn_to_page(pc - section->page_cgroup);
+ VM_BUG_ON(pc != lookup_page_cgroup(page));
+ return page;
}

/* __alloc_bootmem...() is protected by !slab_available() */

2011-02-03 15:02:54

by Balbir Singh

Subject: Re: memcg: save 20% of per-page memcg memory overhead

* Johannes Weiner <[email protected]> [2011-02-03 15:26:01]:

> This patch series removes the direct page pointer from struct
> page_cgroup, which saves 20% of per-page memcg memory overhead (Fedora
> and Ubuntu enable memcg by default, openSUSE apparently too).
>
> The node id or section number is encoded in the remaining free bits of
> pc->flags which allows calculating the corresponding page without the
> extra pointer.
>
> I ran what I think is a worst-case microbenchmark that just cats a
> large sparse file to /dev/null, because it means that walking the LRU
> list on behalf of per-cgroup reclaim and looking up pages from
> page_cgroups is happening constantly and at a high rate. But it made
> no measurable difference. A profile reported a 0.11% share of the new
> lookup_cgroup_page() function in this benchmark.

Wow! Definitely worth a deeper look.

>
> Hannes

--
Three Cheers,
Balbir