Memory cgroups used to have 5 per-page pointers. To allow users to
disable that amount of overhead during runtime, those pointers were
allocated in a separate array, with a translation layer between them
and struct page.
There is now only one page pointer remaining: the memcg pointer that
indicates which cgroup the page is associated with when charged. The
complexity of runtime allocation and the runtime translation overhead
are no longer justified to save that *potential* 0.19% of memory. With
CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
after the page->private member and doesn't even increase struct page,
and then this patch actually saves space. Remaining users that care
can still compile their kernels without CONFIG_MEMCG.
text data bss dec hex filename
8828345 1725264 983040 11536649 b00909 vmlinux.old
8827425 1725264 966656 11519345 afc571 vmlinux.new
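[Illustrative aside, not part of the patch: a self-contained userspace sketch of the
access-path change. The types below are simplified stand-ins for the kernel structures,
and old_lookup()/new_lookup() are made-up names; only the shape of the old pfn-indexed
page_cgroup translation versus the new direct page->mem_cgroup access is meant to mirror
the patch.]
#include <stdio.h>
#include <stdlib.h>

struct mem_cgroup { int id; };                         /* stand-in */

/* Old scheme: the memcg pointer lives in a separately allocated array,
 * reached through a pfn -> page_cgroup translation step. */
struct page_cgroup { struct mem_cgroup *mem_cgroup; };
static struct page_cgroup *node_page_cgroup;           /* allocated at boot */

/* New scheme: the pointer sits directly in the page descriptor. */
struct page {
        unsigned long private;                         /* placeholder member */
        struct mem_cgroup *mem_cgroup;                 /* what this patch adds */
};

static struct mem_cgroup *old_lookup(unsigned long pfn)
{
        return node_page_cgroup[pfn].mem_cgroup;       /* extra indirection */
}

static struct mem_cgroup *new_lookup(struct page *page)
{
        return page->mem_cgroup;                       /* direct access */
}

int main(void)
{
        struct mem_cgroup memcg = { .id = 1 };
        struct page page = { .mem_cgroup = &memcg };

        node_page_cgroup = calloc(1, sizeof(*node_page_cgroup));
        if (!node_page_cgroup)
                return 1;
        node_page_cgroup[0].mem_cgroup = &memcg;

        printf("old lookup: memcg %d, new lookup: memcg %d\n",
               old_lookup(0)->id, new_lookup(&page)->id);
        free(node_page_cgroup);
        return 0;
}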
Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 6 +-
include/linux/mm_types.h | 5 +
include/linux/mmzone.h | 12 --
include/linux/page_cgroup.h | 53 --------
init/main.c | 7 -
mm/memcontrol.c | 124 +++++------------
mm/page_alloc.c | 2 -
mm/page_cgroup.c | 319 --------------------------------------------
8 files changed, 41 insertions(+), 487 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d4575a1d6e99..dafba59b31b4 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,7 +25,6 @@
#include <linux/jump_label.h>
struct mem_cgroup;
-struct page_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
@@ -466,8 +465,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
* memcg_kmem_uncharge_pages: uncharge pages from memcg
* @page: pointer to struct page being freed
* @order: allocation order.
- *
- * there is no need to specify memcg here, since it is embedded in page_cgroup
*/
static inline void
memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -484,8 +481,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order)
*
* Needs to be called after memcg_kmem_newpage_charge, regardless of success or
* failure of the allocation. if @page is NULL, this function will revert the
- * charges. Otherwise, it will commit the memcg given by @memcg to the
- * corresponding page_cgroup.
+ * charges. Otherwise, it will commit @page to @memcg.
*/
static inline void
memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index de183328abb0..57e47dffbdda 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -23,6 +23,7 @@
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
struct address_space;
+struct mem_cgroup;
#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
@@ -168,6 +169,10 @@ struct page {
struct page *first_page; /* Compound tail pages */
};
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *mem_cgroup;
+#endif
+
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 48bf12ef6620..de32d9344446 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -713,9 +713,6 @@ typedef struct pglist_data {
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
-#ifdef CONFIG_MEMCG
- struct page_cgroup *node_page_cgroup;
-#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
struct bootmem_data *bdata;
@@ -1069,7 +1066,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
struct page;
-struct page_cgroup;
struct mem_section {
/*
* This is, logically, a pointer to an array of struct
@@ -1087,14 +1083,6 @@ struct mem_section {
/* See declaration of similar field in struct zone */
unsigned long *pageblock_flags;
-#ifdef CONFIG_MEMCG
- /*
- * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
- * section. (see memcontrol.h/page_cgroup.h about this.)
- */
- struct page_cgroup *page_cgroup;
- unsigned long pad;
-#endif
/*
* WARNING: mem_section must be a power-of-2 in size for the
* calculation and use of SECTION_ROOT_MASK to make sense.
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 1289be6b436c..65be35785c86 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -1,59 +1,6 @@
#ifndef __LINUX_PAGE_CGROUP_H
#define __LINUX_PAGE_CGROUP_H
-struct pglist_data;
-
-#ifdef CONFIG_MEMCG
-struct mem_cgroup;
-
-/*
- * Page Cgroup can be considered as an extended mem_map.
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- * All page cgroups are allocated at boot or memory hotplug event,
- * then the page cgroup for pfn always exists.
- */
-struct page_cgroup {
- struct mem_cgroup *mem_cgroup;
-};
-
-extern void pgdat_page_cgroup_init(struct pglist_data *pgdat);
-
-#ifdef CONFIG_SPARSEMEM
-static inline void page_cgroup_init_flatmem(void)
-{
-}
-extern void page_cgroup_init(void);
-#else
-extern void page_cgroup_init_flatmem(void);
-static inline void page_cgroup_init(void)
-{
-}
-#endif
-
-struct page_cgroup *lookup_page_cgroup(struct page *page);
-
-#else /* !CONFIG_MEMCG */
-struct page_cgroup;
-
-static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
-{
-}
-
-static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
-{
- return NULL;
-}
-
-static inline void page_cgroup_init(void)
-{
-}
-
-static inline void page_cgroup_init_flatmem(void)
-{
-}
-#endif /* CONFIG_MEMCG */
-
#include <linux/swap.h>
#ifdef CONFIG_MEMCG_SWAP
diff --git a/init/main.c b/init/main.c
index c4912d9abaee..a091bbcadd33 100644
--- a/init/main.c
+++ b/init/main.c
@@ -51,7 +51,6 @@
#include <linux/mempolicy.h>
#include <linux/key.h>
#include <linux/buffer_head.h>
-#include <linux/page_cgroup.h>
#include <linux/debug_locks.h>
#include <linux/debugobjects.h>
#include <linux/lockdep.h>
@@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void)
*/
static void __init mm_init(void)
{
- /*
- * page_cgroup requires contiguous pages,
- * bigger than MAX_ORDER unless SPARSEMEM.
- */
- page_cgroup_init_flatmem();
mem_init();
kmem_cache_init();
percpu_init_late();
@@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void)
initrd_start = 0;
}
#endif
- page_cgroup_init();
debug_objects_mem_init();
kmemleak_init();
setup_per_cpu_pageset();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 19bac2ce827c..dc5e0abb18cb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1274,7 +1274,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
- struct page_cgroup *pc;
struct lruvec *lruvec;
if (mem_cgroup_disabled()) {
@@ -1282,8 +1281,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
goto out;
}
- pc = lookup_page_cgroup(page);
- memcg = pc->mem_cgroup;
+ memcg = page->mem_cgroup;
/*
* Swapcache readahead pages are added to the LRU - and
* possibly migrated - before they are charged.
@@ -2020,16 +2018,13 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
unsigned long *flags)
{
struct mem_cgroup *memcg;
- struct page_cgroup *pc;
rcu_read_lock();
if (mem_cgroup_disabled())
return NULL;
-
- pc = lookup_page_cgroup(page);
again:
- memcg = pc->mem_cgroup;
+ memcg = page->mem_cgroup;
if (unlikely(!memcg))
return NULL;
@@ -2038,7 +2033,7 @@ again:
return memcg;
spin_lock_irqsave(&memcg->move_lock, *flags);
- if (memcg != pc->mem_cgroup) {
+ if (memcg != page->mem_cgroup) {
spin_unlock_irqrestore(&memcg->move_lock, *flags);
goto again;
}
@@ -2405,15 +2400,12 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
struct mem_cgroup *memcg;
- struct page_cgroup *pc;
unsigned short id;
swp_entry_t ent;
VM_BUG_ON_PAGE(!PageLocked(page), page);
- pc = lookup_page_cgroup(page);
- memcg = pc->mem_cgroup;
-
+ memcg = page->mem_cgroup;
if (memcg) {
if (!css_tryget_online(&memcg->css))
memcg = NULL;
@@ -2463,10 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated)
static void commit_charge(struct page *page, struct mem_cgroup *memcg,
bool lrucare)
{
- struct page_cgroup *pc = lookup_page_cgroup(page);
int isolated;
- VM_BUG_ON_PAGE(pc->mem_cgroup, page);
+ VM_BUG_ON_PAGE(page->mem_cgroup, page);
/*
* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
@@ -2477,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
/*
* Nobody should be changing or seriously looking at
- * pc->mem_cgroup at this point:
+ * page->mem_cgroup at this point:
*
* - the page is uncharged
*
@@ -2489,7 +2480,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
* - a page cache insertion, a swapin fault, or a migration
* have the page locked
*/
- pc->mem_cgroup = memcg;
+ page->mem_cgroup = memcg;
if (lrucare)
unlock_page_lru(page, isolated);
@@ -2972,8 +2963,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
int order)
{
- struct page_cgroup *pc;
-
VM_BUG_ON(mem_cgroup_is_root(memcg));
/* The page allocation failed. Revert */
@@ -2981,14 +2970,12 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
memcg_uncharge_kmem(memcg, 1 << order);
return;
}
- pc = lookup_page_cgroup(page);
- pc->mem_cgroup = memcg;
+ page->mem_cgroup = memcg;
}
void __memcg_kmem_uncharge_pages(struct page *page, int order)
{
- struct page_cgroup *pc = lookup_page_cgroup(page);
- struct mem_cgroup *memcg = pc->mem_cgroup;
+ struct mem_cgroup *memcg = page->mem_cgroup;
if (!memcg)
return;
@@ -2996,7 +2983,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
memcg_uncharge_kmem(memcg, 1 << order);
- pc->mem_cgroup = NULL;
+ page->mem_cgroup = NULL;
}
#else
static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
@@ -3014,16 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
- struct page_cgroup *pc = lookup_page_cgroup(head);
int i;
if (mem_cgroup_disabled())
return;
for (i = 1; i < HPAGE_PMD_NR; i++)
- pc[i].mem_cgroup = pc[0].mem_cgroup;
+ head[i].mem_cgroup = head->mem_cgroup;
- __this_cpu_sub(pc[0].mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+ __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3032,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
* mem_cgroup_move_account - move account of the page
* @page: the page
* @nr_pages: number of regular pages (>1 for huge pages)
- * @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
@@ -3045,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
*/
static int mem_cgroup_move_account(struct page *page,
unsigned int nr_pages,
- struct page_cgroup *pc,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
@@ -3065,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page,
goto out;
/*
- * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
+ * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
* of its source page while we change it: page migration takes
* both pages off the LRU, but page cache replacement doesn't.
*/
@@ -3073,7 +3057,7 @@ static int mem_cgroup_move_account(struct page *page,
goto out;
ret = -EINVAL;
- if (pc->mem_cgroup != from)
+ if (page->mem_cgroup != from)
goto out_unlock;
spin_lock_irqsave(&from->move_lock, flags);
@@ -3093,13 +3077,13 @@ static int mem_cgroup_move_account(struct page *page,
}
/*
- * It is safe to change pc->mem_cgroup here because the page
+ * It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
* uncharging, charging, migration, or LRU putback.
*/
/* caller should have done css_get */
- pc->mem_cgroup = to;
+ page->mem_cgroup = to;
spin_unlock_irqrestore(&from->move_lock, flags);
ret = 0;
@@ -3174,36 +3158,17 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
#endif
#ifdef CONFIG_DEBUG_VM
-static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
-{
- struct page_cgroup *pc;
-
- pc = lookup_page_cgroup(page);
- /*
- * Can be NULL while feeding pages into the page allocator for
- * the first time, i.e. during boot or memory hotplug;
- * or when mem_cgroup_disabled().
- */
- if (likely(pc) && pc->mem_cgroup)
- return pc;
- return NULL;
-}
-
bool mem_cgroup_bad_page_check(struct page *page)
{
if (mem_cgroup_disabled())
return false;
- return lookup_page_cgroup_used(page) != NULL;
+ return page->mem_cgroup != NULL;
}
void mem_cgroup_print_bad_page(struct page *page)
{
- struct page_cgroup *pc;
-
- pc = lookup_page_cgroup_used(page);
- if (pc)
- pr_alert("pc:%p pc->mem_cgroup:%p\n", pc, pc->mem_cgroup);
+ pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
}
#endif
@@ -5123,7 +5088,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
- struct page_cgroup *pc;
enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
@@ -5137,13 +5101,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (!page && !ent.val)
return ret;
if (page) {
- pc = lookup_page_cgroup(page);
/*
* Do only loose check w/o serialization.
- * mem_cgroup_move_account() checks the pc is valid or
+ * mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
- if (pc->mem_cgroup == mc.from) {
+ if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (target)
target->page = page;
@@ -5171,15 +5134,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, union mc_target *target)
{
struct page *page = NULL;
- struct page_cgroup *pc;
enum mc_target_type ret = MC_TARGET_NONE;
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!move_anon())
return ret;
- pc = lookup_page_cgroup(page);
- if (pc->mem_cgroup == mc.from) {
+ if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
@@ -5378,7 +5339,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct page *page;
- struct page_cgroup *pc;
/*
* We don't take compound_lock() here but no race with splitting thp
@@ -5399,9 +5359,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (target_type == MC_TARGET_PAGE) {
page = target.page;
if (!isolate_lru_page(page)) {
- pc = lookup_page_cgroup(page);
if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
- pc, mc.from, mc.to)) {
+ mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
@@ -5429,9 +5388,7 @@ retry:
page = target.page;
if (isolate_lru_page(page))
goto put;
- pc = lookup_page_cgroup(page);
- if (!mem_cgroup_move_account(page, 1, pc,
- mc.from, mc.to)) {
+ if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -5623,7 +5580,6 @@ static void __init enable_swap_cgroup(void)
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
struct mem_cgroup *memcg;
- struct page_cgroup *pc;
unsigned short oldid;
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5632,8 +5588,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!do_swap_account)
return;
- pc = lookup_page_cgroup(page);
- memcg = pc->mem_cgroup;
+ memcg = page->mem_cgroup;
/* Readahead page, never charged */
if (!memcg)
@@ -5643,7 +5598,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(oldid, page);
mem_cgroup_swap_statistics(memcg, true);
- pc->mem_cgroup = NULL;
+ page->mem_cgroup = NULL;
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, 1);
@@ -5710,7 +5665,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
goto out;
if (PageSwapCache(page)) {
- struct page_cgroup *pc = lookup_page_cgroup(page);
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
@@ -5718,7 +5672,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
* the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
- if (pc->mem_cgroup)
+ if (page->mem_cgroup)
goto out;
}
@@ -5871,7 +5825,6 @@ static void uncharge_list(struct list_head *page_list)
next = page_list->next;
do {
unsigned int nr_pages = 1;
- struct page_cgroup *pc;
page = list_entry(next, struct page, lru);
next = page->lru.next;
@@ -5879,23 +5832,22 @@ static void uncharge_list(struct list_head *page_list)
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
- pc = lookup_page_cgroup(page);
- if (!pc->mem_cgroup)
+ if (!page->mem_cgroup)
continue;
/*
* Nobody should be changing or seriously looking at
- * pc->mem_cgroup at this point, we have fully
+ * page->mem_cgroup at this point, we have fully
* exclusive access to the page.
*/
- if (memcg != pc->mem_cgroup) {
+ if (memcg != page->mem_cgroup) {
if (memcg) {
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
nr_huge, page);
pgpgout = nr_anon = nr_file = nr_huge = 0;
}
- memcg = pc->mem_cgroup;
+ memcg = page->mem_cgroup;
}
if (PageTransHuge(page)) {
@@ -5909,7 +5861,7 @@ static void uncharge_list(struct list_head *page_list)
else
nr_file += nr_pages;
- pc->mem_cgroup = NULL;
+ page->mem_cgroup = NULL;
pgpgout++;
} while (next != page_list);
@@ -5928,14 +5880,11 @@ static void uncharge_list(struct list_head *page_list)
*/
void mem_cgroup_uncharge(struct page *page)
{
- struct page_cgroup *pc;
-
if (mem_cgroup_disabled())
return;
/* Don't touch page->lru of any random page, pre-check: */
- pc = lookup_page_cgroup(page);
- if (!pc->mem_cgroup)
+ if (!page->mem_cgroup)
return;
INIT_LIST_HEAD(&page->lru);
@@ -5972,7 +5921,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
bool lrucare)
{
struct mem_cgroup *memcg;
- struct page_cgroup *pc;
int isolated;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -5987,8 +5935,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
return;
/* Page cache replacement: new page already charged? */
- pc = lookup_page_cgroup(newpage);
- if (pc->mem_cgroup)
+ if (newpage->mem_cgroup)
return;
/*
@@ -5997,15 +5944,14 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
* uncharged page when the PFN walker finds a page that
* reclaim just put back on the LRU but has not released yet.
*/
- pc = lookup_page_cgroup(oldpage);
- memcg = pc->mem_cgroup;
+ memcg = oldpage->mem_cgroup;
if (!memcg)
return;
if (lrucare)
lock_page_lru(oldpage, &isolated);
- pc->mem_cgroup = NULL;
+ oldpage->mem_cgroup = NULL;
if (lrucare)
unlock_page_lru(oldpage, isolated);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fa9426389c6c..6a952237a677 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,7 +48,6 @@
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
-#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
@@ -4899,7 +4898,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
- pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5331c2bd85a2..f0f31c1d4d0c 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -1,326 +1,7 @@
#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/bootmem.h>
-#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
-#include <linux/hash.h>
-#include <linux/slab.h>
-#include <linux/memory.h>
#include <linux/vmalloc.h>
-#include <linux/cgroup.h>
#include <linux/swapops.h>
-#include <linux/kmemleak.h>
-
-static unsigned long total_usage;
-
-#if !defined(CONFIG_SPARSEMEM)
-
-
-void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
-{
- pgdat->node_page_cgroup = NULL;
-}
-
-struct page_cgroup *lookup_page_cgroup(struct page *page)
-{
- unsigned long pfn = page_to_pfn(page);
- unsigned long offset;
- struct page_cgroup *base;
-
- base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
-#ifdef CONFIG_DEBUG_VM
- /*
- * The sanity checks the page allocator does upon freeing a
- * page can reach here before the page_cgroup arrays are
- * allocated when feeding a range of pages to the allocator
- * for the first time during bootup or memory hotplug.
- */
- if (unlikely(!base))
- return NULL;
-#endif
- offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
- return base + offset;
-}
-
-static int __init alloc_node_page_cgroup(int nid)
-{
- struct page_cgroup *base;
- unsigned long table_size;
- unsigned long nr_pages;
-
- nr_pages = NODE_DATA(nid)->node_spanned_pages;
- if (!nr_pages)
- return 0;
-
- table_size = sizeof(struct page_cgroup) * nr_pages;
-
- base = memblock_virt_alloc_try_nid_nopanic(
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nid);
- if (!base)
- return -ENOMEM;
- NODE_DATA(nid)->node_page_cgroup = base;
- total_usage += table_size;
- return 0;
-}
-
-void __init page_cgroup_init_flatmem(void)
-{
-
- int nid, fail;
-
- if (mem_cgroup_disabled())
- return;
-
- for_each_online_node(nid) {
- fail = alloc_node_page_cgroup(nid);
- if (fail)
- goto fail;
- }
- printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
- " don't want memory cgroups\n");
- return;
-fail:
- printk(KERN_CRIT "allocation of page_cgroup failed.\n");
- printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
- panic("Out of memory");
-}
-
-#else /* CONFIG_FLAT_NODE_MEM_MAP */
-
-struct page_cgroup *lookup_page_cgroup(struct page *page)
-{
- unsigned long pfn = page_to_pfn(page);
- struct mem_section *section = __pfn_to_section(pfn);
-#ifdef CONFIG_DEBUG_VM
- /*
- * The sanity checks the page allocator does upon freeing a
- * page can reach here before the page_cgroup arrays are
- * allocated when feeding a range of pages to the allocator
- * for the first time during bootup or memory hotplug.
- */
- if (!section->page_cgroup)
- return NULL;
-#endif
- return section->page_cgroup + pfn;
-}
-
-static void *__meminit alloc_page_cgroup(size_t size, int nid)
-{
- gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
- void *addr = NULL;
-
- addr = alloc_pages_exact_nid(nid, size, flags);
- if (addr) {
- kmemleak_alloc(addr, size, 1, flags);
- return addr;
- }
-
- if (node_state(nid, N_HIGH_MEMORY))
- addr = vzalloc_node(size, nid);
- else
- addr = vzalloc(size);
-
- return addr;
-}
-
-static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
-{
- struct mem_section *section;
- struct page_cgroup *base;
- unsigned long table_size;
-
- section = __pfn_to_section(pfn);
-
- if (section->page_cgroup)
- return 0;
-
- table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
- base = alloc_page_cgroup(table_size, nid);
-
- /*
- * The value stored in section->page_cgroup is (base - pfn)
- * and it does not point to the memory block allocated above,
- * causing kmemleak false positives.
- */
- kmemleak_not_leak(base);
-
- if (!base) {
- printk(KERN_ERR "page cgroup allocation failure\n");
- return -ENOMEM;
- }
-
- /*
- * The passed "pfn" may not be aligned to SECTION. For the calculation
- * we need to apply a mask.
- */
- pfn &= PAGE_SECTION_MASK;
- section->page_cgroup = base - pfn;
- total_usage += table_size;
- return 0;
-}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static void free_page_cgroup(void *addr)
-{
- if (is_vmalloc_addr(addr)) {
- vfree(addr);
- } else {
- struct page *page = virt_to_page(addr);
- size_t table_size =
- sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-
- BUG_ON(PageReserved(page));
- kmemleak_free(addr);
- free_pages_exact(addr, table_size);
- }
-}
-
-static void __free_page_cgroup(unsigned long pfn)
-{
- struct mem_section *ms;
- struct page_cgroup *base;
-
- ms = __pfn_to_section(pfn);
- if (!ms || !ms->page_cgroup)
- return;
- base = ms->page_cgroup + pfn;
- free_page_cgroup(base);
- ms->page_cgroup = NULL;
-}
-
-static int __meminit online_page_cgroup(unsigned long start_pfn,
- unsigned long nr_pages,
- int nid)
-{
- unsigned long start, end, pfn;
- int fail = 0;
-
- start = SECTION_ALIGN_DOWN(start_pfn);
- end = SECTION_ALIGN_UP(start_pfn + nr_pages);
-
- if (nid == -1) {
- /*
- * In this case, "nid" already exists and contains valid memory.
- * "start_pfn" passed to us is a pfn which is an arg for
- * online__pages(), and start_pfn should exist.
- */
- nid = pfn_to_nid(start_pfn);
- VM_BUG_ON(!node_state(nid, N_ONLINE));
- }
-
- for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
- if (!pfn_present(pfn))
- continue;
- fail = init_section_page_cgroup(pfn, nid);
- }
- if (!fail)
- return 0;
-
- /* rollback */
- for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
- __free_page_cgroup(pfn);
-
- return -ENOMEM;
-}
-
-static int __meminit offline_page_cgroup(unsigned long start_pfn,
- unsigned long nr_pages, int nid)
-{
- unsigned long start, end, pfn;
-
- start = SECTION_ALIGN_DOWN(start_pfn);
- end = SECTION_ALIGN_UP(start_pfn + nr_pages);
-
- for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
- __free_page_cgroup(pfn);
- return 0;
-
-}
-
-static int __meminit page_cgroup_callback(struct notifier_block *self,
- unsigned long action, void *arg)
-{
- struct memory_notify *mn = arg;
- int ret = 0;
- switch (action) {
- case MEM_GOING_ONLINE:
- ret = online_page_cgroup(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
- break;
- case MEM_OFFLINE:
- offline_page_cgroup(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
- break;
- case MEM_CANCEL_ONLINE:
- offline_page_cgroup(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
- break;
- case MEM_GOING_OFFLINE:
- break;
- case MEM_ONLINE:
- case MEM_CANCEL_OFFLINE:
- break;
- }
-
- return notifier_from_errno(ret);
-}
-
-#endif
-
-void __init page_cgroup_init(void)
-{
- unsigned long pfn;
- int nid;
-
- if (mem_cgroup_disabled())
- return;
-
- for_each_node_state(nid, N_MEMORY) {
- unsigned long start_pfn, end_pfn;
-
- start_pfn = node_start_pfn(nid);
- end_pfn = node_end_pfn(nid);
- /*
- * start_pfn and end_pfn may not be aligned to SECTION and the
- * page->flags of out of node pages are not initialized. So we
- * scan [start_pfn, the biggest section's pfn < end_pfn) here.
- */
- for (pfn = start_pfn;
- pfn < end_pfn;
- pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
-
- if (!pfn_valid(pfn))
- continue;
- /*
- * Nodes's pfns can be overlapping.
- * We know some arch can have a nodes layout such as
- * -------------pfn-------------->
- * N0 | N1 | N2 | N0 | N1 | N2|....
- */
- if (pfn_to_nid(pfn) != nid)
- continue;
- if (init_section_page_cgroup(pfn, nid))
- goto oom;
- }
- }
- hotplug_memory_notifier(page_cgroup_callback, 0);
- printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
- printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
- "don't want memory cgroups\n");
- return;
-oom:
- printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
- panic("Out of memory");
-}
-
-void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
-{
- return;
-}
-
-#endif
-
#ifdef CONFIG_MEMCG_SWAP
--
2.1.3
Now that the external page_cgroup data structure and its lookup is
gone, the only code remaining in there is swap slot accounting.
Rename it and move the conditional compilation into mm/Makefile.
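[Illustrative aside, not part of the patch: a self-contained userspace sketch of the swap
slot accounting that remains in mm/swap_cgroup.c, i.e. a per-swap-device table of 16-bit
memcg ids recorded at swapout and looked up at swapin/free. It is simplified (one flat
table, no locking, no page-sized chunking), and the *_model names are made up for
illustration.]
#include <stdio.h>
#include <stdlib.h>

struct swap_cgroup { unsigned short id; };      /* same layout as the real one */

static struct swap_cgroup *map;                 /* one entry per swap slot */
static unsigned long map_len;

static int swap_cgroup_swapon_model(unsigned long max_pages)
{
        map = calloc(max_pages, sizeof(*map));
        if (!map)
                return -1;
        map_len = max_pages;
        return 0;
}

/* Record which memcg id owns a swapped-out slot; return the previous owner. */
static unsigned short swap_cgroup_record_model(unsigned long slot,
                                               unsigned short id)
{
        unsigned short old;

        if (slot >= map_len)
                return 0;
        old = map[slot].id;
        map[slot].id = id;
        return old;
}

static unsigned short lookup_swap_cgroup_id_model(unsigned long slot)
{
        return slot < map_len ? map[slot].id : 0;
}

int main(void)
{
        if (swap_cgroup_swapon_model(1024))
                return 1;
        swap_cgroup_record_model(42, 7);        /* memcg id 7 swapped a page out */
        printf("slot 42 is charged to memcg id %u\n",
               (unsigned int)lookup_swap_cgroup_id_model(42));
        free(map);                              /* swapoff counterpart */
        return 0;
}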
Signed-off-by: Johannes Weiner <[email protected]>
---
MAINTAINERS | 2 +-
include/linux/page_cgroup.h | 40 ---------
include/linux/swap_cgroup.h | 42 +++++++++
mm/Makefile | 3 +-
mm/memcontrol.c | 2 +-
mm/page_cgroup.c | 211 --------------------------------------------
mm/swap_cgroup.c | 208 +++++++++++++++++++++++++++++++++++++++++++
mm/swap_state.c | 1 -
mm/swapfile.c | 2 +-
9 files changed, 255 insertions(+), 256 deletions(-)
delete mode 100644 include/linux/page_cgroup.h
create mode 100644 include/linux/swap_cgroup.h
delete mode 100644 mm/page_cgroup.c
create mode 100644 mm/swap_cgroup.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 7e31be07197e..3a60389d3a13 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2583,7 +2583,7 @@ L: [email protected]
L: [email protected]
S: Maintained
F: mm/memcontrol.c
-F: mm/page_cgroup.c
+F: mm/swap_cgroup.c
CORETEMP HARDWARE MONITORING DRIVER
M: Fenghua Yu <[email protected]>
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
deleted file mode 100644
index 65be35785c86..000000000000
--- a/include/linux/page_cgroup.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef __LINUX_PAGE_CGROUP_H
-#define __LINUX_PAGE_CGROUP_H
-
-#include <linux/swap.h>
-
-#ifdef CONFIG_MEMCG_SWAP
-extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
- unsigned short old, unsigned short new);
-extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
-extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
-extern int swap_cgroup_swapon(int type, unsigned long max_pages);
-extern void swap_cgroup_swapoff(int type);
-#else
-
-static inline
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
-{
- return 0;
-}
-
-static inline
-unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
-{
- return 0;
-}
-
-static inline int
-swap_cgroup_swapon(int type, unsigned long max_pages)
-{
- return 0;
-}
-
-static inline void swap_cgroup_swapoff(int type)
-{
- return;
-}
-
-#endif /* CONFIG_MEMCG_SWAP */
-
-#endif /* __LINUX_PAGE_CGROUP_H */
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
new file mode 100644
index 000000000000..145306bdc92f
--- /dev/null
+++ b/include/linux/swap_cgroup.h
@@ -0,0 +1,42 @@
+#ifndef __LINUX_SWAP_CGROUP_H
+#define __LINUX_SWAP_CGROUP_H
+
+#include <linux/swap.h>
+
+#ifdef CONFIG_MEMCG_SWAP
+
+extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new);
+extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
+extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
+extern int swap_cgroup_swapon(int type, unsigned long max_pages);
+extern void swap_cgroup_swapoff(int type);
+
+#else
+
+static inline
+unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
+{
+ return 0;
+}
+
+static inline
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
+{
+ return 0;
+}
+
+static inline int
+swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+ return 0;
+}
+
+static inline void swap_cgroup_swapoff(int type)
+{
+ return;
+}
+
+#endif /* CONFIG_MEMCG_SWAP */
+
+#endif /* __LINUX_SWAP_CGROUP_H */
diff --git a/mm/Makefile b/mm/Makefile
index 27ddb80403a9..d9d579484f15 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -56,7 +56,8 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
-obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
+obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dc5e0abb18cb..fbb41a170eae 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,7 +51,7 @@
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
-#include <linux/page_cgroup.h>
+#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
deleted file mode 100644
index f0f31c1d4d0c..000000000000
--- a/mm/page_cgroup.c
+++ /dev/null
@@ -1,211 +0,0 @@
-#include <linux/mm.h>
-#include <linux/page_cgroup.h>
-#include <linux/vmalloc.h>
-#include <linux/swapops.h>
-
-#ifdef CONFIG_MEMCG_SWAP
-
-static DEFINE_MUTEX(swap_cgroup_mutex);
-struct swap_cgroup_ctrl {
- struct page **map;
- unsigned long length;
- spinlock_t lock;
-};
-
-static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
-
-struct swap_cgroup {
- unsigned short id;
-};
-#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
-
-/*
- * SwapCgroup implements "lookup" and "exchange" operations.
- * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
- * against SwapCache. At swap_free(), this is accessed directly from swap.
- *
- * This means,
- * - we have no race in "exchange" when we're accessed via SwapCache because
- * SwapCache(and its swp_entry) is under lock.
- * - When called via swap_free(), there is no user of this entry and no race.
- * Then, we don't need lock around "exchange".
- *
- * TODO: we can push these buffers out to HIGHMEM.
- */
-
-/*
- * allocate buffer for swap_cgroup.
- */
-static int swap_cgroup_prepare(int type)
-{
- struct page *page;
- struct swap_cgroup_ctrl *ctrl;
- unsigned long idx, max;
-
- ctrl = &swap_cgroup_ctrl[type];
-
- for (idx = 0; idx < ctrl->length; idx++) {
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page)
- goto not_enough_page;
- ctrl->map[idx] = page;
- }
- return 0;
-not_enough_page:
- max = idx;
- for (idx = 0; idx < max; idx++)
- __free_page(ctrl->map[idx]);
-
- return -ENOMEM;
-}
-
-static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
- struct swap_cgroup_ctrl **ctrlp)
-{
- pgoff_t offset = swp_offset(ent);
- struct swap_cgroup_ctrl *ctrl;
- struct page *mappage;
- struct swap_cgroup *sc;
-
- ctrl = &swap_cgroup_ctrl[swp_type(ent)];
- if (ctrlp)
- *ctrlp = ctrl;
-
- mappage = ctrl->map[offset / SC_PER_PAGE];
- sc = page_address(mappage);
- return sc + offset % SC_PER_PAGE;
-}
-
-/**
- * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @ent: swap entry to be cmpxchged
- * @old: old id
- * @new: new id
- *
- * Returns old id at success, 0 at failure.
- * (There is no mem_cgroup using 0 as its id)
- */
-unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
- unsigned short old, unsigned short new)
-{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned long flags;
- unsigned short retval;
-
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- retval = sc->id;
- if (retval == old)
- sc->id = new;
- else
- retval = 0;
- spin_unlock_irqrestore(&ctrl->lock, flags);
- return retval;
-}
-
-/**
- * swap_cgroup_record - record mem_cgroup for this swp_entry.
- * @ent: swap entry to be recorded into
- * @id: mem_cgroup to be recorded
- *
- * Returns old value at success, 0 at failure.
- * (Of course, old value can be 0.)
- */
-unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
-{
- struct swap_cgroup_ctrl *ctrl;
- struct swap_cgroup *sc;
- unsigned short old;
- unsigned long flags;
-
- sc = lookup_swap_cgroup(ent, &ctrl);
-
- spin_lock_irqsave(&ctrl->lock, flags);
- old = sc->id;
- sc->id = id;
- spin_unlock_irqrestore(&ctrl->lock, flags);
-
- return old;
-}
-
-/**
- * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
- * @ent: swap entry to be looked up.
- *
- * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
- */
-unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
-{
- return lookup_swap_cgroup(ent, NULL)->id;
-}
-
-int swap_cgroup_swapon(int type, unsigned long max_pages)
-{
- void *array;
- unsigned long array_size;
- unsigned long length;
- struct swap_cgroup_ctrl *ctrl;
-
- if (!do_swap_account)
- return 0;
-
- length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
- array_size = length * sizeof(void *);
-
- array = vzalloc(array_size);
- if (!array)
- goto nomem;
-
- ctrl = &swap_cgroup_ctrl[type];
- mutex_lock(&swap_cgroup_mutex);
- ctrl->length = length;
- ctrl->map = array;
- spin_lock_init(&ctrl->lock);
- if (swap_cgroup_prepare(type)) {
- /* memory shortage */
- ctrl->map = NULL;
- ctrl->length = 0;
- mutex_unlock(&swap_cgroup_mutex);
- vfree(array);
- goto nomem;
- }
- mutex_unlock(&swap_cgroup_mutex);
-
- return 0;
-nomem:
- printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
- printk(KERN_INFO
- "swap_cgroup can be disabled by swapaccount=0 boot option\n");
- return -ENOMEM;
-}
-
-void swap_cgroup_swapoff(int type)
-{
- struct page **map;
- unsigned long i, length;
- struct swap_cgroup_ctrl *ctrl;
-
- if (!do_swap_account)
- return;
-
- mutex_lock(&swap_cgroup_mutex);
- ctrl = &swap_cgroup_ctrl[type];
- map = ctrl->map;
- length = ctrl->length;
- ctrl->map = NULL;
- ctrl->length = 0;
- mutex_unlock(&swap_cgroup_mutex);
-
- if (map) {
- for (i = 0; i < length; i++) {
- struct page *page = map[i];
- if (page)
- __free_page(page);
- }
- vfree(map);
- }
-}
-
-#endif
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
new file mode 100644
index 000000000000..b5f7f24b8dd1
--- /dev/null
+++ b/mm/swap_cgroup.c
@@ -0,0 +1,208 @@
+#include <linux/swap_cgroup.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+#include <linux/swapops.h> /* depends on mm.h include */
+
+static DEFINE_MUTEX(swap_cgroup_mutex);
+struct swap_cgroup_ctrl {
+ struct page **map;
+ unsigned long length;
+ spinlock_t lock;
+};
+
+static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
+
+struct swap_cgroup {
+ unsigned short id;
+};
+#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
+
+/*
+ * SwapCgroup implements "lookup" and "exchange" operations.
+ * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
+ * against SwapCache. At swap_free(), this is accessed directly from swap.
+ *
+ * This means,
+ * - we have no race in "exchange" when we're accessed via SwapCache because
+ * SwapCache(and its swp_entry) is under lock.
+ * - When called via swap_free(), there is no user of this entry and no race.
+ * Then, we don't need lock around "exchange".
+ *
+ * TODO: we can push these buffers out to HIGHMEM.
+ */
+
+/*
+ * allocate buffer for swap_cgroup.
+ */
+static int swap_cgroup_prepare(int type)
+{
+ struct page *page;
+ struct swap_cgroup_ctrl *ctrl;
+ unsigned long idx, max;
+
+ ctrl = &swap_cgroup_ctrl[type];
+
+ for (idx = 0; idx < ctrl->length; idx++) {
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto not_enough_page;
+ ctrl->map[idx] = page;
+ }
+ return 0;
+not_enough_page:
+ max = idx;
+ for (idx = 0; idx < max; idx++)
+ __free_page(ctrl->map[idx]);
+
+ return -ENOMEM;
+}
+
+static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
+ struct swap_cgroup_ctrl **ctrlp)
+{
+ pgoff_t offset = swp_offset(ent);
+ struct swap_cgroup_ctrl *ctrl;
+ struct page *mappage;
+ struct swap_cgroup *sc;
+
+ ctrl = &swap_cgroup_ctrl[swp_type(ent)];
+ if (ctrlp)
+ *ctrlp = ctrl;
+
+ mappage = ctrl->map[offset / SC_PER_PAGE];
+ sc = page_address(mappage);
+ return sc + offset % SC_PER_PAGE;
+}
+
+/**
+ * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
+ * @ent: swap entry to be cmpxchged
+ * @old: old id
+ * @new: new id
+ *
+ * Returns old id at success, 0 at failure.
+ * (There is no mem_cgroup using 0 as its id)
+ */
+unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
+ unsigned short old, unsigned short new)
+{
+ struct swap_cgroup_ctrl *ctrl;
+ struct swap_cgroup *sc;
+ unsigned long flags;
+ unsigned short retval;
+
+ sc = lookup_swap_cgroup(ent, &ctrl);
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ retval = sc->id;
+ if (retval == old)
+ sc->id = new;
+ else
+ retval = 0;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ return retval;
+}
+
+/**
+ * swap_cgroup_record - record mem_cgroup for this swp_entry.
+ * @ent: swap entry to be recorded into
+ * @id: mem_cgroup to be recorded
+ *
+ * Returns old value at success, 0 at failure.
+ * (Of course, old value can be 0.)
+ */
+unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
+{
+ struct swap_cgroup_ctrl *ctrl;
+ struct swap_cgroup *sc;
+ unsigned short old;
+ unsigned long flags;
+
+ sc = lookup_swap_cgroup(ent, &ctrl);
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ old = sc->id;
+ sc->id = id;
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+
+ return old;
+}
+
+/**
+ * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
+ * @ent: swap entry to be looked up.
+ *
+ * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
+ */
+unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
+{
+ return lookup_swap_cgroup(ent, NULL)->id;
+}
+
+int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+ void *array;
+ unsigned long array_size;
+ unsigned long length;
+ struct swap_cgroup_ctrl *ctrl;
+
+ if (!do_swap_account)
+ return 0;
+
+ length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
+ array_size = length * sizeof(void *);
+
+ array = vzalloc(array_size);
+ if (!array)
+ goto nomem;
+
+ ctrl = &swap_cgroup_ctrl[type];
+ mutex_lock(&swap_cgroup_mutex);
+ ctrl->length = length;
+ ctrl->map = array;
+ spin_lock_init(&ctrl->lock);
+ if (swap_cgroup_prepare(type)) {
+ /* memory shortage */
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+ vfree(array);
+ goto nomem;
+ }
+ mutex_unlock(&swap_cgroup_mutex);
+
+ return 0;
+nomem:
+ printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
+ printk(KERN_INFO
+ "swap_cgroup can be disabled by swapaccount=0 boot option\n");
+ return -ENOMEM;
+}
+
+void swap_cgroup_swapoff(int type)
+{
+ struct page **map;
+ unsigned long i, length;
+ struct swap_cgroup_ctrl *ctrl;
+
+ if (!do_swap_account)
+ return;
+
+ mutex_lock(&swap_cgroup_mutex);
+ ctrl = &swap_cgroup_ctrl[type];
+ map = ctrl->map;
+ length = ctrl->length;
+ ctrl->map = NULL;
+ ctrl->length = 0;
+ mutex_unlock(&swap_cgroup_mutex);
+
+ if (map) {
+ for (i = 0; i < length; i++) {
+ struct page *page = map[i];
+ if (page)
+ __free_page(page);
+ }
+ vfree(map);
+ }
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 154444918685..9711342987a0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,7 +17,6 @@
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
-#include <linux/page_cgroup.h>
#include <asm/pgtable.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8798b2e0ac59..63f55ccb9b26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,7 +38,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
-#include <linux/page_cgroup.h>
+#include <linux/swap_cgroup.h>
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
--
2.1.3
Now that the external page_cgroup data structure and its lookup is
gone, let the generic bad_page() check for page->mem_cgroup sanity.
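[Illustrative aside, not part of the patch: a self-contained userspace sketch of the
invariant the generic checks now enforce, i.e. uncharge clears page->mem_cgroup, so a
page that reaches the free or allocation checks with the pointer still set gets flagged.
Simplified stand-in types; free_pages_check_model() is a made-up name.]
#include <stdio.h>
#include <stddef.h>

struct mem_cgroup { int id; };                          /* stand-in */
struct page { struct mem_cgroup *mem_cgroup; };         /* stand-in */

/* Return a bad_reason string, or NULL if the page looks sane. */
static const char *free_pages_check_model(const struct page *page)
{
        if (page->mem_cgroup)
                return "page still charged to cgroup";
        return NULL;
}

int main(void)
{
        struct mem_cgroup memcg = { .id = 1 };
        struct page clean = { .mem_cgroup = NULL };
        struct page leaky = { .mem_cgroup = &memcg };
        const char *reason;

        reason = free_pages_check_model(&clean);
        printf("clean page: %s\n", reason ? reason : "ok");

        reason = free_pages_check_model(&leaky);
        printf("leaky page: %s\n", reason ? reason : "ok");
        return 0;
}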
Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 4 ----
mm/debug.c | 5 ++++-
mm/memcontrol.c | 15 ---------------
mm/page_alloc.c | 12 ++++++++----
4 files changed, 12 insertions(+), 24 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dafba59b31b4..e789551d4db0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -173,10 +173,6 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
void mem_cgroup_split_huge_fixup(struct page *head);
#endif
-#ifdef CONFIG_DEBUG_VM
-bool mem_cgroup_bad_page_check(struct page *page);
-void mem_cgroup_print_bad_page(struct page *page);
-#endif
#else /* CONFIG_MEMCG */
struct mem_cgroup;
diff --git a/mm/debug.c b/mm/debug.c
index 5ce45c9a29b5..0e58f3211f89 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason,
dump_flags(page->flags & badflags,
pageflag_names, ARRAY_SIZE(pageflag_names));
}
- mem_cgroup_print_bad_page(page);
+#ifdef CONFIG_MEMCG
+ if (page->mem_cgroup)
+ pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
+#endif
}
void dump_page(struct page *page, const char *reason)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fbb41a170eae..3645641513a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3157,21 +3157,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif
-#ifdef CONFIG_DEBUG_VM
-bool mem_cgroup_bad_page_check(struct page *page)
-{
- if (mem_cgroup_disabled())
- return false;
-
- return page->mem_cgroup != NULL;
-}
-
-void mem_cgroup_print_bad_page(struct page *page)
-{
- pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
-}
-#endif
-
static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6a952237a677..161da09fcda2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -653,8 +653,10 @@ static inline int free_pages_check(struct page *page)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
}
- if (unlikely(mem_cgroup_bad_page_check(page)))
- bad_reason = "cgroup check failed";
+#ifdef CONFIG_MEMCG
+ if (unlikely(page->mem_cgroup))
+ bad_reason = "page still charged to cgroup";
+#endif
if (unlikely(bad_reason)) {
bad_page(page, bad_reason, bad_flags);
return 1;
@@ -920,8 +922,10 @@ static inline int check_new_page(struct page *page)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
}
- if (unlikely(mem_cgroup_bad_page_check(page)))
- bad_reason = "cgroup check failed";
+#ifdef CONFIG_MEMCG
+ if (unlikely(page->mem_cgroup))
+ bad_reason = "page still charged to cgroup";
+#endif
if (unlikely(bad_reason)) {
bad_page(page, bad_reason, bad_flags);
return 1;
--
2.1.3
From: Johannes Weiner <[email protected]>
Date: Sat, 1 Nov 2014 23:15:54 -0400
> Memory cgroups used to have 5 per-page pointers. To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
>
> There is now only one page pointer remaining: the memcg pointer that
> indicates which cgroup the page is associated with when charged. The
> complexity of runtime allocation and the runtime translation overhead
> are no longer justified to save that *potential* 0.19% of memory. With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space. Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.
>
> text data bss dec hex filename
> 8828345 1725264 983040 11536649 b00909 vmlinux.old
> 8827425 1725264 966656 11519345 afc571 vmlinux.new
>
> Signed-off-by: Johannes Weiner <[email protected]>
Looks great:
Acked-by: David S. Miller <[email protected]>
From: Johannes Weiner <[email protected]>
Date: Sat, 1 Nov 2014 23:15:55 -0400
> Now that the external page_cgroup data structure and its lookup is
> gone, the only code remaining in there is swap slot accounting.
>
> Rename it and move the conditional compilation into mm/Makefile.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: David S. Miller <[email protected]>
From: Johannes Weiner <[email protected]>
Date: Sat, 1 Nov 2014 23:15:56 -0400
> Now that the external page_cgroup data structure and its lookup is
> gone, let the generic bad_page() check for page->mem_cgroup sanity.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: David S. Miller <[email protected]>
On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers. To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
Hello, Johannes.
I'd like to keep this translation layer.
Could you just disable that code with an #ifdef until the next user comes?
In our company we use PAGE_OWNER from the mm tree, a feature that records
who allocated each page. Using PAGE_OWNER requires modifying struct page
and therefore a re-compile, which makes the feature difficult for us to
deploy. So we decided to implement a run-time configurable PAGE_OWNER
through page_cgroup's translation layer code. Moreover, with this
infrastructure, I plan to implement some other debugging features.
Because of my laziness it hasn't been submitted to LKML yet, but I will
submit it as soon as possible. If the code is removed, I could
copy-and-paste it back, but that would lose the history of that code.
So, if possible, I'd like to keep that code for now.
Thanks.
Hi Joonsoo,
On Mon, Nov 03, 2014 at 05:02:08PM +0900, Joonsoo Kim wrote:
> On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > Memory cgroups used to have 5 per-page pointers. To allow users to
> > disable that amount of overhead during runtime, those pointers were
> > allocated in a separate array, with a translation layer between them
> > and struct page.
>
> Hello, Johannes.
>
> I'd like to keep this translation layer.
> Could you just disable that code with an #ifdef until the next user comes?
>
> In our company we use PAGE_OWNER from the mm tree, a feature that records
> who allocated each page. Using PAGE_OWNER requires modifying struct page
> and therefore a re-compile, which makes the feature difficult for us to
> deploy. So we decided to implement a run-time configurable PAGE_OWNER
> through page_cgroup's translation layer code. Moreover, with this
> infrastructure, I plan to implement some other debugging features.
>
> Because of my laziness it hasn't been submitted to LKML yet, but I will
> submit it as soon as possible. If the code is removed, I could
> copy-and-paste it back, but that would lose the history of that code.
> So, if possible, I'd like to keep that code for now.
Please re-introduce this code when your new usecase is ready to be
upstreamed. There is little reason to burden an unrelated feature
with a sizable chunk of dead code for a vague future user.
Thanks
From: Johannes Weiner <[email protected]>
Date: Mon, 3 Nov 2014 10:09:42 -0500
> Please re-introduce this code when your new usecase is ready to be
> upstreamed. There is little reason to burden an unrelated feature
> with a sizable chunk of dead code for a vague future user.
+1
On Sat 01-11-14 23:15:54, Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers. To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
>
> There is now only one page pointer remaining: the memcg pointer that
> indicates which cgroup the page is associated with when charged. The
> complexity of runtime allocation and the runtime translation overhead
> are no longer justified to save that *potential* 0.19% of memory. With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space. Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.
CONFIG_SLAB should be OK as well because there should be one slot left
for a pointer if no special debugging is enabled.
> text data bss dec hex filename
> 8828345 1725264 983040 11536649 b00909 vmlinux.old
> 8827425 1725264 966656 11519345 afc571 vmlinux.new
>
> Signed-off-by: Johannes Weiner <[email protected]>
Very nice!
Acked-by: Michal Hocko <[email protected]>
Thanks!
> ---
> include/linux/memcontrol.h | 6 +-
> include/linux/mm_types.h | 5 +
> include/linux/mmzone.h | 12 --
> include/linux/page_cgroup.h | 53 --------
> init/main.c | 7 -
> mm/memcontrol.c | 124 +++++------------
> mm/page_alloc.c | 2 -
> mm/page_cgroup.c | 319 --------------------------------------------
> 8 files changed, 41 insertions(+), 487 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index d4575a1d6e99..dafba59b31b4 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -25,7 +25,6 @@
> #include <linux/jump_label.h>
>
> struct mem_cgroup;
> -struct page_cgroup;
> struct page;
> struct mm_struct;
> struct kmem_cache;
> @@ -466,8 +465,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
> * memcg_kmem_uncharge_pages: uncharge pages from memcg
> * @page: pointer to struct page being freed
> * @order: allocation order.
> - *
> - * there is no need to specify memcg here, since it is embedded in page_cgroup
> */
> static inline void
> memcg_kmem_uncharge_pages(struct page *page, int order)
> @@ -484,8 +481,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order)
> *
> * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
> * failure of the allocation. if @page is NULL, this function will revert the
> - * charges. Otherwise, it will commit the memcg given by @memcg to the
> - * corresponding page_cgroup.
> + * charges. Otherwise, it will commit @page to @memcg.
> */
> static inline void
> memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index de183328abb0..57e47dffbdda 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -23,6 +23,7 @@
> #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
>
> struct address_space;
> +struct mem_cgroup;
>
> #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
> #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
> @@ -168,6 +169,10 @@ struct page {
> struct page *first_page; /* Compound tail pages */
> };
>
> +#ifdef CONFIG_MEMCG
> + struct mem_cgroup *mem_cgroup;
> +#endif
> +
> /*
> * On machines where all RAM is mapped into kernel address space,
> * we can simply calculate the virtual address. On machines with
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 48bf12ef6620..de32d9344446 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -713,9 +713,6 @@ typedef struct pglist_data {
> int nr_zones;
> #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
> struct page *node_mem_map;
> -#ifdef CONFIG_MEMCG
> - struct page_cgroup *node_page_cgroup;
> -#endif
> #endif
> #ifndef CONFIG_NO_BOOTMEM
> struct bootmem_data *bdata;
> @@ -1069,7 +1066,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
> #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
>
> struct page;
> -struct page_cgroup;
> struct mem_section {
> /*
> * This is, logically, a pointer to an array of struct
> @@ -1087,14 +1083,6 @@ struct mem_section {
>
> /* See declaration of similar field in struct zone */
> unsigned long *pageblock_flags;
> -#ifdef CONFIG_MEMCG
> - /*
> - * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
> - * section. (see memcontrol.h/page_cgroup.h about this.)
> - */
> - struct page_cgroup *page_cgroup;
> - unsigned long pad;
> -#endif
> /*
> * WARNING: mem_section must be a power-of-2 in size for the
> * calculation and use of SECTION_ROOT_MASK to make sense.
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 1289be6b436c..65be35785c86 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -1,59 +1,6 @@
> #ifndef __LINUX_PAGE_CGROUP_H
> #define __LINUX_PAGE_CGROUP_H
>
> -struct pglist_data;
> -
> -#ifdef CONFIG_MEMCG
> -struct mem_cgroup;
> -
> -/*
> - * Page Cgroup can be considered as an extended mem_map.
> - * A page_cgroup page is associated with every page descriptor. The
> - * page_cgroup helps us identify information about the cgroup
> - * All page cgroups are allocated at boot or memory hotplug event,
> - * then the page cgroup for pfn always exists.
> - */
> -struct page_cgroup {
> - struct mem_cgroup *mem_cgroup;
> -};
> -
> -extern void pgdat_page_cgroup_init(struct pglist_data *pgdat);
> -
> -#ifdef CONFIG_SPARSEMEM
> -static inline void page_cgroup_init_flatmem(void)
> -{
> -}
> -extern void page_cgroup_init(void);
> -#else
> -extern void page_cgroup_init_flatmem(void);
> -static inline void page_cgroup_init(void)
> -{
> -}
> -#endif
> -
> -struct page_cgroup *lookup_page_cgroup(struct page *page);
> -
> -#else /* !CONFIG_MEMCG */
> -struct page_cgroup;
> -
> -static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
> -{
> -}
> -
> -static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
> -{
> - return NULL;
> -}
> -
> -static inline void page_cgroup_init(void)
> -{
> -}
> -
> -static inline void page_cgroup_init_flatmem(void)
> -{
> -}
> -#endif /* CONFIG_MEMCG */
> -
> #include <linux/swap.h>
>
> #ifdef CONFIG_MEMCG_SWAP
> diff --git a/init/main.c b/init/main.c
> index c4912d9abaee..a091bbcadd33 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -51,7 +51,6 @@
> #include <linux/mempolicy.h>
> #include <linux/key.h>
> #include <linux/buffer_head.h>
> -#include <linux/page_cgroup.h>
> #include <linux/debug_locks.h>
> #include <linux/debugobjects.h>
> #include <linux/lockdep.h>
> @@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void)
> */
> static void __init mm_init(void)
> {
> - /*
> - * page_cgroup requires contiguous pages,
> - * bigger than MAX_ORDER unless SPARSEMEM.
> - */
> - page_cgroup_init_flatmem();
> mem_init();
> kmem_cache_init();
> percpu_init_late();
> @@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void)
> initrd_start = 0;
> }
> #endif
> - page_cgroup_init();
> debug_objects_mem_init();
> kmemleak_init();
> setup_per_cpu_pageset();
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 19bac2ce827c..dc5e0abb18cb 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1274,7 +1274,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
> {
> struct mem_cgroup_per_zone *mz;
> struct mem_cgroup *memcg;
> - struct page_cgroup *pc;
> struct lruvec *lruvec;
>
> if (mem_cgroup_disabled()) {
> @@ -1282,8 +1281,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
> goto out;
> }
>
> - pc = lookup_page_cgroup(page);
> - memcg = pc->mem_cgroup;
> + memcg = page->mem_cgroup;
> /*
> * Swapcache readahead pages are added to the LRU - and
> * possibly migrated - before they are charged.
> @@ -2020,16 +2018,13 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
> unsigned long *flags)
> {
> struct mem_cgroup *memcg;
> - struct page_cgroup *pc;
>
> rcu_read_lock();
>
> if (mem_cgroup_disabled())
> return NULL;
> -
> - pc = lookup_page_cgroup(page);
> again:
> - memcg = pc->mem_cgroup;
> + memcg = page->mem_cgroup;
> if (unlikely(!memcg))
> return NULL;
>
> @@ -2038,7 +2033,7 @@ again:
> return memcg;
>
> spin_lock_irqsave(&memcg->move_lock, *flags);
> - if (memcg != pc->mem_cgroup) {
> + if (memcg != page->mem_cgroup) {
> spin_unlock_irqrestore(&memcg->move_lock, *flags);
> goto again;
> }
> @@ -2405,15 +2400,12 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
> struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
> {
> struct mem_cgroup *memcg;
> - struct page_cgroup *pc;
> unsigned short id;
> swp_entry_t ent;
>
> VM_BUG_ON_PAGE(!PageLocked(page), page);
>
> - pc = lookup_page_cgroup(page);
> - memcg = pc->mem_cgroup;
> -
> + memcg = page->mem_cgroup;
> if (memcg) {
> if (!css_tryget_online(&memcg->css))
> memcg = NULL;
> @@ -2463,10 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated)
> static void commit_charge(struct page *page, struct mem_cgroup *memcg,
> bool lrucare)
> {
> - struct page_cgroup *pc = lookup_page_cgroup(page);
> int isolated;
>
> - VM_BUG_ON_PAGE(pc->mem_cgroup, page);
> + VM_BUG_ON_PAGE(page->mem_cgroup, page);
>
> /*
> * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
> @@ -2477,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
>
> /*
> * Nobody should be changing or seriously looking at
> - * pc->mem_cgroup at this point:
> + * page->mem_cgroup at this point:
> *
> * - the page is uncharged
> *
> @@ -2489,7 +2480,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
> * - a page cache insertion, a swapin fault, or a migration
> * have the page locked
> */
> - pc->mem_cgroup = memcg;
> + page->mem_cgroup = memcg;
>
> if (lrucare)
> unlock_page_lru(page, isolated);
> @@ -2972,8 +2963,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
> void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
> int order)
> {
> - struct page_cgroup *pc;
> -
> VM_BUG_ON(mem_cgroup_is_root(memcg));
>
> /* The page allocation failed. Revert */
> @@ -2981,14 +2970,12 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
> memcg_uncharge_kmem(memcg, 1 << order);
> return;
> }
> - pc = lookup_page_cgroup(page);
> - pc->mem_cgroup = memcg;
> + page->mem_cgroup = memcg;
> }
>
> void __memcg_kmem_uncharge_pages(struct page *page, int order)
> {
> - struct page_cgroup *pc = lookup_page_cgroup(page);
> - struct mem_cgroup *memcg = pc->mem_cgroup;
> + struct mem_cgroup *memcg = page->mem_cgroup;
>
> if (!memcg)
> return;
> @@ -2996,7 +2983,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
> VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
>
> memcg_uncharge_kmem(memcg, 1 << order);
> - pc->mem_cgroup = NULL;
> + page->mem_cgroup = NULL;
> }
> #else
> static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
> @@ -3014,16 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
> */
> void mem_cgroup_split_huge_fixup(struct page *head)
> {
> - struct page_cgroup *pc = lookup_page_cgroup(head);
> int i;
>
> if (mem_cgroup_disabled())
> return;
>
> for (i = 1; i < HPAGE_PMD_NR; i++)
> - pc[i].mem_cgroup = pc[0].mem_cgroup;
> + head[i].mem_cgroup = head->mem_cgroup;
>
> - __this_cpu_sub(pc[0].mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> + __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
> HPAGE_PMD_NR);
> }
> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> @@ -3032,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
> * mem_cgroup_move_account - move account of the page
> * @page: the page
> * @nr_pages: number of regular pages (>1 for huge pages)
> - * @pc: page_cgroup of the page.
> * @from: mem_cgroup which the page is moved from.
> * @to: mem_cgroup which the page is moved to. @from != @to.
> *
> @@ -3045,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
> */
> static int mem_cgroup_move_account(struct page *page,
> unsigned int nr_pages,
> - struct page_cgroup *pc,
> struct mem_cgroup *from,
> struct mem_cgroup *to)
> {
> @@ -3065,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page,
> goto out;
>
> /*
> - * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
> + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
> * of its source page while we change it: page migration takes
> * both pages off the LRU, but page cache replacement doesn't.
> */
> @@ -3073,7 +3057,7 @@ static int mem_cgroup_move_account(struct page *page,
> goto out;
>
> ret = -EINVAL;
> - if (pc->mem_cgroup != from)
> + if (page->mem_cgroup != from)
> goto out_unlock;
>
> spin_lock_irqsave(&from->move_lock, flags);
> @@ -3093,13 +3077,13 @@ static int mem_cgroup_move_account(struct page *page,
> }
>
> /*
> - * It is safe to change pc->mem_cgroup here because the page
> + * It is safe to change page->mem_cgroup here because the page
> * is referenced, charged, and isolated - we can't race with
> * uncharging, charging, migration, or LRU putback.
> */
>
> /* caller should have done css_get */
> - pc->mem_cgroup = to;
> + page->mem_cgroup = to;
> spin_unlock_irqrestore(&from->move_lock, flags);
>
> ret = 0;
> @@ -3174,36 +3158,17 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
> #endif
>
> #ifdef CONFIG_DEBUG_VM
> -static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
> -{
> - struct page_cgroup *pc;
> -
> - pc = lookup_page_cgroup(page);
> - /*
> - * Can be NULL while feeding pages into the page allocator for
> - * the first time, i.e. during boot or memory hotplug;
> - * or when mem_cgroup_disabled().
> - */
> - if (likely(pc) && pc->mem_cgroup)
> - return pc;
> - return NULL;
> -}
> -
> bool mem_cgroup_bad_page_check(struct page *page)
> {
> if (mem_cgroup_disabled())
> return false;
>
> - return lookup_page_cgroup_used(page) != NULL;
> + return page->mem_cgroup != NULL;
> }
>
> void mem_cgroup_print_bad_page(struct page *page)
> {
> - struct page_cgroup *pc;
> -
> - pc = lookup_page_cgroup_used(page);
> - if (pc)
> - pr_alert("pc:%p pc->mem_cgroup:%p\n", pc, pc->mem_cgroup);
> + pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
> }
> #endif
>
> @@ -5123,7 +5088,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
> unsigned long addr, pte_t ptent, union mc_target *target)
> {
> struct page *page = NULL;
> - struct page_cgroup *pc;
> enum mc_target_type ret = MC_TARGET_NONE;
> swp_entry_t ent = { .val = 0 };
>
> @@ -5137,13 +5101,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
> if (!page && !ent.val)
> return ret;
> if (page) {
> - pc = lookup_page_cgroup(page);
> /*
> * Do only loose check w/o serialization.
> - * mem_cgroup_move_account() checks the pc is valid or
> + * mem_cgroup_move_account() checks the page is valid or
> * not under LRU exclusion.
> */
> - if (pc->mem_cgroup == mc.from) {
> + if (page->mem_cgroup == mc.from) {
> ret = MC_TARGET_PAGE;
> if (target)
> target->page = page;
> @@ -5171,15 +5134,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
> unsigned long addr, pmd_t pmd, union mc_target *target)
> {
> struct page *page = NULL;
> - struct page_cgroup *pc;
> enum mc_target_type ret = MC_TARGET_NONE;
>
> page = pmd_page(pmd);
> VM_BUG_ON_PAGE(!page || !PageHead(page), page);
> if (!move_anon())
> return ret;
> - pc = lookup_page_cgroup(page);
> - if (pc->mem_cgroup == mc.from) {
> + if (page->mem_cgroup == mc.from) {
> ret = MC_TARGET_PAGE;
> if (target) {
> get_page(page);
> @@ -5378,7 +5339,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
> enum mc_target_type target_type;
> union mc_target target;
> struct page *page;
> - struct page_cgroup *pc;
>
> /*
> * We don't take compound_lock() here but no race with splitting thp
> @@ -5399,9 +5359,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
> if (target_type == MC_TARGET_PAGE) {
> page = target.page;
> if (!isolate_lru_page(page)) {
> - pc = lookup_page_cgroup(page);
> if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
> - pc, mc.from, mc.to)) {
> + mc.from, mc.to)) {
> mc.precharge -= HPAGE_PMD_NR;
> mc.moved_charge += HPAGE_PMD_NR;
> }
> @@ -5429,9 +5388,7 @@ retry:
> page = target.page;
> if (isolate_lru_page(page))
> goto put;
> - pc = lookup_page_cgroup(page);
> - if (!mem_cgroup_move_account(page, 1, pc,
> - mc.from, mc.to)) {
> + if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
> mc.precharge--;
> /* we uncharge from mc.from later. */
> mc.moved_charge++;
> @@ -5623,7 +5580,6 @@ static void __init enable_swap_cgroup(void)
> void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
> {
> struct mem_cgroup *memcg;
> - struct page_cgroup *pc;
> unsigned short oldid;
>
> VM_BUG_ON_PAGE(PageLRU(page), page);
> @@ -5632,8 +5588,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
> if (!do_swap_account)
> return;
>
> - pc = lookup_page_cgroup(page);
> - memcg = pc->mem_cgroup;
> + memcg = page->mem_cgroup;
>
> /* Readahead page, never charged */
> if (!memcg)
> @@ -5643,7 +5598,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
> VM_BUG_ON_PAGE(oldid, page);
> mem_cgroup_swap_statistics(memcg, true);
>
> - pc->mem_cgroup = NULL;
> + page->mem_cgroup = NULL;
>
> if (!mem_cgroup_is_root(memcg))
> page_counter_uncharge(&memcg->memory, 1);
> @@ -5710,7 +5665,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> goto out;
>
> if (PageSwapCache(page)) {
> - struct page_cgroup *pc = lookup_page_cgroup(page);
> /*
> * Every swap fault against a single page tries to charge the
> * page, bail as early as possible. shmem_unuse() encounters
> @@ -5718,7 +5672,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> * the page lock, which serializes swap cache removal, which
> * in turn serializes uncharging.
> */
> - if (pc->mem_cgroup)
> + if (page->mem_cgroup)
> goto out;
> }
>
> @@ -5871,7 +5825,6 @@ static void uncharge_list(struct list_head *page_list)
> next = page_list->next;
> do {
> unsigned int nr_pages = 1;
> - struct page_cgroup *pc;
>
> page = list_entry(next, struct page, lru);
> next = page->lru.next;
> @@ -5879,23 +5832,22 @@ static void uncharge_list(struct list_head *page_list)
> VM_BUG_ON_PAGE(PageLRU(page), page);
> VM_BUG_ON_PAGE(page_count(page), page);
>
> - pc = lookup_page_cgroup(page);
> - if (!pc->mem_cgroup)
> + if (!page->mem_cgroup)
> continue;
>
> /*
> * Nobody should be changing or seriously looking at
> - * pc->mem_cgroup at this point, we have fully
> + * page->mem_cgroup at this point, we have fully
> * exclusive access to the page.
> */
>
> - if (memcg != pc->mem_cgroup) {
> + if (memcg != page->mem_cgroup) {
> if (memcg) {
> uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
> nr_huge, page);
> pgpgout = nr_anon = nr_file = nr_huge = 0;
> }
> - memcg = pc->mem_cgroup;
> + memcg = page->mem_cgroup;
> }
>
> if (PageTransHuge(page)) {
> @@ -5909,7 +5861,7 @@ static void uncharge_list(struct list_head *page_list)
> else
> nr_file += nr_pages;
>
> - pc->mem_cgroup = NULL;
> + page->mem_cgroup = NULL;
>
> pgpgout++;
> } while (next != page_list);
> @@ -5928,14 +5880,11 @@ static void uncharge_list(struct list_head *page_list)
> */
> void mem_cgroup_uncharge(struct page *page)
> {
> - struct page_cgroup *pc;
> -
> if (mem_cgroup_disabled())
> return;
>
> /* Don't touch page->lru of any random page, pre-check: */
> - pc = lookup_page_cgroup(page);
> - if (!pc->mem_cgroup)
> + if (!page->mem_cgroup)
> return;
>
> INIT_LIST_HEAD(&page->lru);
> @@ -5972,7 +5921,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
> bool lrucare)
> {
> struct mem_cgroup *memcg;
> - struct page_cgroup *pc;
> int isolated;
>
> VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
> @@ -5987,8 +5935,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
> return;
>
> /* Page cache replacement: new page already charged? */
> - pc = lookup_page_cgroup(newpage);
> - if (pc->mem_cgroup)
> + if (newpage->mem_cgroup)
> return;
>
> /*
> @@ -5997,15 +5944,14 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
> * uncharged page when the PFN walker finds a page that
> * reclaim just put back on the LRU but has not released yet.
> */
> - pc = lookup_page_cgroup(oldpage);
> - memcg = pc->mem_cgroup;
> + memcg = oldpage->mem_cgroup;
> if (!memcg)
> return;
>
> if (lrucare)
> lock_page_lru(oldpage, &isolated);
>
> - pc->mem_cgroup = NULL;
> + oldpage->mem_cgroup = NULL;
>
> if (lrucare)
> unlock_page_lru(oldpage, isolated);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index fa9426389c6c..6a952237a677 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -48,7 +48,6 @@
> #include <linux/backing-dev.h>
> #include <linux/fault-inject.h>
> #include <linux/page-isolation.h>
> -#include <linux/page_cgroup.h>
> #include <linux/debugobjects.h>
> #include <linux/kmemleak.h>
> #include <linux/compaction.h>
> @@ -4899,7 +4898,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> #endif
> init_waitqueue_head(&pgdat->kswapd_wait);
> init_waitqueue_head(&pgdat->pfmemalloc_wait);
> - pgdat_page_cgroup_init(pgdat);
>
> for (j = 0; j < MAX_NR_ZONES; j++) {
> struct zone *zone = pgdat->node_zones + j;
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 5331c2bd85a2..f0f31c1d4d0c 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -1,326 +1,7 @@
> #include <linux/mm.h>
> -#include <linux/mmzone.h>
> -#include <linux/bootmem.h>
> -#include <linux/bit_spinlock.h>
> #include <linux/page_cgroup.h>
> -#include <linux/hash.h>
> -#include <linux/slab.h>
> -#include <linux/memory.h>
> #include <linux/vmalloc.h>
> -#include <linux/cgroup.h>
> #include <linux/swapops.h>
> -#include <linux/kmemleak.h>
> -
> -static unsigned long total_usage;
> -
> -#if !defined(CONFIG_SPARSEMEM)
> -
> -
> -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> -{
> - pgdat->node_page_cgroup = NULL;
> -}
> -
> -struct page_cgroup *lookup_page_cgroup(struct page *page)
> -{
> - unsigned long pfn = page_to_pfn(page);
> - unsigned long offset;
> - struct page_cgroup *base;
> -
> - base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
> -#ifdef CONFIG_DEBUG_VM
> - /*
> - * The sanity checks the page allocator does upon freeing a
> - * page can reach here before the page_cgroup arrays are
> - * allocated when feeding a range of pages to the allocator
> - * for the first time during bootup or memory hotplug.
> - */
> - if (unlikely(!base))
> - return NULL;
> -#endif
> - offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
> - return base + offset;
> -}
> -
> -static int __init alloc_node_page_cgroup(int nid)
> -{
> - struct page_cgroup *base;
> - unsigned long table_size;
> - unsigned long nr_pages;
> -
> - nr_pages = NODE_DATA(nid)->node_spanned_pages;
> - if (!nr_pages)
> - return 0;
> -
> - table_size = sizeof(struct page_cgroup) * nr_pages;
> -
> - base = memblock_virt_alloc_try_nid_nopanic(
> - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
> - BOOTMEM_ALLOC_ACCESSIBLE, nid);
> - if (!base)
> - return -ENOMEM;
> - NODE_DATA(nid)->node_page_cgroup = base;
> - total_usage += table_size;
> - return 0;
> -}
> -
> -void __init page_cgroup_init_flatmem(void)
> -{
> -
> - int nid, fail;
> -
> - if (mem_cgroup_disabled())
> - return;
> -
> - for_each_online_node(nid) {
> - fail = alloc_node_page_cgroup(nid);
> - if (fail)
> - goto fail;
> - }
> - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
> - " don't want memory cgroups\n");
> - return;
> -fail:
> - printk(KERN_CRIT "allocation of page_cgroup failed.\n");
> - printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
> - panic("Out of memory");
> -}
> -
> -#else /* CONFIG_FLAT_NODE_MEM_MAP */
> -
> -struct page_cgroup *lookup_page_cgroup(struct page *page)
> -{
> - unsigned long pfn = page_to_pfn(page);
> - struct mem_section *section = __pfn_to_section(pfn);
> -#ifdef CONFIG_DEBUG_VM
> - /*
> - * The sanity checks the page allocator does upon freeing a
> - * page can reach here before the page_cgroup arrays are
> - * allocated when feeding a range of pages to the allocator
> - * for the first time during bootup or memory hotplug.
> - */
> - if (!section->page_cgroup)
> - return NULL;
> -#endif
> - return section->page_cgroup + pfn;
> -}
> -
> -static void *__meminit alloc_page_cgroup(size_t size, int nid)
> -{
> - gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
> - void *addr = NULL;
> -
> - addr = alloc_pages_exact_nid(nid, size, flags);
> - if (addr) {
> - kmemleak_alloc(addr, size, 1, flags);
> - return addr;
> - }
> -
> - if (node_state(nid, N_HIGH_MEMORY))
> - addr = vzalloc_node(size, nid);
> - else
> - addr = vzalloc(size);
> -
> - return addr;
> -}
> -
> -static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
> -{
> - struct mem_section *section;
> - struct page_cgroup *base;
> - unsigned long table_size;
> -
> - section = __pfn_to_section(pfn);
> -
> - if (section->page_cgroup)
> - return 0;
> -
> - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> - base = alloc_page_cgroup(table_size, nid);
> -
> - /*
> - * The value stored in section->page_cgroup is (base - pfn)
> - * and it does not point to the memory block allocated above,
> - * causing kmemleak false positives.
> - */
> - kmemleak_not_leak(base);
> -
> - if (!base) {
> - printk(KERN_ERR "page cgroup allocation failure\n");
> - return -ENOMEM;
> - }
> -
> - /*
> - * The passed "pfn" may not be aligned to SECTION. For the calculation
> - * we need to apply a mask.
> - */
> - pfn &= PAGE_SECTION_MASK;
> - section->page_cgroup = base - pfn;
> - total_usage += table_size;
> - return 0;
> -}
> -#ifdef CONFIG_MEMORY_HOTPLUG
> -static void free_page_cgroup(void *addr)
> -{
> - if (is_vmalloc_addr(addr)) {
> - vfree(addr);
> - } else {
> - struct page *page = virt_to_page(addr);
> - size_t table_size =
> - sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> -
> - BUG_ON(PageReserved(page));
> - kmemleak_free(addr);
> - free_pages_exact(addr, table_size);
> - }
> -}
> -
> -static void __free_page_cgroup(unsigned long pfn)
> -{
> - struct mem_section *ms;
> - struct page_cgroup *base;
> -
> - ms = __pfn_to_section(pfn);
> - if (!ms || !ms->page_cgroup)
> - return;
> - base = ms->page_cgroup + pfn;
> - free_page_cgroup(base);
> - ms->page_cgroup = NULL;
> -}
> -
> -static int __meminit online_page_cgroup(unsigned long start_pfn,
> - unsigned long nr_pages,
> - int nid)
> -{
> - unsigned long start, end, pfn;
> - int fail = 0;
> -
> - start = SECTION_ALIGN_DOWN(start_pfn);
> - end = SECTION_ALIGN_UP(start_pfn + nr_pages);
> -
> - if (nid == -1) {
> - /*
> - * In this case, "nid" already exists and contains valid memory.
> - * "start_pfn" passed to us is a pfn which is an arg for
> - * online__pages(), and start_pfn should exist.
> - */
> - nid = pfn_to_nid(start_pfn);
> - VM_BUG_ON(!node_state(nid, N_ONLINE));
> - }
> -
> - for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
> - if (!pfn_present(pfn))
> - continue;
> - fail = init_section_page_cgroup(pfn, nid);
> - }
> - if (!fail)
> - return 0;
> -
> - /* rollback */
> - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
> - __free_page_cgroup(pfn);
> -
> - return -ENOMEM;
> -}
> -
> -static int __meminit offline_page_cgroup(unsigned long start_pfn,
> - unsigned long nr_pages, int nid)
> -{
> - unsigned long start, end, pfn;
> -
> - start = SECTION_ALIGN_DOWN(start_pfn);
> - end = SECTION_ALIGN_UP(start_pfn + nr_pages);
> -
> - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
> - __free_page_cgroup(pfn);
> - return 0;
> -
> -}
> -
> -static int __meminit page_cgroup_callback(struct notifier_block *self,
> - unsigned long action, void *arg)
> -{
> - struct memory_notify *mn = arg;
> - int ret = 0;
> - switch (action) {
> - case MEM_GOING_ONLINE:
> - ret = online_page_cgroup(mn->start_pfn,
> - mn->nr_pages, mn->status_change_nid);
> - break;
> - case MEM_OFFLINE:
> - offline_page_cgroup(mn->start_pfn,
> - mn->nr_pages, mn->status_change_nid);
> - break;
> - case MEM_CANCEL_ONLINE:
> - offline_page_cgroup(mn->start_pfn,
> - mn->nr_pages, mn->status_change_nid);
> - break;
> - case MEM_GOING_OFFLINE:
> - break;
> - case MEM_ONLINE:
> - case MEM_CANCEL_OFFLINE:
> - break;
> - }
> -
> - return notifier_from_errno(ret);
> -}
> -
> -#endif
> -
> -void __init page_cgroup_init(void)
> -{
> - unsigned long pfn;
> - int nid;
> -
> - if (mem_cgroup_disabled())
> - return;
> -
> - for_each_node_state(nid, N_MEMORY) {
> - unsigned long start_pfn, end_pfn;
> -
> - start_pfn = node_start_pfn(nid);
> - end_pfn = node_end_pfn(nid);
> - /*
> - * start_pfn and end_pfn may not be aligned to SECTION and the
> - * page->flags of out of node pages are not initialized. So we
> - * scan [start_pfn, the biggest section's pfn < end_pfn) here.
> - */
> - for (pfn = start_pfn;
> - pfn < end_pfn;
> - pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
> -
> - if (!pfn_valid(pfn))
> - continue;
> - /*
> - * Nodes's pfns can be overlapping.
> - * We know some arch can have a nodes layout such as
> - * -------------pfn-------------->
> - * N0 | N1 | N2 | N0 | N1 | N2|....
> - */
> - if (pfn_to_nid(pfn) != nid)
> - continue;
> - if (init_section_page_cgroup(pfn, nid))
> - goto oom;
> - }
> - }
> - hotplug_memory_notifier(page_cgroup_callback, 0);
> - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
> - "don't want memory cgroups\n");
> - return;
> -oom:
> - printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
> - panic("Out of memory");
> -}
> -
> -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> -{
> - return;
> -}
> -
> -#endif
> -
>
> #ifdef CONFIG_MEMCG_SWAP
>
> --
> 2.1.3
>
--
Michal Hocko
SUSE Labs
On Sat 01-11-14 23:15:55, Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, the only code remaining in there is swap slot accounting.
>
> Rename it and move the conditional compilation into mm/Makefile.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Michal Hocko <[email protected]>
> ---
> MAINTAINERS | 2 +-
> include/linux/page_cgroup.h | 40 ---------
> include/linux/swap_cgroup.h | 42 +++++++++
> mm/Makefile | 3 +-
> mm/memcontrol.c | 2 +-
> mm/page_cgroup.c | 211 --------------------------------------------
> mm/swap_cgroup.c | 208 +++++++++++++++++++++++++++++++++++++++++++
> mm/swap_state.c | 1 -
> mm/swapfile.c | 2 +-
> 9 files changed, 255 insertions(+), 256 deletions(-)
> delete mode 100644 include/linux/page_cgroup.h
> create mode 100644 include/linux/swap_cgroup.h
> delete mode 100644 mm/page_cgroup.c
> create mode 100644 mm/swap_cgroup.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 7e31be07197e..3a60389d3a13 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2583,7 +2583,7 @@ L: [email protected]
> L: [email protected]
> S: Maintained
> F: mm/memcontrol.c
> -F: mm/page_cgroup.c
> +F: mm/swap_cgroup.c
>
> CORETEMP HARDWARE MONITORING DRIVER
> M: Fenghua Yu <[email protected]>
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> deleted file mode 100644
> index 65be35785c86..000000000000
> --- a/include/linux/page_cgroup.h
> +++ /dev/null
> @@ -1,40 +0,0 @@
> -#ifndef __LINUX_PAGE_CGROUP_H
> -#define __LINUX_PAGE_CGROUP_H
> -
> -#include <linux/swap.h>
> -
> -#ifdef CONFIG_MEMCG_SWAP
> -extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> - unsigned short old, unsigned short new);
> -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
> -extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
> -extern int swap_cgroup_swapon(int type, unsigned long max_pages);
> -extern void swap_cgroup_swapoff(int type);
> -#else
> -
> -static inline
> -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
> -{
> - return 0;
> -}
> -
> -static inline
> -unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> -{
> - return 0;
> -}
> -
> -static inline int
> -swap_cgroup_swapon(int type, unsigned long max_pages)
> -{
> - return 0;
> -}
> -
> -static inline void swap_cgroup_swapoff(int type)
> -{
> - return;
> -}
> -
> -#endif /* CONFIG_MEMCG_SWAP */
> -
> -#endif /* __LINUX_PAGE_CGROUP_H */
> diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
> new file mode 100644
> index 000000000000..145306bdc92f
> --- /dev/null
> +++ b/include/linux/swap_cgroup.h
> @@ -0,0 +1,42 @@
> +#ifndef __LINUX_SWAP_CGROUP_H
> +#define __LINUX_SWAP_CGROUP_H
> +
> +#include <linux/swap.h>
> +
> +#ifdef CONFIG_MEMCG_SWAP
> +
> +extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> + unsigned short old, unsigned short new);
> +extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
> +extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
> +extern int swap_cgroup_swapon(int type, unsigned long max_pages);
> +extern void swap_cgroup_swapoff(int type);
> +
> +#else
> +
> +static inline
> +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
> +{
> + return 0;
> +}
> +
> +static inline
> +unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> +{
> + return 0;
> +}
> +
> +static inline int
> +swap_cgroup_swapon(int type, unsigned long max_pages)
> +{
> + return 0;
> +}
> +
> +static inline void swap_cgroup_swapoff(int type)
> +{
> + return;
> +}
> +
> +#endif /* CONFIG_MEMCG_SWAP */
> +
> +#endif /* __LINUX_SWAP_CGROUP_H */
> diff --git a/mm/Makefile b/mm/Makefile
> index 27ddb80403a9..d9d579484f15 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -56,7 +56,8 @@ obj-$(CONFIG_MIGRATION) += migrate.o
> obj-$(CONFIG_QUICKLIST) += quicklist.o
> obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
> obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
> -obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
> +obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
> +obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
> obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
> obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
> obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index dc5e0abb18cb..fbb41a170eae 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -51,7 +51,7 @@
> #include <linux/seq_file.h>
> #include <linux/vmpressure.h>
> #include <linux/mm_inline.h>
> -#include <linux/page_cgroup.h>
> +#include <linux/swap_cgroup.h>
> #include <linux/cpu.h>
> #include <linux/oom.h>
> #include <linux/lockdep.h>
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> deleted file mode 100644
> index f0f31c1d4d0c..000000000000
> --- a/mm/page_cgroup.c
> +++ /dev/null
> @@ -1,211 +0,0 @@
> -#include <linux/mm.h>
> -#include <linux/page_cgroup.h>
> -#include <linux/vmalloc.h>
> -#include <linux/swapops.h>
> -
> -#ifdef CONFIG_MEMCG_SWAP
> -
> -static DEFINE_MUTEX(swap_cgroup_mutex);
> -struct swap_cgroup_ctrl {
> - struct page **map;
> - unsigned long length;
> - spinlock_t lock;
> -};
> -
> -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
> -
> -struct swap_cgroup {
> - unsigned short id;
> -};
> -#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
> -
> -/*
> - * SwapCgroup implements "lookup" and "exchange" operations.
> - * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
> - * against SwapCache. At swap_free(), this is accessed directly from swap.
> - *
> - * This means,
> - * - we have no race in "exchange" when we're accessed via SwapCache because
> - * SwapCache(and its swp_entry) is under lock.
> - * - When called via swap_free(), there is no user of this entry and no race.
> - * Then, we don't need lock around "exchange".
> - *
> - * TODO: we can push these buffers out to HIGHMEM.
> - */
> -
> -/*
> - * allocate buffer for swap_cgroup.
> - */
> -static int swap_cgroup_prepare(int type)
> -{
> - struct page *page;
> - struct swap_cgroup_ctrl *ctrl;
> - unsigned long idx, max;
> -
> - ctrl = &swap_cgroup_ctrl[type];
> -
> - for (idx = 0; idx < ctrl->length; idx++) {
> - page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> - if (!page)
> - goto not_enough_page;
> - ctrl->map[idx] = page;
> - }
> - return 0;
> -not_enough_page:
> - max = idx;
> - for (idx = 0; idx < max; idx++)
> - __free_page(ctrl->map[idx]);
> -
> - return -ENOMEM;
> -}
> -
> -static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
> - struct swap_cgroup_ctrl **ctrlp)
> -{
> - pgoff_t offset = swp_offset(ent);
> - struct swap_cgroup_ctrl *ctrl;
> - struct page *mappage;
> - struct swap_cgroup *sc;
> -
> - ctrl = &swap_cgroup_ctrl[swp_type(ent)];
> - if (ctrlp)
> - *ctrlp = ctrl;
> -
> - mappage = ctrl->map[offset / SC_PER_PAGE];
> - sc = page_address(mappage);
> - return sc + offset % SC_PER_PAGE;
> -}
> -
> -/**
> - * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
> - * @ent: swap entry to be cmpxchged
> - * @old: old id
> - * @new: new id
> - *
> - * Returns old id at success, 0 at failure.
> - * (There is no mem_cgroup using 0 as its id)
> - */
> -unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> - unsigned short old, unsigned short new)
> -{
> - struct swap_cgroup_ctrl *ctrl;
> - struct swap_cgroup *sc;
> - unsigned long flags;
> - unsigned short retval;
> -
> - sc = lookup_swap_cgroup(ent, &ctrl);
> -
> - spin_lock_irqsave(&ctrl->lock, flags);
> - retval = sc->id;
> - if (retval == old)
> - sc->id = new;
> - else
> - retval = 0;
> - spin_unlock_irqrestore(&ctrl->lock, flags);
> - return retval;
> -}
> -
> -/**
> - * swap_cgroup_record - record mem_cgroup for this swp_entry.
> - * @ent: swap entry to be recorded into
> - * @id: mem_cgroup to be recorded
> - *
> - * Returns old value at success, 0 at failure.
> - * (Of course, old value can be 0.)
> - */
> -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
> -{
> - struct swap_cgroup_ctrl *ctrl;
> - struct swap_cgroup *sc;
> - unsigned short old;
> - unsigned long flags;
> -
> - sc = lookup_swap_cgroup(ent, &ctrl);
> -
> - spin_lock_irqsave(&ctrl->lock, flags);
> - old = sc->id;
> - sc->id = id;
> - spin_unlock_irqrestore(&ctrl->lock, flags);
> -
> - return old;
> -}
> -
> -/**
> - * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
> - * @ent: swap entry to be looked up.
> - *
> - * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
> - */
> -unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> -{
> - return lookup_swap_cgroup(ent, NULL)->id;
> -}
> -
> -int swap_cgroup_swapon(int type, unsigned long max_pages)
> -{
> - void *array;
> - unsigned long array_size;
> - unsigned long length;
> - struct swap_cgroup_ctrl *ctrl;
> -
> - if (!do_swap_account)
> - return 0;
> -
> - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
> - array_size = length * sizeof(void *);
> -
> - array = vzalloc(array_size);
> - if (!array)
> - goto nomem;
> -
> - ctrl = &swap_cgroup_ctrl[type];
> - mutex_lock(&swap_cgroup_mutex);
> - ctrl->length = length;
> - ctrl->map = array;
> - spin_lock_init(&ctrl->lock);
> - if (swap_cgroup_prepare(type)) {
> - /* memory shortage */
> - ctrl->map = NULL;
> - ctrl->length = 0;
> - mutex_unlock(&swap_cgroup_mutex);
> - vfree(array);
> - goto nomem;
> - }
> - mutex_unlock(&swap_cgroup_mutex);
> -
> - return 0;
> -nomem:
> - printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
> - printk(KERN_INFO
> - "swap_cgroup can be disabled by swapaccount=0 boot option\n");
> - return -ENOMEM;
> -}
> -
> -void swap_cgroup_swapoff(int type)
> -{
> - struct page **map;
> - unsigned long i, length;
> - struct swap_cgroup_ctrl *ctrl;
> -
> - if (!do_swap_account)
> - return;
> -
> - mutex_lock(&swap_cgroup_mutex);
> - ctrl = &swap_cgroup_ctrl[type];
> - map = ctrl->map;
> - length = ctrl->length;
> - ctrl->map = NULL;
> - ctrl->length = 0;
> - mutex_unlock(&swap_cgroup_mutex);
> -
> - if (map) {
> - for (i = 0; i < length; i++) {
> - struct page *page = map[i];
> - if (page)
> - __free_page(page);
> - }
> - vfree(map);
> - }
> -}
> -
> -#endif
> diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
> new file mode 100644
> index 000000000000..b5f7f24b8dd1
> --- /dev/null
> +++ b/mm/swap_cgroup.c
> @@ -0,0 +1,208 @@
> +#include <linux/swap_cgroup.h>
> +#include <linux/vmalloc.h>
> +#include <linux/mm.h>
> +
> +#include <linux/swapops.h> /* depends on mm.h include */
> +
> +static DEFINE_MUTEX(swap_cgroup_mutex);
> +struct swap_cgroup_ctrl {
> + struct page **map;
> + unsigned long length;
> + spinlock_t lock;
> +};
> +
> +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
> +
> +struct swap_cgroup {
> + unsigned short id;
> +};
> +#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
> +
> +/*
> + * SwapCgroup implements "lookup" and "exchange" operations.
> + * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
> + * against SwapCache. At swap_free(), this is accessed directly from swap.
> + *
> + * This means,
> + * - we have no race in "exchange" when we're accessed via SwapCache because
> + * SwapCache(and its swp_entry) is under lock.
> + * - When called via swap_free(), there is no user of this entry and no race.
> + * Then, we don't need lock around "exchange".
> + *
> + * TODO: we can push these buffers out to HIGHMEM.
> + */
> +
> +/*
> + * allocate buffer for swap_cgroup.
> + */
> +static int swap_cgroup_prepare(int type)
> +{
> + struct page *page;
> + struct swap_cgroup_ctrl *ctrl;
> + unsigned long idx, max;
> +
> + ctrl = &swap_cgroup_ctrl[type];
> +
> + for (idx = 0; idx < ctrl->length; idx++) {
> + page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> + if (!page)
> + goto not_enough_page;
> + ctrl->map[idx] = page;
> + }
> + return 0;
> +not_enough_page:
> + max = idx;
> + for (idx = 0; idx < max; idx++)
> + __free_page(ctrl->map[idx]);
> +
> + return -ENOMEM;
> +}
> +
> +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
> + struct swap_cgroup_ctrl **ctrlp)
> +{
> + pgoff_t offset = swp_offset(ent);
> + struct swap_cgroup_ctrl *ctrl;
> + struct page *mappage;
> + struct swap_cgroup *sc;
> +
> + ctrl = &swap_cgroup_ctrl[swp_type(ent)];
> + if (ctrlp)
> + *ctrlp = ctrl;
> +
> + mappage = ctrl->map[offset / SC_PER_PAGE];
> + sc = page_address(mappage);
> + return sc + offset % SC_PER_PAGE;
> +}
> +
> +/**
> + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
> + * @ent: swap entry to be cmpxchged
> + * @old: old id
> + * @new: new id
> + *
> + * Returns old id at success, 0 at failure.
> + * (There is no mem_cgroup using 0 as its id)
> + */
> +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
> + unsigned short old, unsigned short new)
> +{
> + struct swap_cgroup_ctrl *ctrl;
> + struct swap_cgroup *sc;
> + unsigned long flags;
> + unsigned short retval;
> +
> + sc = lookup_swap_cgroup(ent, &ctrl);
> +
> + spin_lock_irqsave(&ctrl->lock, flags);
> + retval = sc->id;
> + if (retval == old)
> + sc->id = new;
> + else
> + retval = 0;
> + spin_unlock_irqrestore(&ctrl->lock, flags);
> + return retval;
> +}
> +
> +/**
> + * swap_cgroup_record - record mem_cgroup for this swp_entry.
> + * @ent: swap entry to be recorded into
> + * @id: mem_cgroup to be recorded
> + *
> + * Returns old value at success, 0 at failure.
> + * (Of course, old value can be 0.)
> + */
> +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
> +{
> + struct swap_cgroup_ctrl *ctrl;
> + struct swap_cgroup *sc;
> + unsigned short old;
> + unsigned long flags;
> +
> + sc = lookup_swap_cgroup(ent, &ctrl);
> +
> + spin_lock_irqsave(&ctrl->lock, flags);
> + old = sc->id;
> + sc->id = id;
> + spin_unlock_irqrestore(&ctrl->lock, flags);
> +
> + return old;
> +}
> +
> +/**
> + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
> + * @ent: swap entry to be looked up.
> + *
> + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
> + */
> +unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
> +{
> + return lookup_swap_cgroup(ent, NULL)->id;
> +}
> +
> +int swap_cgroup_swapon(int type, unsigned long max_pages)
> +{
> + void *array;
> + unsigned long array_size;
> + unsigned long length;
> + struct swap_cgroup_ctrl *ctrl;
> +
> + if (!do_swap_account)
> + return 0;
> +
> + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
> + array_size = length * sizeof(void *);
> +
> + array = vzalloc(array_size);
> + if (!array)
> + goto nomem;
> +
> + ctrl = &swap_cgroup_ctrl[type];
> + mutex_lock(&swap_cgroup_mutex);
> + ctrl->length = length;
> + ctrl->map = array;
> + spin_lock_init(&ctrl->lock);
> + if (swap_cgroup_prepare(type)) {
> + /* memory shortage */
> + ctrl->map = NULL;
> + ctrl->length = 0;
> + mutex_unlock(&swap_cgroup_mutex);
> + vfree(array);
> + goto nomem;
> + }
> + mutex_unlock(&swap_cgroup_mutex);
> +
> + return 0;
> +nomem:
> + printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
> + printk(KERN_INFO
> + "swap_cgroup can be disabled by swapaccount=0 boot option\n");
> + return -ENOMEM;
> +}
> +
> +void swap_cgroup_swapoff(int type)
> +{
> + struct page **map;
> + unsigned long i, length;
> + struct swap_cgroup_ctrl *ctrl;
> +
> + if (!do_swap_account)
> + return;
> +
> + mutex_lock(&swap_cgroup_mutex);
> + ctrl = &swap_cgroup_ctrl[type];
> + map = ctrl->map;
> + length = ctrl->length;
> + ctrl->map = NULL;
> + ctrl->length = 0;
> + mutex_unlock(&swap_cgroup_mutex);
> +
> + if (map) {
> + for (i = 0; i < length; i++) {
> + struct page *page = map[i];
> + if (page)
> + __free_page(page);
> + }
> + vfree(map);
> + }
> +}
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 154444918685..9711342987a0 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -17,7 +17,6 @@
> #include <linux/blkdev.h>
> #include <linux/pagevec.h>
> #include <linux/migrate.h>
> -#include <linux/page_cgroup.h>
>
> #include <asm/pgtable.h>
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 8798b2e0ac59..63f55ccb9b26 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -38,7 +38,7 @@
> #include <asm/pgtable.h>
> #include <asm/tlbflush.h>
> #include <linux/swapops.h>
> -#include <linux/page_cgroup.h>
> +#include <linux/swap_cgroup.h>
>
> static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
> unsigned char);
> --
> 2.1.3
>
--
Michal Hocko
SUSE Labs
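The renamed file keeps the same swap entry points that page_cgroup.c used
to export, as the swap_cgroup.h hunk above shows. A rough usage sketch
follows; the caller functions here are made up for illustration, only the
swap_cgroup_* calls and their semantics (swap_cgroup_record() returns the
previously recorded id, 0 means no owner) come from the code above:

#include <linux/swap_cgroup.h>

/* Hypothetical caller: tag a freshly used swap slot with a memcg id. */
static void example_record_owner(swp_entry_t entry, unsigned short memcg_id)
{
	unsigned short oldid;

	oldid = swap_cgroup_record(entry, memcg_id);
	WARN_ON(oldid);		/* a fresh slot should not carry an id yet */
}

/* Hypothetical caller: clear the slot on free and read back the old owner. */
static unsigned short example_clear_owner(swp_entry_t entry)
{
	return swap_cgroup_record(entry, 0);
}

/* Hypothetical caller: read-only peek at the owning id. */
static unsigned short example_peek_owner(swp_entry_t entry)
{
	return lookup_swap_cgroup_id(entry);
}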
On Sat 01-11-14 23:15:56, Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, let the generic bad_page() check for page->mem_cgroup sanity.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Michal Hocko <[email protected]>
> ---
> include/linux/memcontrol.h | 4 ----
> mm/debug.c | 5 ++++-
> mm/memcontrol.c | 15 ---------------
> mm/page_alloc.c | 12 ++++++++----
> 4 files changed, 12 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index dafba59b31b4..e789551d4db0 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -173,10 +173,6 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
> void mem_cgroup_split_huge_fixup(struct page *head);
> #endif
>
> -#ifdef CONFIG_DEBUG_VM
> -bool mem_cgroup_bad_page_check(struct page *page);
> -void mem_cgroup_print_bad_page(struct page *page);
> -#endif
> #else /* CONFIG_MEMCG */
> struct mem_cgroup;
>
> diff --git a/mm/debug.c b/mm/debug.c
> index 5ce45c9a29b5..0e58f3211f89 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason,
> dump_flags(page->flags & badflags,
> pageflag_names, ARRAY_SIZE(pageflag_names));
> }
> - mem_cgroup_print_bad_page(page);
> +#ifdef CONFIG_MEMCG
> + if (page->mem_cgroup)
> + pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
> +#endif
> }
>
> void dump_page(struct page *page, const char *reason)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index fbb41a170eae..3645641513a1 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3157,21 +3157,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
> }
> #endif
>
> -#ifdef CONFIG_DEBUG_VM
> -bool mem_cgroup_bad_page_check(struct page *page)
> -{
> - if (mem_cgroup_disabled())
> - return false;
> -
> - return page->mem_cgroup != NULL;
> -}
> -
> -void mem_cgroup_print_bad_page(struct page *page)
> -{
> - pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
> -}
> -#endif
> -
> static DEFINE_MUTEX(memcg_limit_mutex);
>
> static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 6a952237a677..161da09fcda2 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -653,8 +653,10 @@ static inline int free_pages_check(struct page *page)
> bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
> bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
> }
> - if (unlikely(mem_cgroup_bad_page_check(page)))
> - bad_reason = "cgroup check failed";
> +#ifdef CONFIG_MEMCG
> + if (unlikely(page->mem_cgroup))
> + bad_reason = "page still charged to cgroup";
> +#endif
> if (unlikely(bad_reason)) {
> bad_page(page, bad_reason, bad_flags);
> return 1;
> @@ -920,8 +922,10 @@ static inline int check_new_page(struct page *page)
> bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
> bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
> }
> - if (unlikely(mem_cgroup_bad_page_check(page)))
> - bad_reason = "cgroup check failed";
> +#ifdef CONFIG_MEMCG
> + if (unlikely(page->mem_cgroup))
> + bad_reason = "page still charged to cgroup";
> +#endif
> if (unlikely(bad_reason)) {
> bad_page(page, bad_reason, bad_flags);
> return 1;
> --
> 2.1.3
>
--
Michal Hocko
SUSE Labs
On Mon 03-11-14 10:09:42, Johannes Weiner wrote:
> Hi Joonsoo,
>
> On Mon, Nov 03, 2014 at 05:02:08PM +0900, Joonsoo Kim wrote:
> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > > Memory cgroups used to have 5 per-page pointers. To allow users to
> > > disable that amount of overhead during runtime, those pointers were
> > > allocated in a separate array, with a translation layer between them
> > > and struct page.
> >
> > Hello, Johannes.
> >
> > I'd like to keep this translation layer.
> > Could you just disable that code with #ifdef until the next user comes along?
> >
> > In our company, we use PAGE_OWNER from the mm tree, the feature that
> > records which code path allocated each page. Using PAGE_OWNER requires
> > modifying struct page and recompiling the kernel, and that makes the
> > feature difficult for us to deploy. So we decided to implement a
> > run-time configurable PAGE_OWNER on top of page_cgroup's translation
> > layer code. Moreover, with this infrastructure, I plan to implement
> > some other debugging features.
> >
> > Because of my laziness, it hasn't been submitted to LKML yet, but I
> > will submit it as soon as possible. If the code is removed, I could
> > copy-and-paste it back, but that would lose the history on that code.
> > So if possible, I'd like to keep that code for now.
>
> Please re-introduce this code when your new usecase is ready to be
> upstreamed. There is little reason to burden an unrelated feature
> with a sizable chunk of dead code for a vague future user.
Completely agreed! I would hate to have some random feature piggyback
on memcg internal structures. For that to be acceptable, page_cgroup
would have to be abstracted into a more generic structure, which
wouldn't be under CONFIG_MEMCG anyway.
--
Michal Hocko
SUSE Labs
Documentation/cgroups/memory.txt is outdated even more hopelessly than
before. It deserves a complete rewrite, but I guess something like the
following should be added in the meantime to warn potential readers
about the trap.
---
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 67613ff0270c..46b2b5080317 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -1,5 +1,10 @@
Memory Resource Controller
+NOTE: This document is hopelessly outdated and it asks for a complete
+ rewrite. It still contains useful information, so we are keeping it
+ here, but make sure to check the current code if you need a deeper
+ understanding.
+
NOTE: The Memory Resource Controller has generically been referred to as the
memory controller in this document. Do not confuse memory controller
used here with the memory controller that is used in hardware.
--
Michal Hocko
SUSE Labs
On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers. To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
>
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged. The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory. With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space. Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.
>
> text data bss dec hex filename
> 8828345 1725264 983040 11536649 b00909 vmlinux.old
> 8827425 1725264 966656 11519345 afc571 vmlinux.new
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Vladimir Davydov <[email protected]>
On Sat, Nov 01, 2014 at 11:15:55PM -0400, Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, the only code remaining in there is swap slot accounting.
>
> Rename it and move the conditional compilation into mm/Makefile.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Vladimir Davydov <[email protected]>
On Sat, Nov 01, 2014 at 11:15:56PM -0400, Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, let the generic bad_page() check for page->mem_cgroup sanity.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Vladimir Davydov <[email protected]>
Remove unneeded !CONFIG_MEMCG memcg bad_page() dummies as well.
Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 13 -------------
1 file changed, 13 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e789551d4db0..41ae72b02b55 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -342,19 +342,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
}
#endif /* CONFIG_MEMCG */
-#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
-static inline bool
-mem_cgroup_bad_page_check(struct page *page)
-{
- return false;
-}
-
-static inline void
-mem_cgroup_print_bad_page(struct page *page)
-{
-}
-#endif
-
enum {
UNDER_LIMIT,
SOFT_LIMIT,
--
2.1.3
From: Johannes Weiner <[email protected]>
Date: Mon, 3 Nov 2014 13:27:40 -0500
> Remove unneeded !CONFIG_MEMCG memcg bad_page() dummies as well.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: David S. Miller <[email protected]>
On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers. To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
>
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged. The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory.
How much do you win by the change?
--
Kirill A. Shutemov
On Mon, Nov 03, 2014 at 11:06:07PM +0200, Kirill A. Shutemov wrote:
> On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > Memory cgroups used to have 5 per-page pointers. To allow users to
> > disable that amount of overhead during runtime, those pointers were
> > allocated in a separate array, with a translation layer between them
> > and struct page.
> >
> > There is now only one page pointer remaining: the memcg pointer, that
> > indicates which cgroup the page is associated with when charged. The
> > complexity of runtime allocation and the runtime translation overhead
> > is no longer justified to save that *potential* 0.19% of memory.
>
> How much do you win by the change?
Heh, that would have followed right after where you cut the quote:
with CONFIG_SLUB, that pointer actually sits in already existing
struct page padding, which means that I'm saving one pointer per page
(8 bytes per 4096 byte page, 0.19% of memory), plus the pointer and
padding in each memory section. I also save the (minor) translation
overhead going from page to page_cgroup and the maintenance burden
that stems from having these auxiliary arrays (see deleted code).
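To make that concrete, here is a minimal before/after sketch of the access
path; the two wrapper functions are made up for illustration, while the
lookup call and the fields are the ones from the patches above (locking,
RCU and the debug checks are omitted, so treat this as a sketch rather
than kernel-ready code):

/* Before: every memcg lookup went through the separately allocated array. */
static struct mem_cgroup *page_memcg_old(struct page *page)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);	/* pfn -> array slot */

	return pc->mem_cgroup;
}

/* After: the pointer is embedded in struct page itself. */
static struct mem_cgroup *page_memcg_new(struct page *page)
{
	return page->mem_cgroup;
}

/*
 * Worst case for the embedded pointer, when it does not share existing
 * SLUB padding: sizeof(void *) / PAGE_SIZE = 8 / 4096, i.e. roughly
 * 0.19% of memory, the figure quoted above.
 */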
On Mon, Nov 03, 2014 at 04:36:28PM -0500, Johannes Weiner wrote:
> On Mon, Nov 03, 2014 at 11:06:07PM +0200, Kirill A. Shutemov wrote:
> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > > Memory cgroups used to have 5 per-page pointers. To allow users to
> > > disable that amount of overhead during runtime, those pointers were
> > > allocated in a separate array, with a translation layer between them
> > > and struct page.
> > >
> > > There is now only one page pointer remaining: the memcg pointer, that
> > > indicates which cgroup the page is associated with when charged. The
> > > complexity of runtime allocation and the runtime translation overhead
> > > is no longer justified to save that *potential* 0.19% of memory.
> >
> > How much do you win by the change?
>
> Heh, that would have followed right after where you cut the quote:
> with CONFIG_SLUB, that pointer actually sits in already existing
> struct page padding, which means that I'm saving one pointer per page
> (8 bytes per 4096 byte page, 0.19% of memory), plus the pointer and
> padding in each memory section. I also save the (minor) translation
> overhead going from page to page_cgroup and the maintenance burden
> that stems from having these auxiliary arrays (see deleted code).
I read the description. I want to know whether the runtime win (any
benchmark data?) from moving mem_cgroup back into struct page is measurable.
If the win is not significant, I would prefer to not occupy the padding:
I'm sure we will be able to find a better use for the space in struct page
in the future.
--
Kirill A. Shutemov
From: "Kirill A. Shutemov" <[email protected]>
Date: Mon, 3 Nov 2014 23:52:06 +0200
> On Mon, Nov 03, 2014 at 04:36:28PM -0500, Johannes Weiner wrote:
>> On Mon, Nov 03, 2014 at 11:06:07PM +0200, Kirill A. Shutemov wrote:
>> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
>> > > Memory cgroups used to have 5 per-page pointers. To allow users to
>> > > disable that amount of overhead during runtime, those pointers were
>> > > allocated in a separate array, with a translation layer between them
>> > > and struct page.
>> > >
>> > > There is now only one page pointer remaining: the memcg pointer, that
>> > > indicates which cgroup the page is associated with when charged. The
>> > > complexity of runtime allocation and the runtime translation overhead
>> > > is no longer justified to save that *potential* 0.19% of memory.
>> >
>> > How much do you win by the change?
>>
>> Heh, that would have followed right after where you cut the quote:
>> with CONFIG_SLUB, that pointer actually sits in already existing
>> struct page padding, which means that I'm saving one pointer per page
>> (8 bytes per 4096 byte page, 0.19% of memory), plus the pointer and
>> padding in each memory section. I also save the (minor) translation
>> overhead going from page to page_cgroup and the maintenance burden
>> that stems from having these auxiliary arrays (see deleted code).
>
> I read the description. I want to know whether the runtime win (any
> benchmark data?) from moving mem_cgroup back into struct page is measurable.
>
> If the win is not significant, I would prefer to not occupy the padding:
> I'm sure we will be able to find a better use for the space in struct page
> in the future.
I think the simplification benefits completely trump any performance
metric.
On Mon, Nov 03, 2014 at 04:58:07PM -0500, David Miller wrote:
> From: "Kirill A. Shutemov" <[email protected]>
> Date: Mon, 3 Nov 2014 23:52:06 +0200
>
> > On Mon, Nov 03, 2014 at 04:36:28PM -0500, Johannes Weiner wrote:
> >> On Mon, Nov 03, 2014 at 11:06:07PM +0200, Kirill A. Shutemov wrote:
> >> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> >> > > Memory cgroups used to have 5 per-page pointers. To allow users to
> >> > > disable that amount of overhead during runtime, those pointers were
> >> > > allocated in a separate array, with a translation layer between them
> >> > > and struct page.
> >> > >
> >> > > There is now only one page pointer remaining: the memcg pointer, that
> >> > > indicates which cgroup the page is associated with when charged. The
> >> > > complexity of runtime allocation and the runtime translation overhead
> >> > > is no longer justified to save that *potential* 0.19% of memory.
> >> >
> >> > How much do you win by the change?
> >>
> >> Heh, that would have followed right after where you cut the quote:
> >> with CONFIG_SLUB, that pointer actually sits in already existing
> >> struct page padding, which means that I'm saving one pointer per page
> >> (8 bytes per 4096 byte page, 0.19% of memory), plus the pointer and
> >> padding in each memory section. I also save the (minor) translation
> >> overhead going from page to page_cgroup and the maintenance burden
> >> that stems from having these auxiliary arrays (see deleted code).
> >
> > I read the description. I want to know whether the runtime win (any
> > benchmark data?) from moving mem_cgroup back into struct page is measurable.
> >
> > If the win is not significant, I would prefer to not occupy the padding:
> > I'm sure we will be able to find a better use for the space in struct page
> > in the future.
>
> I think the simplification benefits completely trump any performance
> metric.
I agree.
Also, nobody is using that space currently, and I can save memory by
moving the pointer in there. Should we later add another pointer to
struct page we are only back to the status quo - with the difference
that booting with cgroup_disable=memory will no longer save the extra
pointer per page, but again, if you care that much, you can disable
memory cgroups at compile-time.
So don't look at it as occupying the padding; rather, this takes away
the ability to allocate that single memcg pointer at runtime, while at
the same time saving a bit of memory for common configurations until
somebody else needs the struct page padding.
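For illustration, the removed indirection boils down to the following
before/after sketch (simplified, not the literal patch hunks; the
page_memcg_old/new helpers are made up for illustration, while
lookup_page_cgroup() was the real translation helper deleted by this
series, and the old array setup is omitted):

#include <linux/mm_types.h>	/* struct page, now with ->mem_cgroup */

/* Old scheme: per-page descriptor in a separate array, reached via a
 * pfn/section translation kept in mm/page_cgroup.c. */
struct page_cgroup {
	struct mem_cgroup *mem_cgroup;
};
struct page_cgroup *lookup_page_cgroup(struct page *page);

static inline struct mem_cgroup *page_memcg_old(struct page *page)
{
	return lookup_page_cgroup(page)->mem_cgroup;
}

/* New scheme: the pointer lives in struct page itself (in SLUB's
 * doubleword padding on 64-bit), so the lookup is a single load. */
static inline struct mem_cgroup *page_memcg_new(struct page *page)
{
	return page->mem_cgroup;
}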
On Mon, Nov 03, 2014 at 10:09:42AM -0500, Johannes Weiner wrote:
> Hi Joonsoo,
>
> On Mon, Nov 03, 2014 at 05:02:08PM +0900, Joonsoo Kim wrote:
> > On Sat, Nov 01, 2014 at 11:15:54PM -0400, Johannes Weiner wrote:
> > > Memory cgroups used to have 5 per-page pointers. To allow users to
> > > disable that amount of overhead during runtime, those pointers were
> > > allocated in a separate array, with a translation layer between them
> > > and struct page.
> >
> > Hello, Johannes.
> >
> > I'd like to leave this translation layer.
> > Could you just disable that code with #ifdef until next user comes?
> >
> > In our company, we use PAGE_OWNER from the mm tree, the feature that
> > records who allocated each page. Using PAGE_OWNER requires modifying
> > struct page and therefore recompiling the kernel, and that recompile
> > makes the feature difficult for us to deploy. So we decided to
> > implement a run-time configurable PAGE_OWNER on top of page_cgroup's
> > translation layer code. Moreover, with this infrastructure, I plan to
> > implement some other debugging features.
> >
> > Because of my laziness, it hasn't been submitted to LKML yet, but I
> > will submit it as soon as possible. If the code is removed, I could
> > copy-and-paste it back, but that would lose the history of that code.
> > So, if possible, I'd like to keep that code for now.
>
> Please re-introduce this code when your new usecase is ready to be
> upstreamed. There is little reason to burden an unrelated feature
> with a sizable chunk of dead code for a vague future user.
Okay.
Thanks.
(2014/11/02 12:15), Johannes Weiner wrote:
> Memory cgroups used to have 5 per-page pointers. To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
>
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged. The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory. With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space. Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.
>
> text data bss dec hex filename
> 8828345 1725264 983040 11536649 b00909 vmlinux.old
> 8827425 1725264 966656 11519345 afc571 vmlinux.new
>
> Signed-off-by: Johannes Weiner <[email protected]>
> ---
> include/linux/memcontrol.h | 6 +-
> include/linux/mm_types.h | 5 +
> include/linux/mmzone.h | 12 --
> include/linux/page_cgroup.h | 53 --------
> init/main.c | 7 -
> mm/memcontrol.c | 124 +++++------------
> mm/page_alloc.c | 2 -
> mm/page_cgroup.c | 319 --------------------------------------------
> 8 files changed, 41 insertions(+), 487 deletions(-)
>
Great!
Acked-by: KAMEZAWA Hiroyuki <[email protected]>
BTW, shouldn't the init/Kconfig help text be updated as well?
(I'm sorry if it has already been updated since your latest fix.)
(2014/11/02 12:15), Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, the only code remaining in there is swap slot accounting.
>
> Rename it and move the conditional compilation into mm/Makefile.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: KAMEZAWA Hiroyuki <[email protected]>
(2014/11/02 12:15), Johannes Weiner wrote:
> Now that the external page_cgroup data structure and its lookup is
> gone, let the generic bad_page() check for page->mem_cgroup sanity.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: KAMEZAWA Hiroyuki <[email protected]>
On Mon 03-11-14 17:36:26, Johannes Weiner wrote:
[...]
> Also, nobody is using that space currently, and I can save memory by
> moving the pointer in there. Should we later add another pointer to
> struct page we are only back to the status quo - with the difference
> that booting with cgroup_disable=memory will no longer save the extra
> pointer per page, but again, if you care that much, you can disable
> memory cgroups at compile-time.
There would be a slight inconvenience for 32b machines running
distribution kernels, which cannot simply drop CONFIG_MEMCG from the
config - especially those 32b machines with a lot of memory.
I have checked the configuration used for the OpenSUSE PAE kernel. Both
struct page and the code size grow. struct page gains an additional 4B
with SLAB, and 8B with SLUB because of the two-word alignment of struct
page; with the external 4B page_cgroup entry gone, the net overhead is
4B per page with SLUB.
This doesn't sound too bad to me, considering that 64b actually even
saves some space with SLUB, stays at the same level with SLAB, and,
more importantly, gets rid of the lookup in hot paths.
The code size grows (~1.5k) most probably due to struct page pointer
arithmetic (but I haven't checked that) but the data section shrinks
for SLAB. So we have additional 1.6k for SLUB. I guess this is
acceptable.
text data bss dec hex filename
8427489 887684 3186688 12501861 bec365 mmotm/vmlinux.slab
8429060 883588 3186688 12499336 beb988 page_cgroup/vmlinux.slab
8438894 883428 3186688 12509010 bedf52 mmotm/vmlinux.slub
8440529 883428 3186688 12510645 bee5b5 page_cgroup/vmlinux.slub
So to me it sounds like the savings for 64b are worth minor inconvenience
for 32b which is clearly on decline and I would definitely not encourage
people to use PAE kernels with a lot of memory where the difference
might matter. For the most x86 32b deployments (laptops with 4G) the
difference shouldn't be noticeable. I am not familiar with other archs
so the situation might be different there.
If this turns out to be a problem for some reason, though, we can
reintroduce the external page descriptor and translation layer
conditionally, depending on the arch. It seems there will be some users
of external descriptors anyway, so a struct page_external could hold
the memcg pointer as well.
This should probably go into the changelog, I guess.
--
Michal Hocko
SUSE Labs
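To make the alignment effect above concrete, here is a toy model of the
32b numbers (clearly not the real struct page - the member count is
invented - just the rounding arithmetic: with SLUB the page descriptor
is aligned to two machine words, so its size rounds up to a multiple of
2 * sizeof(unsigned long), and a 4-byte addition then costs 8 bytes,
while the unaligned SLAB case pays only 4):

#include <stdio.h>

struct before {			/* stand-in for struct page without memcg */
	unsigned long fields[8];
} __attribute__((aligned(2 * sizeof(unsigned long))));

struct after {			/* the same plus the new mem_cgroup pointer */
	unsigned long fields[8];
	void *mem_cgroup;
} __attribute__((aligned(2 * sizeof(unsigned long))));

int main(void)
{
	/* On a 32-bit build this prints "32 -> 40": 8 bytes of growth for
	 * 4 bytes of data.  Drop the aligned attribute (the SLAB case) and
	 * it prints "32 -> 36" instead. */
	printf("%zu -> %zu\n", sizeof(struct before), sizeof(struct after));
	return 0;
}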
On Tue, Nov 04, 2014 at 05:36:39PM +0900, Kamezawa Hiroyuki wrote:
> (2014/11/02 12:15), Johannes Weiner wrote:
> > Memory cgroups used to have 5 per-page pointers. To allow users to
> > disable that amount of overhead during runtime, those pointers were
> > allocated in a separate array, with a translation layer between them
> > and struct page.
> >
> > There is now only one page pointer remaining: the memcg pointer, that
> > indicates which cgroup the page is associated with when charged. The
> > complexity of runtime allocation and the runtime translation overhead
> > is no longer justified to save that *potential* 0.19% of memory. With
> > CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> > after the page->private member and doesn't even increase struct page,
> > and then this patch actually saves space. Remaining users that care
> > can still compile their kernels without CONFIG_MEMCG.
> >
> > text data bss dec hex filename
> > 8828345 1725264 983040 11536649 b00909 vmlinux.old
> > 8827425 1725264 966656 11519345 afc571 vmlinux.new
> >
> > Signed-off-by: Johannes Weiner <[email protected]>
> > ---
> > include/linux/memcontrol.h | 6 +-
> > include/linux/mm_types.h | 5 +
> > include/linux/mmzone.h | 12 --
> > include/linux/page_cgroup.h | 53 --------
> > init/main.c | 7 -
> > mm/memcontrol.c | 124 +++++------------
> > mm/page_alloc.c | 2 -
> > mm/page_cgroup.c | 319 --------------------------------------------
> > 8 files changed, 41 insertions(+), 487 deletions(-)
> >
>
> Great!
> Acked-by: KAMEZAWA Hiroyuki <[email protected]>
Thank you!
> BTW, shouldn't the init/Kconfig help text be updated as well?
> (I'm sorry if it has already been updated since your latest fix.)
Good point. How about this?
---
From: Johannes Weiner <[email protected]>
Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
Remove obsolete memory saving recommendations from the MEMCG Kconfig
help text.
Signed-off-by: Johannes Weiner <[email protected]>
---
init/Kconfig | 12 ------------
1 file changed, 12 deletions(-)
diff --git a/init/Kconfig b/init/Kconfig
index 01b7f2a6abf7..d68d8b0780b3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -983,18 +983,6 @@ config MEMCG
Provides a memory resource controller that manages both anonymous
memory and page cache. (See Documentation/cgroups/memory.txt)
- Note that setting this option increases fixed memory overhead
- associated with each page of memory in the system. By this,
- 8(16)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory
- usage tracking struct at boot. Total amount of this is printed out
- at boot.
-
- Only enable when you're ok with these trade offs and really
- sure you need the memory resource controller. Even when you enable
- this, you can set "cgroup_disable=memory" at your boot option to
- disable memory resource controller and you can avoid overheads.
- (and lose benefits of memory resource controller)
-
config MEMCG_SWAP
bool "Memory Resource Controller Swap Extension"
depends on MEMCG && SWAP
--
2.1.3
On Tue 04-11-14 08:27:01, Johannes Weiner wrote:
> From: Johannes Weiner <[email protected]>
> Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
>
> Remove obsolete memory saving recommendations from the MEMCG Kconfig
> help text.
The memory overhead is still there. So I do not think it is good to
remove the message altogether. The current overhead might be 4 or 8B
depending on the configuration. What about
"
Note that setting this option might increase fixed memory
overhead associated with each page descriptor in the system.
The memory overhead depends on the architecture and other
configuration options which have influence on the size and
alignment on the page descriptor (struct page). Namely
CONFIG_SLUB has a requirement for page alignment to two words
which in turn means that 64b systems might not see any memory
overhead as the additional data fits into alignment. On the
other hand 32b systems might see 8B memory overhead.
"
> Signed-off-by: Johannes Weiner <[email protected]>
> ---
> init/Kconfig | 12 ------------
> 1 file changed, 12 deletions(-)
>
> diff --git a/init/Kconfig b/init/Kconfig
> index 01b7f2a6abf7..d68d8b0780b3 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -983,18 +983,6 @@ config MEMCG
> Provides a memory resource controller that manages both anonymous
> memory and page cache. (See Documentation/cgroups/memory.txt)
>
> - Note that setting this option increases fixed memory overhead
> - associated with each page of memory in the system. By this,
> - 8(16)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory
> - usage tracking struct at boot. Total amount of this is printed out
> - at boot.
> -
> - Only enable when you're ok with these trade offs and really
> - sure you need the memory resource controller. Even when you enable
> - this, you can set "cgroup_disable=memory" at your boot option to
> - disable memory resource controller and you can avoid overheads.
> - (and lose benefits of memory resource controller)
> -
> config MEMCG_SWAP
> bool "Memory Resource Controller Swap Extension"
> depends on MEMCG && SWAP
> --
> 2.1.3
>
--
Michal Hocko
SUSE Labs
On Tue, Nov 04, 2014 at 02:06:52PM +0100, Michal Hocko wrote:
> The code size grows (~1.5k) most probably due to struct page pointer
> arithmetic (but I haven't checked that) but the data section shrinks
> for SLAB. So we have additional 1.6k for SLUB. I guess this is
> acceptable.
>
> text data bss dec hex filename
> 8427489 887684 3186688 12501861 bec365 mmotm/vmlinux.slab
> 8429060 883588 3186688 12499336 beb988 page_cgroup/vmlinux.slab
>
> 8438894 883428 3186688 12509010 bedf52 mmotm/vmlinux.slub
> 8440529 883428 3186688 12510645 bee5b5 page_cgroup/vmlinux.slub
That's unexpected. It's not much, but how could the object size grow
at all when that much code is removed and we replace the lookups with
simple struct member accesses? Are you positive these are the right
object files, in the right order?
> So to me it sounds like the savings for 64b are worth minor inconvenience
> for 32b which is clearly on decline and I would definitely not encourage
> people to use PAE kernels with a lot of memory where the difference
> might matter. For the most x86 32b deployments (laptops with 4G) the
> difference shouldn't be noticeable. I am not familiar with other archs
> so the situation might be different there.
On 32 bit, the overhead is 0.098% of memory, so 4MB on a 4G machine.
This should be acceptable, even for the three people that run on the
cutting edge of 3.18-based PAE distribution kernels. :-)
> This should probably go into the changelog, I guess.
Which part?
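(As an aside, the 0.098%, 4MB and 0.19% figures in this thread are just
pointer-size over page-size; a trivial check, nothing kernel-specific,
assuming the usual 4096-byte pages:)

#include <stdio.h>

int main(void)
{
	const double page_size = 4096.0;
	const double pages_in_4g = (4ULL << 30) / page_size;

	/* 32-bit: one 4-byte pointer per page */
	printf("32-bit: %.3f%% of RAM, %.1f MB on a 4GB machine\n",
	       4.0 / page_size * 100.0, 4.0 * pages_in_4g / (1 << 20));
	/* 64-bit: one 8-byte pointer per page, when it does not fit into
	 * existing struct page padding */
	printf("64-bit: %.3f%% of RAM\n", 8.0 / page_size * 100.0);
	return 0;
}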
On Tue, Nov 04, 2014 at 02:41:10PM +0100, Michal Hocko wrote:
> On Tue 04-11-14 08:27:01, Johannes Weiner wrote:
> > From: Johannes Weiner <[email protected]>
> > Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
> >
> > Remove obsolete memory saving recommendations from the MEMCG Kconfig
> > help text.
>
> The memory overhead is still there. So I do not think it is good to
> remove the message altogether. The current overhead might be 4 or 8B
> depending on the configuration. What about
> "
> Note that setting this option might increase fixed memory
> overhead associated with each page descriptor in the system.
> The memory overhead depends on the architecture and other
> configuration options which have influence on the size and
> alignment on the page descriptor (struct page). Namely
> CONFIG_SLUB has a requirement for page alignment to two words
> which in turn means that 64b systems might not see any memory
> overhead as the additional data fits into alignment. On the
> other hand 32b systems might see 8B memory overhead.
> "
What difference does it make whether this feature maybe costs an extra
pointer per page or not? These texts are supposed to help decide with
the selection, but this is not a "good to have, if affordable" type of
runtime debugging option. You either need cgroup memory accounting
and limiting or not. There is no possible trade-off to be had.
Slub and numa balancing don't mention this, either, simply because
this cost is negligible or irrelevant when it comes to these knobs.
On Tue 04-11-14 08:48:41, Johannes Weiner wrote:
> On Tue, Nov 04, 2014 at 02:06:52PM +0100, Michal Hocko wrote:
> > The code size grows (~1.5k) most probably due to struct page pointer
> > arithmetic (but I haven't checked that) but the data section shrinks
> > for SLAB. So we have additional 1.6k for SLUB. I guess this is
> > acceptable.
> >
> > text data bss dec hex filename
> > 8427489 887684 3186688 12501861 bec365 mmotm/vmlinux.slab
> > 8429060 883588 3186688 12499336 beb988 page_cgroup/vmlinux.slab
> >
> > 8438894 883428 3186688 12509010 bedf52 mmotm/vmlinux.slub
> > 8440529 883428 3186688 12510645 bee5b5 page_cgroup/vmlinux.slub
>
> That's unexpected. It's not much, but how could the object size grow
> at all when that much code is removed and we replace the lookups with
> simple struct member accesses? Are you positive these are the right
> object files, in the right order?
Double checked (the base is [1] and page_cgroup refers to these 3
patches). Please note that this is a distribution config (OpenSUSE
13.2), so it enables a lot of things. And I would really expect pointer
arithmetic on a 36B resp. 40B struct page to take more instructions
than on a 32B one, and that piles up because it is used all over the
place.
memcontrol.o shrinks 0.2k
$ size {mmotm,page_cgroup}/mm/memcontrol.o
text data bss dec hex filename
25337 3095 2 28434 6f12 mmotm/mm/memcontrol.o
25123 3095 2 28220 6e3c page_cgroup/mm/memcontrol.o
and page_cgroup.o saves 0.5k
$ size mmotm/mm/page_cgroup.o page_cgroup/mm/swap_cgroup.o
text data bss dec hex filename
1419 24 352 1795 703 mmotm/mm/page_cgroup.o
849 24 348 1221 4c5 page_cgroup/mm/swap_cgroup.o
But built-in.o files grow or keep the same size (this is with
CONFIG_SLAB and gcc 4.8.2)
$ size {mmotm,page_cgroup}/*/built-in.o | sort -k1 -n | awk '!/text/{new = (i++ % 2); if (!new) {val = $1; last_line=$0} else if ($1-val != 0) {diff = $1 - val; printf("%s\n%s diff %d\n", last_line, $0, diff); sum+=diff}}END{printf("Sum diff %d\n", sum)}'
14481 19586 81 34148 8564 mmotm/init/built-in.o
14483 19586 81 34150 8566 page_cgroup/init/built-in.o diff 2
68679 2082 12 70773 11475 mmotm/crypto/built-in.o
68711 2082 12 70805 11495 page_cgroup/crypto/built-in.o diff 32
131583 26496 2376 160455 272c7 mmotm/lib/built-in.o
131631 26496 2376 160503 272f7 page_cgroup/lib/built-in.o diff 48
229809 12346 1548 243703 3b7f7 mmotm/block/built-in.o
229937 12346 1548 243831 3b877 page_cgroup/block/built-in.o diff 128
308015 20442 16280 344737 542a1 mmotm/security/built-in.o
308031 20442 16280 344753 542b1 page_cgroup/security/built-in.o diff 16
507979 47110 27236 582325 8e2b5 mmotm/mm/built-in.o
508540 47110 27236 582886 8e4e6 page_cgroup/mm/built-in.o diff 561
1033752 77064 13212 1124028 1126bc mmotm/fs/built-in.o
1033784 77064 13212 1124060 1126dc page_cgroup/fs/built-in.o diff 32
1099218 51979 33512 1184709 1213c5 mmotm/net/built-in.o
1099282 51979 33512 1184773 121405 page_cgroup/net/built-in.o diff 64
1180475 127020 705068 2012563 1eb593 mmotm/kernel/built-in.o
1180683 127020 705068 2012771 1eb663 page_cgroup/kernel/built-in.o diff 208
2193400 152698 34856 2380954 24549a mmotm/drivers/built-in.o
2193528 152698 34856 2381082 24551a page_cgroup/drivers/built-in.o diff 128
Sum diff 1219
This is not a complete list, but the mm part accounts for only ~0.5k of
the growth; the rest comes in small pieces that add up.
> > So to me it sounds like the savings for 64b are worth minor inconvenience
> > for 32b which is clearly on decline and I would definitely not encourage
> > people to use PAE kernels with a lot of memory where the difference
> > might matter. For the most x86 32b deployments (laptops with 4G) the
> > difference shouldn't be noticeable. I am not familiar with other archs
> > so the situation might be different there.
>
> On 32 bit, the overhead is 0.098% of memory, so 4MB on a 4G machine.
> This should be acceptable, even for the three people that run on the
> cutting edge of 3.18-based PAE distribution kernels. :-)
>
> > This should probably go into the changelog, I guess.
>
> Which part?
The part about the potentially increased memory footprint on 32b
systems (aka don't sell it as a full win ;))
---
[1] https://git.kernel.org/cgit/linux/kernel/git/mhocko/mm.git/tag/?h=since-3.17&id=mmotm-2014-10-29-14-19
--
Michal Hocko
SUSE Labs
On Tue 04-11-14 09:09:37, Johannes Weiner wrote:
> On Tue, Nov 04, 2014 at 02:41:10PM +0100, Michal Hocko wrote:
> > On Tue 04-11-14 08:27:01, Johannes Weiner wrote:
> > > From: Johannes Weiner <[email protected]>
> > > Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
> > >
> > > Remove obsolete memory saving recommendations from the MEMCG Kconfig
> > > help text.
> >
> > The memory overhead is still there. So I do not think it is good to
> > remove the message altogether. The current overhead might be 4 or 8B
> > depending on the configuration. What about
> > "
> > Note that setting this option might increase fixed memory
> > overhead associated with each page descriptor in the system.
> > The memory overhead depends on the architecture and other
> > configuration options which have influence on the size and
> > alignment on the page descriptor (struct page). Namely
> > CONFIG_SLUB has a requirement for page alignment to two words
> > which in turn means that 64b systems might not see any memory
> > overhead as the additional data fits into alignment. On the
> > other hand 32b systems might see 8B memory overhead.
> > "
>
> What difference does it make whether this feature maybe costs an extra
> pointer per page or not? These texts are supposed to help decide with
> the selection, but this is not a "good to have, if affordable" type of
> runtime debugging option. You either need cgroup memory accounting
> and limiting or not. There is no possible trade-off to be had.
If you are compiling the kernel for your specific use case then it is
clear: you enable only what you really need/want. But if you are
providing a pre-built kernel and considering which features to enable,
then information about the overhead might be useful. You can simply
disable the feature for memory-restricted kernel flavors.
> Slub and numa balancing don't mention this, either, simply because
> this cost is negligible or irrelevant when it comes to these knobs.
I agree that the overhead seems negligible but does it hurt us to
mention it though?
--
Michal Hocko
SUSE Labs
From: Johannes Weiner <[email protected]>
Date: Tue, 4 Nov 2014 09:09:37 -0500
> You either need cgroup memory accounting and limiting or not. There
> is no possible trade-off to be had.
I couldn't have said it better myself, +1.
On Tue, Nov 04, 2014 at 04:00:39PM +0100, Michal Hocko wrote:
> On Tue 04-11-14 09:09:37, Johannes Weiner wrote:
> > On Tue, Nov 04, 2014 at 02:41:10PM +0100, Michal Hocko wrote:
> > > On Tue 04-11-14 08:27:01, Johannes Weiner wrote:
> > > > From: Johannes Weiner <[email protected]>
> > > > Subject: [patch] mm: move page->mem_cgroup bad page handling into generic code fix
> > > >
> > > > Remove obsolete memory saving recommendations from the MEMCG Kconfig
> > > > help text.
> > >
> > > The memory overhead is still there. So I do not think it is good to
> > > remove the message altogether. The current overhead might be 4 or 8B
> > > depending on the configuration. What about
> > > "
> > > Note that setting this option might increase fixed memory
> > > overhead associated with each page descriptor in the system.
> > > The memory overhead depends on the architecture and other
> > > configuration options which have influence on the size and
> > > alignment on the page descriptor (struct page). Namely
> > > CONFIG_SLUB has a requirement for page alignment to two words
> > > which in turn means that 64b systems might not see any memory
> > > overhead as the additional data fits into alignment. On the
> > > other hand 32b systems might see 8B memory overhead.
> > > "
> >
> > What difference does it make whether this feature maybe costs an extra
> > pointer per page or not? These texts are supposed to help decide with
> > the selection, but this is not a "good to have, if affordable" type of
> > runtime debugging option. You either need cgroup memory accounting
> > and limiting or not. There is no possible trade-off to be had.
>
> If you are compiling the kernel for your specific use case then it is
> clear: you enable only what you really need/want. But if you are
> providing a pre-built kernel and considering which features to enable,
> then information about the overhead might be useful. You can simply
> disable the feature for memory-restricted kernel flavors.
>
> > Slub and numa balancing don't mention this, either, simply because
> > this cost is negligible or irrelevant when it comes to these knobs.
>
> I agree that the overhead seems negligible but does it hurt us to
> mention it though?
Yes, it's fairly misleading. What about the instructions it adds to
the fault hotpaths? The additional memory footprint of each cgroup
created? You're adding 9 lines to point out one specific cost aspect,
when the entire feature is otherwise summed up in two lines. The
per-page overhead of memcg is not exceptional or unexpected if you
know what it does - which you should when you enable it, even as a
distributor - and such a gross overrepresentation in the help text is
more confusing than helpful.
On Sun, Nov 2, 2014 at 6:15 AM, Johannes Weiner <[email protected]> wrote:
> Memory cgroups used to have 5 per-page pointers. To allow users to
> disable that amount of overhead during runtime, those pointers were
> allocated in a separate array, with a translation layer between them
> and struct page.
>
> There is now only one page pointer remaining: the memcg pointer, that
> indicates which cgroup the page is associated with when charged. The
> complexity of runtime allocation and the runtime translation overhead
> is no longer justified to save that *potential* 0.19% of memory. With
> CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding
> after the page->private member and doesn't even increase struct page,
> and then this patch actually saves space. Remaining users that care
> can still compile their kernels without CONFIG_MEMCG.
>
> text data bss dec hex filename
> 8828345 1725264 983040 11536649 b00909 vmlinux.old
> 8827425 1725264 966656 11519345 afc571 vmlinux.new
>
> Signed-off-by: Johannes Weiner <[email protected]>
Great! Never thought I'd see this. =)
Acked-by: Konstantin Khlebnikov <[email protected]>
> ---
> include/linux/memcontrol.h | 6 +-
> include/linux/mm_types.h | 5 +
> include/linux/mmzone.h | 12 --
> include/linux/page_cgroup.h | 53 --------
> init/main.c | 7 -
> mm/memcontrol.c | 124 +++++------------
> mm/page_alloc.c | 2 -
> mm/page_cgroup.c | 319 --------------------------------------------
> 8 files changed, 41 insertions(+), 487 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index d4575a1d6e99..dafba59b31b4 100644
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>