This is fall-out from the memcg lifetime rework in 3.17, where
writeback statistics can get lost against a migrating page.
The (-stable) fix is to adapt the page stat side to the new lifetime
rules, rather than to make an exception specifically for migrating pages,
which seems less error-prone and generally the right way forward.
include/linux/memcontrol.h | 58 ++++++++++++++--------------------------------
include/linux/mm.h | 1 -
mm/memcontrol.c | 54 ++++++++++++++++++------------------------
mm/page-writeback.c | 43 ++++++++++++----------------------
mm/rmap.c | 20 ++++++++--------
5 files changed, 64 insertions(+), 112 deletions(-)
0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page
migration to uncharge the old page right away. The page is locked,
unmapped, truncated, and off the LRU, but it could race with writeback
ending, which then doesn't unaccount the page properly:
test_clear_page_writeback() migration
acquire pc->mem_cgroup->move_lock
wait_on_page_writeback()
TestClearPageWriteback()
mem_cgroup_migrate()
clear PCG_USED
if (PageCgroupUsed(pc))
decrease memcg pages under writeback
release pc->mem_cgroup->move_lock
The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.
Rework it so that it looks up the page's memcg once at the beginning
of the transaction and then uses it throughout. The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set. The RCU lock will protect the
memcg past uncharge.
As far as losing the optimization goes, the following test results are
from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
three times in a nested fashion, so that there are two negative passes
that don't account but still go through the new transaction overhead.
There is no actual difference:
old: 33.195102545 seconds time elapsed ( +- 0.01% )
new: 33.199231369 seconds time elapsed ( +- 0.03% )
The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:
# Children Self Command Shared Object Symbol
old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
Signed-off-by: Johannes Weiner <[email protected]>
Cc: "3.17" <[email protected]>
---
include/linux/memcontrol.h | 58 ++++++++++++++--------------------------------
mm/memcontrol.c | 54 ++++++++++++++++++------------------------
mm/page-writeback.c | 22 ++++++++++--------
mm/rmap.c | 20 ++++++++--------
4 files changed, 61 insertions(+), 93 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0daf383f8f1c..ea007615e8f9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -139,48 +139,23 @@ static inline bool mem_cgroup_disabled(void)
return false;
}
-void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
- unsigned long *flags);
-
-extern atomic_t memcg_moving;
-
-static inline void mem_cgroup_begin_update_page_stat(struct page *page,
- bool *locked, unsigned long *flags)
-{
- if (mem_cgroup_disabled())
- return;
- rcu_read_lock();
- *locked = false;
- if (atomic_read(&memcg_moving))
- __mem_cgroup_begin_update_page_stat(page, locked, flags);
-}
-
-void __mem_cgroup_end_update_page_stat(struct page *page,
- unsigned long *flags);
-static inline void mem_cgroup_end_update_page_stat(struct page *page,
- bool *locked, unsigned long *flags)
-{
- if (mem_cgroup_disabled())
- return;
- if (*locked)
- __mem_cgroup_end_update_page_stat(page, flags);
- rcu_read_unlock();
-}
-
-void mem_cgroup_update_page_stat(struct page *page,
- enum mem_cgroup_stat_index idx,
- int val);
-
-static inline void mem_cgroup_inc_page_stat(struct page *page,
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked,
+ unsigned long *flags);
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
+ unsigned long flags);
+void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx, int val);
+
+static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
- mem_cgroup_update_page_stat(page, idx, 1);
+ mem_cgroup_update_page_stat(memcg, idx, 1);
}
-static inline void mem_cgroup_dec_page_stat(struct page *page,
+static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
- mem_cgroup_update_page_stat(page, idx, -1);
+ mem_cgroup_update_page_stat(memcg, idx, -1);
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
@@ -315,13 +290,14 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
}
-static inline void mem_cgroup_begin_update_page_stat(struct page *page,
+static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
bool *locked, unsigned long *flags)
{
+ return NULL;
}
-static inline void mem_cgroup_end_update_page_stat(struct page *page,
- bool *locked, unsigned long *flags)
+static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg,
+ bool locked, unsigned long flags)
{
}
@@ -343,12 +319,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
return false;
}
-static inline void mem_cgroup_inc_page_stat(struct page *page,
+static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
}
-static inline void mem_cgroup_dec_page_stat(struct page *page,
+static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3a203c7ec6c7..d84f316ac901 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1463,12 +1463,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
* start move here.
*/
-/* for quick checking without looking up memcg */
-atomic_t memcg_moving __read_mostly;
-
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
- atomic_inc(&memcg_moving);
atomic_inc(&memcg->moving_account);
synchronize_rcu();
}
@@ -1479,10 +1475,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
* Now, mem_cgroup_clear_mc() may call this function with NULL.
* We check NULL in callee rather than caller.
*/
- if (memcg) {
- atomic_dec(&memcg_moving);
+ if (memcg)
atomic_dec(&memcg->moving_account);
- }
}
/*
@@ -2132,26 +2126,32 @@ cleanup:
* account and taking the move_lock in the slowpath.
*/
-void __mem_cgroup_begin_update_page_stat(struct page *page,
- bool *locked, unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
+ bool *locked,
+ unsigned long *flags)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc;
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
pc = lookup_page_cgroup(page);
again:
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
- return;
+ return NULL;
/*
* If this memory cgroup is not under account moving, we don't
* need to take move_lock_mem_cgroup(). Because we already hold
* rcu_read_lock(), any calls to move_account will be delayed until
* rcu_read_unlock().
*/
- VM_BUG_ON(!rcu_read_lock_held());
+ *locked = false;
if (atomic_read(&memcg->moving_account) <= 0)
- return;
+ return memcg;
move_lock_mem_cgroup(memcg, flags);
if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
@@ -2159,36 +2159,26 @@ again:
goto again;
}
*locked = true;
+
+ return memcg;
}
-void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
+ unsigned long flags)
{
- struct page_cgroup *pc = lookup_page_cgroup(page);
+ if (memcg && locked)
+ move_unlock_mem_cgroup(memcg, &flags);
- /*
- * It's guaranteed that pc->mem_cgroup never changes while
- * lock is held because a routine modifies pc->mem_cgroup
- * should take move_lock_mem_cgroup().
- */
- move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+ rcu_read_unlock();
}
-void mem_cgroup_update_page_stat(struct page *page,
+void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx, int val)
{
- struct mem_cgroup *memcg;
- struct page_cgroup *pc = lookup_page_cgroup(page);
- unsigned long uninitialized_var(flags);
-
- if (mem_cgroup_disabled())
- return;
-
VM_BUG_ON(!rcu_read_lock_held());
- memcg = pc->mem_cgroup;
- if (unlikely(!memcg || !PageCgroupUsed(pc)))
- return;
- this_cpu_add(memcg->stat->count[idx], val);
+ if (memcg)
+ this_cpu_add(memcg->stat->count[idx], val);
}
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ff6a5b07211e..19ceae87522d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2327,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- int ret;
- bool locked;
unsigned long memcg_flags;
+ struct mem_cgroup *memcg;
+ bool locked;
+ int ret;
- mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2352,22 +2353,23 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- int ret;
- bool locked;
unsigned long memcg_flags;
+ struct mem_cgroup *memcg;
+ bool locked;
+ int ret;
- mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
if (mapping) {
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long flags;
@@ -2394,10 +2396,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
ret = TestSetPageWriteback(page);
}
if (!ret) {
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
return ret;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 116a5053415b..f574046f77d4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1042,15 +1042,16 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- bool locked;
+ struct mem_cgroup *memcg;
unsigned long flags;
+ bool locked;
- mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
+ mem_cgroup_end_page_stat(memcg, locked, flags);
}
/**
@@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
+ struct mem_cgroup *uninitialized_var(memcg);
bool anon = PageAnon(page);
- bool locked;
unsigned long flags;
+ bool locked;
/*
* The anon case has no mem_cgroup page_stat to update; but may
@@ -1071,7 +1073,7 @@ void page_remove_rmap(struct page *page)
* we hold the lock against page_stat move: so avoid it on anon.
*/
if (!anon)
- mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
@@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page)
-hpage_nr_pages(page));
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
}
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
@@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
- return;
out:
if (!anon)
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
+ mem_cgroup_end_page_stat(memcg, locked, flags);
}
/*
--
2.1.2
A follow-up patch would have changed the call signature. To save the
trouble, just fold it instead.
Signed-off-by: Johannes Weiner <[email protected]>
Cc: "3.17" <[email protected]>
---
include/linux/mm.h | 1 -
mm/page-writeback.c | 23 ++++-------------------
2 files changed, 4 insertions(+), 20 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 27eb1bfbe704..b46461116cd2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1235,7 +1235,6 @@ int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
struct page *page);
void account_page_dirtied(struct page *page, struct address_space *mapping);
-void account_page_writeback(struct page *page);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
int clear_page_dirty_for_io(struct page *page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ff24c9d83112..ff6a5b07211e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2116,23 +2116,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
EXPORT_SYMBOL(account_page_dirtied);
/*
- * Helper function for set_page_writeback family.
- *
- * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
- * while calling this function.
- * See test_set_page_writeback for example.
- *
- * NOTE: Unlike account_page_dirtied this does not rely on being atomic
- * wrt interrupts.
- */
-void account_page_writeback(struct page *page)
-{
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
- inc_zone_page_state(page, NR_WRITEBACK);
-}
-EXPORT_SYMBOL(account_page_writeback);
-
-/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
@@ -2410,8 +2393,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
} else {
ret = TestSetPageWriteback(page);
}
- if (!ret)
- account_page_writeback(page);
+ if (!ret) {
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITEBACK);
+ }
mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
return ret;
--
2.1.2
On Wed, 22 Oct 2014 14:29:28 -0400 Johannes Weiner <[email protected]> wrote:
> 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page
> migration to uncharge the old page right away. The page is locked,
> unmapped, truncated, and off the LRU, but it could race with writeback
> ending, which then doesn't unaccount the page properly:
>
> test_clear_page_writeback() migration
> acquire pc->mem_cgroup->move_lock
> wait_on_page_writeback()
> TestClearPageWriteback()
> mem_cgroup_migrate()
> clear PCG_USED
> if (PageCgroupUsed(pc))
> decrease memcg pages under writeback
> release pc->mem_cgroup->move_lock
>
> The per-page statistics interface is heavily optimized to avoid a
> function call and a lookup_page_cgroup() in the file unmap fast path,
> which means it doesn't verify whether a page is still charged before
> clearing PageWriteback() and it has to do it in the stat update later.
>
> Rework it so that it looks up the page's memcg once at the beginning
> of the transaction and then uses it throughout. The charge will be
> verified before clearing PageWriteback() and migration can't uncharge
> the page as long as that is still set. The RCU lock will protect the
> memcg past uncharge.
>
> As far as losing the optimization goes, the following test results are
> from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
> three times in a nested fashion, so that there are two negative passes
> that don't account but still go through the new transaction overhead.
> There is no actual difference:
>
> old: 33.195102545 seconds time elapsed ( +- 0.01% )
> new: 33.199231369 seconds time elapsed ( +- 0.03% )
>
> The time spent in page_remove_rmap()'s callees still adds up to the
> same, but the time spent in the function itself seems reduced:
>
> # Children Self Command Shared Object Symbol
> old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
> new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
>
> ...
>
> @@ -2132,26 +2126,32 @@ cleanup:
> * account and taking the move_lock in the slowpath.
> */
>
> -void __mem_cgroup_begin_update_page_stat(struct page *page,
> - bool *locked, unsigned long *flags)
> +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
> + bool *locked,
> + unsigned long *flags)
It would be useful to document the args here (especially `locked').
Also the new rcu_read_locking protocol is worth a mention: that it
exists, what it does, why it persists as long as it does.
> {
> struct mem_cgroup *memcg;
> struct page_cgroup *pc;
>
> + rcu_read_lock();
> +
> + if (mem_cgroup_disabled())
> + return NULL;
> +
> pc = lookup_page_cgroup(page);
> again:
> memcg = pc->mem_cgroup;
> if (unlikely(!memcg || !PageCgroupUsed(pc)))
> - return;
> + return NULL;
> /*
> * If this memory cgroup is not under account moving, we don't
> * need to take move_lock_mem_cgroup(). Because we already hold
> * rcu_read_lock(), any calls to move_account will be delayed until
> * rcu_read_unlock().
> */
> - VM_BUG_ON(!rcu_read_lock_held());
> + *locked = false;
> if (atomic_read(&memcg->moving_account) <= 0)
> - return;
> + return memcg;
>
> move_lock_mem_cgroup(memcg, flags);
> if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
> @@ -2159,36 +2159,26 @@ again:
> goto again;
> }
> *locked = true;
> +
> + return memcg;
> }
>
>
> ...
>
> @@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page)
> */
> void page_remove_rmap(struct page *page)
> {
> + struct mem_cgroup *uninitialized_var(memcg);
> bool anon = PageAnon(page);
> - bool locked;
> unsigned long flags;
> + bool locked;
>
> /*
> * The anon case has no mem_cgroup page_stat to update; but may
> @@ -1071,7 +1073,7 @@ void page_remove_rmap(struct page *page)
> * we hold the lock against page_stat move: so avoid it on anon.
> */
> if (!anon)
> - mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
>
> /* page still mapped by someone else? */
> if (!atomic_add_negative(-1, &page->_mapcount))
> @@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page)
> -hpage_nr_pages(page));
> } else {
> __dec_zone_page_state(page, NR_FILE_MAPPED);
> - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
> - mem_cgroup_end_update_page_stat(page, &locked, &flags);
> + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
> }
> if (unlikely(PageMlocked(page)))
> clear_page_mlock(page);
> @@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page)
> * Leaving it set also helps swapoff to reinstate ptes
> * faster for those pages still in swapcache.
> */
> - return;
> out:
> if (!anon)
> - mem_cgroup_end_update_page_stat(page, &locked, &flags);
> + mem_cgroup_end_page_stat(memcg, locked, flags);
> }
The anon and file paths have as much unique code as they do common
code. I wonder if page_remove_rmap() would come out better if split
into two functions? I gave that a quick try and it came out OK-looking.
On Wed 22-10-14 14:29:27, Johannes Weiner wrote:
> A follow-up patch would have changed the call signature. To save the
> trouble, just fold it instead.
>
> Signed-off-by: Johannes Weiner <[email protected]>
> Cc: "3.17" <[email protected]>
It seems that the function was added just for nilfs, but nilfs wasn't using
the symbol at the time the memcg part went in. Funny...
Acked-by: Michal Hocko <[email protected]>
> ---
> include/linux/mm.h | 1 -
> mm/page-writeback.c | 23 ++++-------------------
> 2 files changed, 4 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 27eb1bfbe704..b46461116cd2 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1235,7 +1235,6 @@ int __set_page_dirty_no_writeback(struct page *page);
> int redirty_page_for_writepage(struct writeback_control *wbc,
> struct page *page);
> void account_page_dirtied(struct page *page, struct address_space *mapping);
> -void account_page_writeback(struct page *page);
> int set_page_dirty(struct page *page);
> int set_page_dirty_lock(struct page *page);
> int clear_page_dirty_for_io(struct page *page);
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index ff24c9d83112..ff6a5b07211e 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -2116,23 +2116,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
> EXPORT_SYMBOL(account_page_dirtied);
>
> /*
> - * Helper function for set_page_writeback family.
> - *
> - * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
> - * while calling this function.
> - * See test_set_page_writeback for example.
> - *
> - * NOTE: Unlike account_page_dirtied this does not rely on being atomic
> - * wrt interrupts.
> - */
> -void account_page_writeback(struct page *page)
> -{
> - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
> - inc_zone_page_state(page, NR_WRITEBACK);
> -}
> -EXPORT_SYMBOL(account_page_writeback);
> -
> -/*
> * For address_spaces which do not use buffers. Just tag the page as dirty in
> * its radix tree.
> *
> @@ -2410,8 +2393,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
> } else {
> ret = TestSetPageWriteback(page);
> }
> - if (!ret)
> - account_page_writeback(page);
> + if (!ret) {
> + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
> + inc_zone_page_state(page, NR_WRITEBACK);
> + }
> mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
> return ret;
>
> --
> 2.1.2
>
--
Michal Hocko
SUSE Labs
On Wed 22-10-14 14:29:28, Johannes Weiner wrote:
> 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page
> migration to uncharge the old page right away. The page is locked,
> unmapped, truncated, and off the LRU, but it could race with writeback
> ending, which then doesn't unaccount the page properly:
>
> test_clear_page_writeback() migration
> acquire pc->mem_cgroup->move_lock
I do not think that mentioning move_lock is important/helpful here,
because the hot path, which is taken all the time (except when there is a
task move in progress), doesn't take it.
Besides that, it is not even relevant for the race.
> wait_on_page_writeback()
> TestClearPageWriteback()
> mem_cgroup_migrate()
> clear PCG_USED
> if (PageCgroupUsed(pc))
> decrease memcg pages under writeback
> release pc->mem_cgroup->move_lock
>
> The per-page statistics interface is heavily optimized to avoid a
> function call and a lookup_page_cgroup() in the file unmap fast path,
> which means it doesn't verify whether a page is still charged before
> clearing PageWriteback() and it has to do it in the stat update later.
>
> Rework it so that it looks up the page's memcg once at the beginning
> of the transaction and then uses it throughout. The charge will be
> verified before clearing PageWriteback() and migration can't uncharge
> the page as long as that is still set. The RCU lock will protect the
> memcg past uncharge.
>
> As far as losing the optimization goes, the following test results are
> from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
> three times in a nested fashion, so that there are two negative passes
> that don't account but still go through the new transaction overhead.
> There is no actual difference:
>
> old: 33.195102545 seconds time elapsed ( +- 0.01% )
> new: 33.199231369 seconds time elapsed ( +- 0.03% )
>
> The time spent in page_remove_rmap()'s callees still adds up to the
> same, but the time spent in the function itself seems reduced:
>
> # Children Self Command Shared Object Symbol
> old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
> new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
>
>
> Signed-off-by: Johannes Weiner <[email protected]>
> Cc: "3.17" <[email protected]>
Acked-by: Michal Hocko <[email protected]>
Thanks!
> ---
> include/linux/memcontrol.h | 58 ++++++++++++++--------------------------------
> mm/memcontrol.c | 54 ++++++++++++++++++------------------------
> mm/page-writeback.c | 22 ++++++++++--------
> mm/rmap.c | 20 ++++++++--------
> 4 files changed, 61 insertions(+), 93 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 0daf383f8f1c..ea007615e8f9 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -139,48 +139,23 @@ static inline bool mem_cgroup_disabled(void)
> return false;
> }
>
> -void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
> - unsigned long *flags);
> -
> -extern atomic_t memcg_moving;
> -
> -static inline void mem_cgroup_begin_update_page_stat(struct page *page,
> - bool *locked, unsigned long *flags)
> -{
> - if (mem_cgroup_disabled())
> - return;
> - rcu_read_lock();
> - *locked = false;
> - if (atomic_read(&memcg_moving))
> - __mem_cgroup_begin_update_page_stat(page, locked, flags);
> -}
> -
> -void __mem_cgroup_end_update_page_stat(struct page *page,
> - unsigned long *flags);
> -static inline void mem_cgroup_end_update_page_stat(struct page *page,
> - bool *locked, unsigned long *flags)
> -{
> - if (mem_cgroup_disabled())
> - return;
> - if (*locked)
> - __mem_cgroup_end_update_page_stat(page, flags);
> - rcu_read_unlock();
> -}
> -
> -void mem_cgroup_update_page_stat(struct page *page,
> - enum mem_cgroup_stat_index idx,
> - int val);
> -
> -static inline void mem_cgroup_inc_page_stat(struct page *page,
> +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked,
> + unsigned long *flags);
> +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
> + unsigned long flags);
> +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
> + enum mem_cgroup_stat_index idx, int val);
> +
> +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
> enum mem_cgroup_stat_index idx)
> {
> - mem_cgroup_update_page_stat(page, idx, 1);
> + mem_cgroup_update_page_stat(memcg, idx, 1);
> }
>
> -static inline void mem_cgroup_dec_page_stat(struct page *page,
> +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
> enum mem_cgroup_stat_index idx)
> {
> - mem_cgroup_update_page_stat(page, idx, -1);
> + mem_cgroup_update_page_stat(memcg, idx, -1);
> }
>
> unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> @@ -315,13 +290,14 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
> {
> }
>
> -static inline void mem_cgroup_begin_update_page_stat(struct page *page,
> +static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
> bool *locked, unsigned long *flags)
> {
> + return NULL;
> }
>
> -static inline void mem_cgroup_end_update_page_stat(struct page *page,
> - bool *locked, unsigned long *flags)
> +static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg,
> + bool locked, unsigned long flags)
> {
> }
>
> @@ -343,12 +319,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
> return false;
> }
>
> -static inline void mem_cgroup_inc_page_stat(struct page *page,
> +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
> enum mem_cgroup_stat_index idx)
> {
> }
>
> -static inline void mem_cgroup_dec_page_stat(struct page *page,
> +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
> enum mem_cgroup_stat_index idx)
> {
> }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 3a203c7ec6c7..d84f316ac901 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1463,12 +1463,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
> * start move here.
> */
>
> -/* for quick checking without looking up memcg */
> -atomic_t memcg_moving __read_mostly;
> -
> static void mem_cgroup_start_move(struct mem_cgroup *memcg)
> {
> - atomic_inc(&memcg_moving);
> atomic_inc(&memcg->moving_account);
> synchronize_rcu();
> }
> @@ -1479,10 +1475,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
> * Now, mem_cgroup_clear_mc() may call this function with NULL.
> * We check NULL in callee rather than caller.
> */
> - if (memcg) {
> - atomic_dec(&memcg_moving);
> + if (memcg)
> atomic_dec(&memcg->moving_account);
> - }
> }
>
> /*
> @@ -2132,26 +2126,32 @@ cleanup:
> * account and taking the move_lock in the slowpath.
> */
>
> -void __mem_cgroup_begin_update_page_stat(struct page *page,
> - bool *locked, unsigned long *flags)
> +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
> + bool *locked,
> + unsigned long *flags)
> {
> struct mem_cgroup *memcg;
> struct page_cgroup *pc;
>
> + rcu_read_lock();
> +
> + if (mem_cgroup_disabled())
> + return NULL;
> +
> pc = lookup_page_cgroup(page);
> again:
> memcg = pc->mem_cgroup;
> if (unlikely(!memcg || !PageCgroupUsed(pc)))
> - return;
> + return NULL;
> /*
> * If this memory cgroup is not under account moving, we don't
> * need to take move_lock_mem_cgroup(). Because we already hold
> * rcu_read_lock(), any calls to move_account will be delayed until
> * rcu_read_unlock().
> */
> - VM_BUG_ON(!rcu_read_lock_held());
> + *locked = false;
> if (atomic_read(&memcg->moving_account) <= 0)
> - return;
> + return memcg;
>
> move_lock_mem_cgroup(memcg, flags);
> if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
> @@ -2159,36 +2159,26 @@ again:
> goto again;
> }
> *locked = true;
> +
> + return memcg;
> }
>
> -void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
> +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
> + unsigned long flags)
> {
> - struct page_cgroup *pc = lookup_page_cgroup(page);
> + if (memcg && locked)
> + move_unlock_mem_cgroup(memcg, &flags);
>
> - /*
> - * It's guaranteed that pc->mem_cgroup never changes while
> - * lock is held because a routine modifies pc->mem_cgroup
> - * should take move_lock_mem_cgroup().
> - */
> - move_unlock_mem_cgroup(pc->mem_cgroup, flags);
> + rcu_read_unlock();
> }
>
> -void mem_cgroup_update_page_stat(struct page *page,
> +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
> enum mem_cgroup_stat_index idx, int val)
> {
> - struct mem_cgroup *memcg;
> - struct page_cgroup *pc = lookup_page_cgroup(page);
> - unsigned long uninitialized_var(flags);
> -
> - if (mem_cgroup_disabled())
> - return;
> -
> VM_BUG_ON(!rcu_read_lock_held());
> - memcg = pc->mem_cgroup;
> - if (unlikely(!memcg || !PageCgroupUsed(pc)))
> - return;
>
> - this_cpu_add(memcg->stat->count[idx], val);
> + if (memcg)
> + this_cpu_add(memcg->stat->count[idx], val);
> }
>
> /*
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index ff6a5b07211e..19ceae87522d 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -2327,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
> int test_clear_page_writeback(struct page *page)
> {
> struct address_space *mapping = page_mapping(page);
> - int ret;
> - bool locked;
> unsigned long memcg_flags;
> + struct mem_cgroup *memcg;
> + bool locked;
> + int ret;
>
> - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
> + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
> if (mapping) {
> struct backing_dev_info *bdi = mapping->backing_dev_info;
> unsigned long flags;
> @@ -2352,22 +2353,23 @@ int test_clear_page_writeback(struct page *page)
> ret = TestClearPageWriteback(page);
> }
> if (ret) {
> - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
> + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
> dec_zone_page_state(page, NR_WRITEBACK);
> inc_zone_page_state(page, NR_WRITTEN);
> }
> - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
> + mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
> return ret;
> }
>
> int __test_set_page_writeback(struct page *page, bool keep_write)
> {
> struct address_space *mapping = page_mapping(page);
> - int ret;
> - bool locked;
> unsigned long memcg_flags;
> + struct mem_cgroup *memcg;
> + bool locked;
> + int ret;
>
> - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
> + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
> if (mapping) {
> struct backing_dev_info *bdi = mapping->backing_dev_info;
> unsigned long flags;
> @@ -2394,10 +2396,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
> ret = TestSetPageWriteback(page);
> }
> if (!ret) {
> - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
> + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
> inc_zone_page_state(page, NR_WRITEBACK);
> }
> - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
> + mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
> return ret;
>
> }
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 116a5053415b..f574046f77d4 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1042,15 +1042,16 @@ void page_add_new_anon_rmap(struct page *page,
> */
> void page_add_file_rmap(struct page *page)
> {
> - bool locked;
> + struct mem_cgroup *memcg;
> unsigned long flags;
> + bool locked;
>
> - mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
> if (atomic_inc_and_test(&page->_mapcount)) {
> __inc_zone_page_state(page, NR_FILE_MAPPED);
> - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
> + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
> }
> - mem_cgroup_end_update_page_stat(page, &locked, &flags);
> + mem_cgroup_end_page_stat(memcg, locked, flags);
> }
>
> /**
> @@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page)
> */
> void page_remove_rmap(struct page *page)
> {
> + struct mem_cgroup *uninitialized_var(memcg);
> bool anon = PageAnon(page);
> - bool locked;
> unsigned long flags;
> + bool locked;
>
> /*
> * The anon case has no mem_cgroup page_stat to update; but may
> @@ -1071,7 +1073,7 @@ void page_remove_rmap(struct page *page)
> * we hold the lock against page_stat move: so avoid it on anon.
> */
> if (!anon)
> - mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
>
> /* page still mapped by someone else? */
> if (!atomic_add_negative(-1, &page->_mapcount))
> @@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page)
> -hpage_nr_pages(page));
> } else {
> __dec_zone_page_state(page, NR_FILE_MAPPED);
> - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
> - mem_cgroup_end_update_page_stat(page, &locked, &flags);
> + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
> }
> if (unlikely(PageMlocked(page)))
> clear_page_mlock(page);
> @@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page)
> * Leaving it set also helps swapoff to reinstate ptes
> * faster for those pages still in swapcache.
> */
> - return;
> out:
> if (!anon)
> - mem_cgroup_end_update_page_stat(page, &locked, &flags);
> + mem_cgroup_end_page_stat(memcg, locked, flags);
> }
>
> /*
> --
> 2.1.2
>
--
Michal Hocko
SUSE Labs
On Wed, Oct 22, 2014 at 01:39:36PM -0700, Andrew Morton wrote:
> On Wed, 22 Oct 2014 14:29:28 -0400 Johannes Weiner <[email protected]> wrote:
>
> > 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page
> > migration to uncharge the old page right away. The page is locked,
> > unmapped, truncated, and off the LRU, but it could race with writeback
> > ending, which then doesn't unaccount the page properly:
> >
> > test_clear_page_writeback() migration
> > acquire pc->mem_cgroup->move_lock
> > wait_on_page_writeback()
> > TestClearPageWriteback()
> > mem_cgroup_migrate()
> > clear PCG_USED
> > if (PageCgroupUsed(pc))
> > decrease memcg pages under writeback
> > release pc->mem_cgroup->move_lock
> >
> > The per-page statistics interface is heavily optimized to avoid a
> > function call and a lookup_page_cgroup() in the file unmap fast path,
> > which means it doesn't verify whether a page is still charged before
> > clearing PageWriteback() and it has to do it in the stat update later.
> >
> > Rework it so that it looks up the page's memcg once at the beginning
> > of the transaction and then uses it throughout. The charge will be
> > verified before clearing PageWriteback() and migration can't uncharge
> > the page as long as that is still set. The RCU lock will protect the
> > memcg past uncharge.
> >
> > As far as losing the optimization goes, the following test results are
> > from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
> > three times in a nested fashion, so that there are two negative passes
> > that don't account but still go through the new transaction overhead.
> > There is no actual difference:
> >
> > old: 33.195102545 seconds time elapsed ( +- 0.01% )
> > new: 33.199231369 seconds time elapsed ( +- 0.03% )
> >
> > The time spent in page_remove_rmap()'s callees still adds up to the
> > same, but the time spent in the function itself seems reduced:
> >
> > # Children Self Command Shared Object Symbol
> > old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap
> > new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap
> >
> > ...
> >
> > @@ -2132,26 +2126,32 @@ cleanup:
> > * account and taking the move_lock in the slowpath.
> > */
> >
> > -void __mem_cgroup_begin_update_page_stat(struct page *page,
> > - bool *locked, unsigned long *flags)
> > +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
> > + bool *locked,
> > + unsigned long *flags)
>
> It would be useful to document the args here (especially `locked').
> Also the new rcu_read_locking protocol is worth a mention: that it
> exists, what it does, why it persists as long as it does.
Okay, I added full kernel docs that explain the RCU fast path, the
memcg->move_lock slow path, and the lifetime guarantee of RCU in cases
where the page state that is about to change is the only thing pinning
the charge, like in end-writeback.
---
>From 1808b8e2114a7d3cc6a0a52be2fe568ff6e1457e Mon Sep 17 00:00:00 2001
From: Johannes Weiner <[email protected]>
Date: Thu, 23 Oct 2014 09:12:01 -0400
Subject: [patch] mm: memcontrol: fix missed end-writeback page accounting fix
Add kernel-doc to page state accounting functions.
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/memcontrol.c | 51 +++++++++++++++++++++++++++++++++++----------------
1 file changed, 35 insertions(+), 16 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 024177df7aae..ae9b630e928b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2109,21 +2109,31 @@ cleanup:
return true;
}
-/*
- * Used to update mapped file or writeback or other statistics.
+/**
+ * mem_cgroup_begin_page_stat - begin a page state statistics transaction
+ * @page: page that is going to change accounted state
+ * @locked: &memcg->move_lock slowpath was taken
+ * @flags: IRQ-state flags for &memcg->move_lock
*
- * Notes: Race condition
+ * This function must mark the beginning of an accounted page state
+ * change to prevent double accounting when the page is concurrently
+ * being moved to another memcg:
*
- * Charging occurs during page instantiation, while the page is
- * unmapped and locked in page migration, or while the page table is
- * locked in THP migration. No race is possible.
+ * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ * if (TestClearPageState(page))
+ * mem_cgroup_update_page_stat(memcg, state, -1);
+ * mem_cgroup_end_page_stat(memcg, locked, flags);
*
- * Uncharge happens to pages with zero references, no race possible.
+ * The RCU lock is held throughout the transaction. The fast path can
+ * get away without acquiring the memcg->move_lock (@locked is false)
+ * because page moving starts with an RCU grace period.
*
- * Charge moving between groups is protected by checking mm->moving
- * account and taking the move_lock in the slowpath.
+ * The RCU lock also protects the memcg from being freed when the page
+ * state that is going to change is the only thing preventing the page
+ * from being uncharged. E.g. end-writeback clearing PageWriteback(),
+ * which allows migration to go ahead and uncharge the page before the
+ * account transaction might be complete.
*/
-
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
bool *locked,
unsigned long *flags)
@@ -2141,12 +2151,7 @@ again:
memcg = pc->mem_cgroup;
if (unlikely(!memcg))
return NULL;
- /*
- * If this memory cgroup is not under account moving, we don't
- * need to take move_lock_mem_cgroup(). Because we already hold
- * rcu_read_lock(), any calls to move_account will be delayed until
- * rcu_read_unlock().
- */
+
*locked = false;
if (atomic_read(&memcg->moving_account) <= 0)
return memcg;
@@ -2161,6 +2166,12 @@ again:
return memcg;
}
+/**
+ * mem_cgroup_end_page_stat - finish a page state statistics transaction
+ * @memcg: the memcg that was accounted against
+ * @locked: value received from mem_cgroup_begin_page_stat()
+ * @flags: value received from mem_cgroup_begin_page_stat()
+ */
void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
unsigned long flags)
{
@@ -2170,6 +2181,14 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
rcu_read_unlock();
}
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx, int val)
{
--
2.1.2
On Wed, Oct 22, 2014 at 01:39:36PM -0700, Andrew Morton wrote:
> On Wed, 22 Oct 2014 14:29:28 -0400 Johannes Weiner <[email protected]> wrote:
> > @@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page)
> > */
> > void page_remove_rmap(struct page *page)
> > {
> > + struct mem_cgroup *uninitialized_var(memcg);
> > bool anon = PageAnon(page);
> > - bool locked;
> > unsigned long flags;
> > + bool locked;
> >
> > /*
> > * The anon case has no mem_cgroup page_stat to update; but may
> > @@ -1071,7 +1073,7 @@ void page_remove_rmap(struct page *page)
> > * we hold the lock against page_stat move: so avoid it on anon.
> > */
> > if (!anon)
> > - mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> > + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
> >
> > /* page still mapped by someone else? */
> > if (!atomic_add_negative(-1, &page->_mapcount))
> > @@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page)
> > -hpage_nr_pages(page));
> > } else {
> > __dec_zone_page_state(page, NR_FILE_MAPPED);
> > - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
> > - mem_cgroup_end_update_page_stat(page, &locked, &flags);
> > + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
> > }
> > if (unlikely(PageMlocked(page)))
> > clear_page_mlock(page);
> > @@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page)
> > * Leaving it set also helps swapoff to reinstate ptes
> > * faster for those pages still in swapcache.
> > */
> > - return;
> > out:
> > if (!anon)
> > - mem_cgroup_end_update_page_stat(page, &locked, &flags);
> > + mem_cgroup_end_page_stat(memcg, locked, flags);
> > }
>
> The anon and file paths have as much unique code as they do common
> code. I wonder if page_remove_rmap() would come out better if split
> into two functions? I gave that a quick try and it came out OK-looking.
I agree, that looks better. How about this?
---
>From b518d88254b01be8c6c0c4a496d9f311f0c71b4a Mon Sep 17 00:00:00 2001
From: Johannes Weiner <[email protected]>
Date: Thu, 23 Oct 2014 09:29:06 -0400
Subject: [patch] mm: rmap: split out page_remove_file_rmap()
page_remove_rmap() has too many branches on PageAnon() and is hard to
follow. Move the file part into a separate function.
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/rmap.c | 78 +++++++++++++++++++++++++++++++++++++--------------------------
1 file changed, 46 insertions(+), 32 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index f574046f77d4..19886fb2f13a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1054,6 +1054,36 @@ void page_add_file_rmap(struct page *page)
mem_cgroup_end_page_stat(memcg, locked, flags);
}
+static void page_remove_file_rmap(struct page *page)
+{
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+ bool locked;
+
+ memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+
+ /* page still mapped by someone else? */
+ if (!atomic_add_negative(-1, &page->_mapcount))
+ goto out;
+
+ /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ if (unlikely(PageHuge(page)))
+ goto out;
+
+ /*
+ * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+ * these counters are not modified in interrupt context, and
+ * pte lock(a spinlock) is held, which implies preemption disabled.
+ */
+ __dec_zone_page_state(page, NR_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
+out:
+ mem_cgroup_end_page_stat(memcg, locked, flags);
+}
+
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
@@ -1062,46 +1092,33 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
- struct mem_cgroup *uninitialized_var(memcg);
- bool anon = PageAnon(page);
- unsigned long flags;
- bool locked;
-
- /*
- * The anon case has no mem_cgroup page_stat to update; but may
- * uncharge_page() below, where the lock ordering can deadlock if
- * we hold the lock against page_stat move: so avoid it on anon.
- */
- if (!anon)
- memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ if (!PageAnon(page)) {
+ page_remove_file_rmap(page);
+ return;
+ }
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
+ return;
+
+ /* Hugepages are not counted in NR_ANON_PAGES for now. */
+ if (unlikely(PageHuge(page)))
+ return;
/*
- * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
- * and not charged by memcg for now.
- *
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
- * these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- if (unlikely(PageHuge(page)))
- goto out;
- if (anon) {
- if (PageTransHuge(page))
- __dec_zone_page_state(page,
- NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- -hpage_nr_pages(page));
- } else {
- __dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
- }
+ if (PageTransHuge(page))
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+ -hpage_nr_pages(page));
+
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1111,9 +1128,6 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
-out:
- if (!anon)
- mem_cgroup_end_page_stat(memcg, locked, flags);
}
/*
--
2.1.2
On Thu, Oct 23, 2014 at 03:03:31PM +0200, Michal Hocko wrote:
> On Wed 22-10-14 14:29:28, Johannes Weiner wrote:
> > 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page
> > migration to uncharge the old page right away. The page is locked,
> > unmapped, truncated, and off the LRU, but it could race with writeback
> > ending, which then doesn't unaccount the page properly:
> >
> > test_clear_page_writeback() migration
> > acquire pc->mem_cgroup->move_lock
>
> I do not think that mentioning move_lock is important/helpful here
> because the hot path which is taken all the time (except when there is a
> task move in progress) doesn't take it.
> Besides that it is not even relevant for the race.
You're right. It's not worth mentioning the transaction setup/finish
at all, because migration does not participate in that protocol. How
about this? Andrew, could you please copy-paste this into the patch?
test_clear_page_writeback() migration
wait_on_page_writeback()
TestClearPageWriteback()
mem_cgroup_migrate()
clear PCG_USED
mem_cgroup_update_page_stat()
if (PageCgroupUsed(pc))
decrease memcg pages under writeback
On Thu 23-10-14 10:14:43, Johannes Weiner wrote:
> On Thu, Oct 23, 2014 at 03:03:31PM +0200, Michal Hocko wrote:
> > On Wed 22-10-14 14:29:28, Johannes Weiner wrote:
> > > 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page
> > > migration to uncharge the old page right away. The page is locked,
> > > unmapped, truncated, and off the LRU, but it could race with writeback
> > > ending, which then doesn't unaccount the page properly:
> > >
> > > test_clear_page_writeback() migration
> > > acquire pc->mem_cgroup->move_lock
> >
> > I do not think that mentioning move_lock is important/helpful here
> > because the hot path which is taken all the time (except when there is a
> > task move in progress) doesn't take it.
> > Besides that it is not even relevant for the race.
>
> You're right. It's not worth mentioning the transaction setup/finish
> at all, because migration does not participate in that protocol. How
> about this? Andrew, could you please copy-paste this into the patch?
>
> test_clear_page_writeback() migration
> wait_on_page_writeback()
> TestClearPageWriteback()
> mem_cgroup_migrate()
> clear PCG_USED
> mem_cgroup_update_page_stat()
> if (PageCgroupUsed(pc))
> decrease memcg pages under writeback
Yes, much better! Thanks!
--
Michal Hocko
SUSE Labs
On Thu 23-10-14 09:54:12, Johannes Weiner wrote:
[...]
> From 1808b8e2114a7d3cc6a0a52be2fe568ff6e1457e Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <[email protected]>
> Date: Thu, 23 Oct 2014 09:12:01 -0400
> Subject: [patch] mm: memcontrol: fix missed end-writeback page accounting fix
>
> Add kernel-doc to page state accounting functions.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Nice!
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/memcontrol.c | 51 +++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 35 insertions(+), 16 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 024177df7aae..ae9b630e928b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2109,21 +2109,31 @@ cleanup:
> return true;
> }
>
> -/*
> - * Used to update mapped file or writeback or other statistics.
> +/**
> + * mem_cgroup_begin_page_stat - begin a page state statistics transaction
> + * @page: page that is going to change accounted state
> + * @locked: &memcg->move_lock slowpath was taken
> + * @flags: IRQ-state flags for &memcg->move_lock
> *
> - * Notes: Race condition
> + * This function must mark the beginning of an accounted page state
> + * change to prevent double accounting when the page is concurrently
> + * being moved to another memcg:
> *
> - * Charging occurs during page instantiation, while the page is
> - * unmapped and locked in page migration, or while the page table is
> - * locked in THP migration. No race is possible.
> + * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
> + * if (TestClearPageState(page))
> + * mem_cgroup_update_page_stat(memcg, state, -1);
> + * mem_cgroup_end_page_stat(memcg, locked, flags);
> *
> - * Uncharge happens to pages with zero references, no race possible.
> + * The RCU lock is held throughout the transaction. The fast path can
> + * get away without acquiring the memcg->move_lock (@locked is false)
> + * because page moving starts with an RCU grace period.
> *
> - * Charge moving between groups is protected by checking mm->moving
> - * account and taking the move_lock in the slowpath.
> + * The RCU lock also protects the memcg from being freed when the page
> + * state that is going to change is the only thing preventing the page
> + * from being uncharged. E.g. end-writeback clearing PageWriteback(),
> + * which allows migration to go ahead and uncharge the page before the
> + * account transaction might be complete.
> */
> -
> struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
> bool *locked,
> unsigned long *flags)
> @@ -2141,12 +2151,7 @@ again:
> memcg = pc->mem_cgroup;
> if (unlikely(!memcg))
> return NULL;
> - /*
> - * If this memory cgroup is not under account moving, we don't
> - * need to take move_lock_mem_cgroup(). Because we already hold
> - * rcu_read_lock(), any calls to move_account will be delayed until
> - * rcu_read_unlock().
> - */
> +
> *locked = false;
> if (atomic_read(&memcg->moving_account) <= 0)
> return memcg;
> @@ -2161,6 +2166,12 @@ again:
> return memcg;
> }
>
> +/**
> + * mem_cgroup_end_page_stat - finish a page state statistics transaction
> + * @memcg: the memcg that was accounted against
> + * @locked: value received from mem_cgroup_begin_page_stat()
> + * @flags: value received from mem_cgroup_begin_page_stat()
> + */
> void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
> unsigned long flags)
> {
> @@ -2170,6 +2181,14 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
> rcu_read_unlock();
> }
>
> +/**
> + * mem_cgroup_update_page_stat - update page state statistics
> + * @memcg: memcg to account against
> + * @idx: page state item to account
> + * @val: number of pages (positive or negative)
> + *
> + * See mem_cgroup_begin_page_stat() for locking requirements.
> + */
> void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
> enum mem_cgroup_stat_index idx, int val)
> {
> --
> 2.1.2
>
--
Michal Hocko
SUSE Labs
On Thu 23-10-14 09:57:29, Johannes Weiner wrote:
[...]
> From b518d88254b01be8c6c0c4a496d9f311f0c71b4a Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <[email protected]>
> Date: Thu, 23 Oct 2014 09:29:06 -0400
> Subject: [patch] mm: rmap: split out page_remove_file_rmap()
>
> page_remove_rmap() has too many branches on PageAnon() and is hard to
> follow. Move the file part into a separate function.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Reviewed-by: Michal Hocko <[email protected]>
> ---
> mm/rmap.c | 78 +++++++++++++++++++++++++++++++++++++--------------------------
> 1 file changed, 46 insertions(+), 32 deletions(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index f574046f77d4..19886fb2f13a 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1054,6 +1054,36 @@ void page_add_file_rmap(struct page *page)
> mem_cgroup_end_page_stat(memcg, locked, flags);
> }
>
> +static void page_remove_file_rmap(struct page *page)
> +{
> + struct mem_cgroup *memcg;
> + unsigned long flags;
> + bool locked;
> +
> + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
> +
> + /* page still mapped by someone else? */
> + if (!atomic_add_negative(-1, &page->_mapcount))
> + goto out;
> +
> + /* Hugepages are not counted in NR_FILE_MAPPED for now. */
> + if (unlikely(PageHuge(page)))
> + goto out;
> +
> + /*
> + * We use the irq-unsafe __{inc|mod}_zone_page_stat because
> + * these counters are not modified in interrupt context, and
> + * pte lock(a spinlock) is held, which implies preemption disabled.
> + */
> + __dec_zone_page_state(page, NR_FILE_MAPPED);
> + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
> +
> + if (unlikely(PageMlocked(page)))
> + clear_page_mlock(page);
> +out:
> + mem_cgroup_end_page_stat(memcg, locked, flags);
> +}
> +
> /**
> * page_remove_rmap - take down pte mapping from a page
> * @page: page to remove mapping from
> @@ -1062,46 +1092,33 @@ void page_add_file_rmap(struct page *page)
> */
> void page_remove_rmap(struct page *page)
> {
> - struct mem_cgroup *uninitialized_var(memcg);
> - bool anon = PageAnon(page);
> - unsigned long flags;
> - bool locked;
> -
> - /*
> - * The anon case has no mem_cgroup page_stat to update; but may
> - * uncharge_page() below, where the lock ordering can deadlock if
> - * we hold the lock against page_stat move: so avoid it on anon.
> - */
> - if (!anon)
> - memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
> + if (!PageAnon(page)) {
> + page_remove_file_rmap(page);
> + return;
> + }
>
> /* page still mapped by someone else? */
> if (!atomic_add_negative(-1, &page->_mapcount))
> - goto out;
> + return;
> +
> + /* Hugepages are not counted in NR_ANON_PAGES for now. */
> + if (unlikely(PageHuge(page)))
> + return;
>
> /*
> - * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
> - * and not charged by memcg for now.
> - *
> * We use the irq-unsafe __{inc|mod}_zone_page_stat because
> * these counters are not modified in interrupt context, and
> - * these counters are not modified in interrupt context, and
> * pte lock(a spinlock) is held, which implies preemption disabled.
> */
> - if (unlikely(PageHuge(page)))
> - goto out;
> - if (anon) {
> - if (PageTransHuge(page))
> - __dec_zone_page_state(page,
> - NR_ANON_TRANSPARENT_HUGEPAGES);
> - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
> - -hpage_nr_pages(page));
> - } else {
> - __dec_zone_page_state(page, NR_FILE_MAPPED);
> - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
> - }
> + if (PageTransHuge(page))
> + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
> +
> + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
> + -hpage_nr_pages(page));
> +
> if (unlikely(PageMlocked(page)))
> clear_page_mlock(page);
> +
> /*
> * It would be tidy to reset the PageAnon mapping here,
> * but that might overwrite a racing page_add_anon_rmap
> @@ -1111,9 +1128,6 @@ void page_remove_rmap(struct page *page)
> * Leaving it set also helps swapoff to reinstate ptes
> * faster for those pages still in swapcache.
> */
> -out:
> - if (!anon)
> - mem_cgroup_end_page_stat(memcg, locked, flags);
> }
>
> /*
> --
> 2.1.2
>
--
Michal Hocko
SUSE Labs