Currently there are many open-coded reads and writes of the
page->mem_cgroup pointer, as well as a couple of read helpers,
which are barely used.
This creates an obstacle to reusing some bits of the pointer for
storing additional information. In fact, we already do this for slab
pages, where the lowest bit indicates that the pointer has an attached
vector of objcg pointers instead of a regular memcg pointer.
This commit introduces 4 new helper functions and converts all
raw accesses to page->mem_cgroup to calls to these helpers:
struct mem_cgroup *page_mem_cgroup(struct page *page);
struct mem_cgroup *page_mem_cgroup_check(struct page *page);
void set_page_mem_cgroup(struct page *page, struct mem_cgroup *memcg);
void clear_page_mem_cgroup(struct page *page);
page_mem_cgroup_check() is intended to be used in cases when the page
can be a slab page and have a memcg pointer pointing at an objcg
vector. It checks the lowest bit and, if it is set, returns NULL.
page_mem_cgroup() contains a VM_BUG_ON_PAGE() check that the page is
not a slab page; so do set_page_mem_cgroup() and clear_page_mem_cgroup().
To make sure nobody uses a direct access, struct page's
mem_cgroup/obj_cgroups union is converted to an unsigned long
memcg_data field. Only the new helpers and a couple of slab-accounting
related functions access this field directly.
The page_memcg() and page_memcg_rcu() helpers defined in mm.h are
removed. The new page_mem_cgroup() is a direct analog of page_memcg(),
while page_memcg_rcu() has a single call site in a small
rcu-read-locked section, so it's just not worth having a separate
helper; it's replaced with page_mem_cgroup() too.
Signed-off-by: Roman Gushchin <[email protected]>
Reviewed-by: Shakeel Butt <[email protected]>
---
fs/buffer.c | 2 +-
fs/iomap/buffered-io.c | 2 +-
include/linux/memcontrol.h | 103 +++++++++++++++++++++++---
include/linux/mm.h | 22 ------
include/linux/mm_types.h | 5 +-
include/trace/events/writeback.h | 2 +-
kernel/fork.c | 7 +-
mm/debug.c | 4 +-
mm/huge_memory.c | 4 +-
mm/memcontrol.c | 119 ++++++++++++++-----------------
mm/migrate.c | 2 +-
mm/page_alloc.c | 4 +-
mm/page_io.c | 4 +-
mm/slab.h | 9 ++-
mm/workingset.c | 6 +-
15 files changed, 170 insertions(+), 125 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index bf4d8037f91b..64564ac7dcc5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -657,7 +657,7 @@ int __set_page_dirty_buffers(struct page *page)
} while (bh != head);
}
/*
- * Lock out page->mem_cgroup migration to keep PageDirty
+ * Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
lock_page_memcg(page);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 897ab9a26a74..71381931f2c3 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -669,7 +669,7 @@ iomap_set_page_dirty(struct page *page)
return !TestSetPageDirty(page);
/*
- * Lock out page->mem_cgroup migration to keep PageDirty
+ * Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
lock_page_memcg(page);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e391e3c56de5..3313e7c21534 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -343,6 +343,72 @@ struct mem_cgroup {
extern struct mem_cgroup *root_mem_cgroup;
+/*
+ * page_mem_cgroup - get the memory cgroup associated with a page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the memory cgroup associated with the page,
+ * or NULL. This function assumes that the page is known to have a
+ * proper memory cgroup pointer. It's not safe to call this function
+ * against some type of pages, e.g. slab pages or ex-slab pages.
+ */
+static inline struct mem_cgroup *page_mem_cgroup(struct page *page)
+{
+ VM_BUG_ON_PAGE(PageSlab(page), page);
+ return (struct mem_cgroup *)page->memcg_data;
+}
+
+/*
+ * page_mem_cgroup_check - get the memory cgroup associated with a page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the memory cgroup associated with the page,
+ * or NULL. This function unlike page_mem_cgroup() can take any page
+ * as an argument. It has to be used in cases when it's not known if a page
+ * has an associated memory cgroup pointer or an object cgroups vector.
+ */
+static inline struct mem_cgroup *page_mem_cgroup_check(struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ /*
+ * The lowest bit set means that memcg isn't a valid
+ * memcg pointer, but an obj_cgroups pointer.
+ * In this case the page is shared and doesn't belong
+ * to any specific memory cgroup.
+ */
+ if (memcg_data & 0x1UL)
+ return NULL;
+
+ return (struct mem_cgroup *)memcg_data;
+}
+
+/*
+ * set_page_mem_cgroup - associate a page with a memory cgroup
+ * @page: a pointer to the page struct
+ * @memcg: a pointer to the memory cgroup
+ *
+ * Associates a page with a memory cgroup.
+ */
+static inline void set_page_mem_cgroup(struct page *page,
+ struct mem_cgroup *memcg)
+{
+ VM_BUG_ON_PAGE(PageSlab(page), page);
+ page->memcg_data = (unsigned long)memcg;
+}
+
+/*
+ * clear_page_mem_cgroup - clear an association of a page with a memory cgroup
+ * @page: a pointer to the page struct
+ *
+ * Clears an association of a page with a memory cgroup.
+ */
+static inline void clear_page_mem_cgroup(struct page *page)
+{
+ VM_BUG_ON_PAGE(PageSlab(page), page);
+ page->memcg_data = 0;
+}
+
static __always_inline bool memcg_stat_item_in_bytes(int idx)
{
if (idx == MEMCG_PERCPU_B)
@@ -743,15 +809,15 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
static inline void __mod_memcg_page_state(struct page *page,
int idx, int val)
{
- if (page->mem_cgroup)
- __mod_memcg_state(page->mem_cgroup, idx, val);
+ if (page_mem_cgroup(page))
+ __mod_memcg_state(page_mem_cgroup(page), idx, val);
}
static inline void mod_memcg_page_state(struct page *page,
int idx, int val)
{
- if (page->mem_cgroup)
- mod_memcg_state(page->mem_cgroup, idx, val);
+ if (page_mem_cgroup(page))
+ mod_memcg_state(page_mem_cgroup(page), idx, val);
}
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
@@ -838,12 +904,12 @@ static inline void __mod_lruvec_page_state(struct page *page,
struct lruvec *lruvec;
/* Untracked pages have no memcg, no lruvec. Update only the node */
- if (!head->mem_cgroup) {
+ if (!page_mem_cgroup(head)) {
__mod_node_page_state(pgdat, idx, val);
return;
}
- lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
+ lruvec = mem_cgroup_lruvec(page_mem_cgroup(head), pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
@@ -878,8 +944,8 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
static inline void count_memcg_page_event(struct page *page,
enum vm_event_item idx)
{
- if (page->mem_cgroup)
- count_memcg_events(page->mem_cgroup, idx, 1);
+ if (page_mem_cgroup(page))
+ count_memcg_events(page_mem_cgroup(page), idx, 1);
}
static inline void count_memcg_event_mm(struct mm_struct *mm,
@@ -941,6 +1007,25 @@ void mem_cgroup_split_huge_fixup(struct page *head);
struct mem_cgroup;
+static inline struct mem_cgroup *page_mem_cgroup(struct page *page)
+{
+ return NULL;
+}
+
+static inline struct mem_cgroup *page_mem_cgroup_check(struct page *page)
+{
+ return NULL;
+}
+
+static inline void set_page_mem_cgroup(struct page *page,
+ struct mem_cgroup *memcg)
+{
+}
+
+static inline void clear_page_mem_cgroup(struct page *page)
+{
+}
+
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return true;
@@ -1430,7 +1515,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page,
if (mem_cgroup_disabled())
return;
- if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
+ if (unlikely(&page_mem_cgroup(page)->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(page, wb);
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17e712207d74..5e24ff2ffec9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1476,28 +1476,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
#endif
}
-#ifdef CONFIG_MEMCG
-static inline struct mem_cgroup *page_memcg(struct page *page)
-{
- return page->mem_cgroup;
-}
-static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
- return READ_ONCE(page->mem_cgroup);
-}
-#else
-static inline struct mem_cgroup *page_memcg(struct page *page)
-{
- return NULL;
-}
-static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
- return NULL;
-}
-#endif
-
/*
* Some inline functions in vmstat.h depend on page_zone()
*/
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 496c3ff97cce..4856d23b1161 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -199,10 +199,7 @@ struct page {
atomic_t _refcount;
#ifdef CONFIG_MEMCG
- union {
- struct mem_cgroup *mem_cgroup;
- struct obj_cgroup **obj_cgroups;
- };
+ unsigned long memcg_data;
#endif
/*
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index e7cbccc7c14c..b1fa3ac64fa5 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -257,7 +257,7 @@ TRACE_EVENT(track_foreign_dirty,
__entry->ino = inode ? inode->i_ino : 0;
__entry->memcg_id = wb->memcg_css->id;
__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
- __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup);
+ __entry->page_cgroup_ino = cgroup_ino(page_mem_cgroup(page)->css.cgroup);
),
TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
diff --git a/kernel/fork.c b/kernel/fork.c
index 138cd6ca50da..ecbd44831130 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -403,9 +403,10 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
/*
- * If memcg_kmem_charge_page() fails, page->mem_cgroup
- * pointer is NULL, and memcg_kmem_uncharge_page() in
- * free_thread_stack() will ignore this page.
+ * If memcg_kmem_charge_page() fails, page's
+ * memory cgroup pointer is NULL, and
+ * memcg_kmem_uncharge_page() in free_thread_stack()
+ * will ignore this page.
*/
ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
0);
diff --git a/mm/debug.c b/mm/debug.c
index ccca576b2899..dcbe009a92de 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -182,8 +182,8 @@ void __dump_page(struct page *page, const char *reason)
pr_warn("page dumped because: %s\n", reason);
#ifdef CONFIG_MEMCG
- if (!page_poisoned && page->mem_cgroup)
- pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
+ if (!page_poisoned && page->memcg_data)
+ pr_warn("pages's memcg:%px\n", page->memcg_data);
#endif
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b1c7dc8a6f96..8e9b2749ef21 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -470,7 +470,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
#ifdef CONFIG_MEMCG
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
- struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct mem_cgroup *memcg = page_mem_cgroup(compound_head(page));
struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
if (memcg)
@@ -2735,7 +2735,7 @@ void deferred_split_huge_page(struct page *page)
{
struct deferred_split *ds_queue = get_deferred_split_queue(page);
#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct mem_cgroup *memcg = page_mem_cgroup(compound_head(page));
#endif
unsigned long flags;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9c4a0851348f..40220b7bf96d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -533,7 +533,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
struct mem_cgroup *memcg;
- memcg = page->mem_cgroup;
+ memcg = page_mem_cgroup(page);
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
@@ -560,16 +560,7 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- memcg = page->mem_cgroup;
-
- /*
- * The lowest bit set means that memcg isn't a valid
- * memcg pointer, but a obj_cgroups pointer.
- * In this case the page is shared and doesn't belong
- * to any specific memory cgroup.
- */
- if ((unsigned long) memcg & 0x1UL)
- memcg = NULL;
+ memcg = page_mem_cgroup_check(page);
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
@@ -1050,7 +1041,7 @@ EXPORT_SYMBOL(get_mem_cgroup_from_mm);
*/
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = page_mem_cgroup(page);
if (mem_cgroup_disabled())
return NULL;
@@ -1335,7 +1326,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
* @page: the page
* @pgdat: pgdat of the page
*
- * This function relies on page->mem_cgroup being stable - see the
+ * This function relies on page and memcg binding being stable - see the
* access rules in commit_charge().
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
@@ -1349,7 +1340,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd
goto out;
}
- memcg = page->mem_cgroup;
+ memcg = page_mem_cgroup(page);
/*
* Swapcache readahead pages are added to the LRU - and
* possibly migrated - before they are charged.
@@ -2105,7 +2096,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
/**
- * lock_page_memcg - lock a page->mem_cgroup binding
+ * lock_page_memcg - lock a page and memcg binding
* @page: the page
*
* This function protects unlocked LRU pages from being moved to
@@ -2137,7 +2128,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
if (mem_cgroup_disabled())
return NULL;
again:
- memcg = head->mem_cgroup;
+ memcg = page_mem_cgroup(head);
if (unlikely(!memcg))
return NULL;
@@ -2145,7 +2136,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != head->mem_cgroup) {
+ if (memcg != page_mem_cgroup(head)) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
@@ -2183,14 +2174,14 @@ void __unlock_page_memcg(struct mem_cgroup *memcg)
}
/**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * unlock_page_memcg - unlock a page and memcg binding
* @page: the page
*/
void unlock_page_memcg(struct page *page)
{
struct page *head = compound_head(page);
- __unlock_page_memcg(head->mem_cgroup);
+ __unlock_page_memcg(page_mem_cgroup(head));
}
EXPORT_SYMBOL(unlock_page_memcg);
@@ -2880,16 +2871,16 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
- VM_BUG_ON_PAGE(page->mem_cgroup, page);
+ VM_BUG_ON_PAGE(page_mem_cgroup(page), page);
/*
- * Any of the following ensures page->mem_cgroup stability:
+ * Any of the following ensures page and memcg binding stability:
*
* - the page lock
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
*/
- page->mem_cgroup = memcg;
+ set_page_mem_cgroup(page, memcg);
}
#ifdef CONFIG_MEMCG_KMEM
@@ -2904,8 +2895,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
if (!vec)
return -ENOMEM;
- if (cmpxchg(&page->obj_cgroups, NULL,
- (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ if (cmpxchg(&page->memcg_data, 0, (unsigned long)vec | 0x1UL))
kfree(vec);
else
kmemleak_not_leak(vec);
@@ -2928,17 +2918,6 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
- /*
- * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
- * or a pointer to obj_cgroup vector. In the latter case the lowest
- * bit of the pointer is set.
- * The page->mem_cgroup pointer can be asynchronously changed
- * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
- * from a valid memcg pointer to objcg vector or back.
- */
- if (!page->mem_cgroup)
- return NULL;
-
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
@@ -2956,8 +2935,14 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
return NULL;
}
- /* All other pages use page->mem_cgroup */
- return page->mem_cgroup;
+ /*
+ * page_mem_cgroup_check() is used here, because the page_has_obj_cgroups()
+ * check above could fail: the object cgroups vector wasn't set
+ * at that moment, but it can be set concurrently.
+ * page_mem_cgroup_check(page) will guarantee that a proper memory
+ * cgroup pointer or NULL will be returned.
+ */
+ return page_mem_cgroup_check(page);
}
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
@@ -3095,7 +3080,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
if (memcg && !mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
if (!ret) {
- page->mem_cgroup = memcg;
+ set_page_mem_cgroup(page, memcg);
__SetPageKmemcg(page);
return 0;
}
@@ -3111,7 +3096,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = page_mem_cgroup(page);
unsigned int nr_pages = 1 << order;
if (!memcg)
@@ -3119,7 +3104,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
__memcg_kmem_uncharge(memcg, nr_pages);
- page->mem_cgroup = NULL;
+ clear_page_mem_cgroup(page);
css_put(&memcg->css);
/* slab pages do not have PageKmemcg flag set */
@@ -3270,7 +3255,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
- struct mem_cgroup *memcg = head->mem_cgroup;
+ struct mem_cgroup *memcg = page_mem_cgroup(head);
int i;
if (mem_cgroup_disabled())
@@ -3278,7 +3263,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++) {
css_get(&memcg->css);
- head[i].mem_cgroup = memcg;
+ set_page_mem_cgroup(&head[i], memcg);
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -4654,7 +4639,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
struct bdi_writeback *wb)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = page_mem_cgroup(page);
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
@@ -5623,14 +5608,14 @@ static int mem_cgroup_move_account(struct page *page,
/*
* Prevent mem_cgroup_migrate() from looking at
- * page->mem_cgroup of its source page while we change it.
+ * the source page's memory cgroup while we change it.
*/
ret = -EBUSY;
if (!trylock_page(page))
goto out;
ret = -EINVAL;
- if (page->mem_cgroup != from)
+ if (page_mem_cgroup(page) != from)
goto out_unlock;
pgdat = page_pgdat(page);
@@ -5685,13 +5670,13 @@ static int mem_cgroup_move_account(struct page *page,
/*
* All state has been migrated, let's switch to the new memcg.
*
- * It is safe to change page->mem_cgroup here because the page
+ * It is safe to change page's memcg here because the page
* is referenced, charged, isolated, and locked: we can't race
* with (un)charging, migration, LRU putback, or anything else
- * that would rely on a stable page->mem_cgroup.
+ * that would rely on a stable page's memory cgroup.
*
* Note that lock_page_memcg is a memcg lock, not a page lock,
- * to save space. As soon as we switch page->mem_cgroup to a
+ * to save space. As soon as we switch page's memory cgroup to a
* new memcg that isn't locked, the above state can change
* concurrently again. Make sure we're truly done with it.
*/
@@ -5700,7 +5685,7 @@ static int mem_cgroup_move_account(struct page *page,
css_get(&to->css);
css_put(&from->css);
- page->mem_cgroup = to;
+ set_page_mem_cgroup(page, to);
__unlock_page_memcg(from);
@@ -5766,7 +5751,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
- if (page->mem_cgroup == mc.from) {
+ if (page_mem_cgroup(page) == mc.from) {
ret = MC_TARGET_PAGE;
if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
@@ -5810,7 +5795,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!(mc.flags & MOVE_ANON))
return ret;
- if (page->mem_cgroup == mc.from) {
+ if (page_mem_cgroup(page) == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
@@ -6793,12 +6778,12 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. page->mem_cgroup is protected
- * by the page lock, which serializes swap cache removal, which
- * in turn serializes uncharging.
+ * already charged pages, too. page and memcg binding is
+ * protected by the page lock, which serializes swap cache
+ * removal, which in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (compound_head(page)->mem_cgroup)
+ if (page_mem_cgroup(compound_head(page)))
goto out;
id = lookup_swap_cgroup_id(ent);
@@ -6882,21 +6867,21 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (!page->mem_cgroup)
+ if (!page_mem_cgroup(page))
return;
/*
* Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point, we have fully
+ * page_mem_cgroup(page) at this point, we have fully
* exclusive access to the page.
*/
- if (ug->memcg != page->mem_cgroup) {
+ if (ug->memcg != page_mem_cgroup(page)) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = page->mem_cgroup;
+ ug->memcg = page_mem_cgroup(page);
/* pairs with css_put in uncharge_batch */
css_get(&ug->memcg->css);
@@ -6913,7 +6898,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
}
ug->dummy_page = page;
- page->mem_cgroup = NULL;
+ clear_page_mem_cgroup(page);
css_put(&ug->memcg->css);
}
@@ -6956,7 +6941,7 @@ void mem_cgroup_uncharge(struct page *page)
return;
/* Don't touch page->lru of any random page, pre-check: */
- if (!page->mem_cgroup)
+ if (!page_mem_cgroup(page))
return;
uncharge_gather_clear(&ug);
@@ -7006,11 +6991,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return;
/* Page cache replacement: new page already charged? */
- if (newpage->mem_cgroup)
+ if (page_mem_cgroup(newpage))
return;
/* Swapcache readahead pages can get replaced before being charged */
- memcg = oldpage->mem_cgroup;
+ memcg = page_mem_cgroup(oldpage);
if (!memcg)
return;
@@ -7205,7 +7190,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
- memcg = page->mem_cgroup;
+ memcg = page_mem_cgroup(page);
/* Readahead page, never charged */
if (!memcg)
@@ -7226,7 +7211,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(oldid, page);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- page->mem_cgroup = NULL;
+ clear_page_mem_cgroup(page);
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
@@ -7269,7 +7254,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
- memcg = page->mem_cgroup;
+ memcg = page_mem_cgroup(page);
/* Readahead page, never charged */
if (!memcg)
@@ -7350,7 +7335,7 @@ bool mem_cgroup_swap_full(struct page *page)
if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
- memcg = page->mem_cgroup;
+ memcg = page_mem_cgroup(page);
if (!memcg)
return false;
diff --git a/mm/migrate.c b/mm/migrate.c
index 3ab965f83029..54c198c97b64 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -493,7 +493,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
struct lruvec *old_lruvec, *new_lruvec;
struct mem_cgroup *memcg;
- memcg = page_memcg(page);
+ memcg = page_mem_cgroup(page);
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6b1b4a331792..d4d181e15e7c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1056,7 +1056,7 @@ static inline bool page_expected_state(struct page *page,
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
- (unsigned long)page->mem_cgroup |
+ (unsigned long)page_mem_cgroup(page) |
#endif
(page->flags & check_flags)))
return false;
@@ -1081,7 +1081,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
- if (unlikely(page->mem_cgroup))
+ if (unlikely(page_mem_cgroup(page)))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
diff --git a/mm/page_io.c b/mm/page_io.c
index dc6de6962612..ffa3a7d20c58 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -282,11 +282,11 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
{
struct cgroup_subsys_state *css;
- if (!page->mem_cgroup)
+ if (!page_mem_cgroup(page))
return;
rcu_read_lock();
- css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+ css = cgroup_e_css(page_mem_cgroup(page)->css.cgroup, &io_cgrp_subsys);
bio_associate_blkg_from_css(bio, css);
rcu_read_unlock();
}
diff --git a/mm/slab.h b/mm/slab.h
index 4a24e1702923..5ac89260f329 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -242,18 +242,17 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
{
/*
- * page->mem_cgroup and page->obj_cgroups are sharing the same
+ * Page's memory cgroup and obj_cgroups vector are sharing the same
* space. To distinguish between them in case we don't know for sure
* that the page is a slab page (e.g. page_cgroup_ino()), let's
* always set the lowest bit of obj_cgroups.
*/
- return (struct obj_cgroup **)
- ((unsigned long)page->obj_cgroups & ~0x1UL);
+ return (struct obj_cgroup **)(page->memcg_data & ~0x1UL);
}
static inline bool page_has_obj_cgroups(struct page *page)
{
- return ((unsigned long)page->obj_cgroups & 0x1UL);
+ return page->memcg_data & 0x1UL;
}
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
@@ -262,7 +261,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
kfree(page_obj_cgroups(page));
- page->obj_cgroups = NULL;
+ page->memcg_data = 0;
}
static inline size_t obj_full_size(struct kmem_cache *s)
diff --git a/mm/workingset.c b/mm/workingset.c
index 8ed8e6296d8c..8a56f4ca4a2e 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -257,7 +257,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
struct lruvec *lruvec;
int memcgid;
- /* Page is fully exclusive and pins page->mem_cgroup */
+ /* Page is fully exclusive and pins page's memory cgroup pointer */
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -345,7 +345,7 @@ void workingset_refault(struct page *page, void *shadow)
* However, the cgroup that will own the page is the one that
* is actually experiencing the refault event.
*/
- memcg = page_memcg(page);
+ memcg = page_mem_cgroup(page);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
@@ -407,7 +407,7 @@ void workingset_activation(struct page *page)
* XXX: See workingset_refault() - this should return
* root_mem_cgroup even for !CONFIG_MEMCG.
*/
- memcg = page_memcg_rcu(page);
+ memcg = page_mem_cgroup(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
--
2.26.2
On Tue, Sep 22, 2020 at 01:36:57PM -0700, Roman Gushchin wrote:
> Currently there are many open-coded reads and writes of the
> page->mem_cgroup pointer, as well as a couple of read helpers,
> which are barely used.
>
> It creates an obstacle on a way to reuse some bits of the pointer
> for storing additional bits of information. In fact, we already do
> this for slab pages, where the last bit indicates that a pointer has
> an attached vector of objcg pointers instead of a regular memcg
> pointer.
>
> This commits introduces 4 new helper functions and converts all
> raw accesses to page->mem_cgroup to calls of these helpers:
> struct mem_cgroup *page_mem_cgroup(struct page *page);
> struct mem_cgroup *page_mem_cgroup_check(struct page *page);
> void set_page_mem_cgroup(struct page *page, struct mem_cgroup *memcg);
> void clear_page_mem_cgroup(struct page *page);
Sounds reasonable to me!
> page_mem_cgroup_check() is intended to be used in cases when the page
> can be a slab page and have a memcg pointer pointing at objcg vector.
> It does check the lowest bit, and if set, returns NULL.
> page_mem_cgroup() contains a VM_BUG_ON_PAGE() check for the page not
> being a slab page. So do set_page_mem_cgroup() and clear_page_mem_cgroup().
>
> To make sure nobody uses a direct access, struct page's
> mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
> Only new helpers and a couple of slab-accounting related functions
> access this field directly.
>
> page_memcg() and page_memcg_rcu() helpers defined in mm.h are removed.
> New page_mem_cgroup() is a direct analog of page_memcg(), while
> page_memcg_rcu() has a single call site in a small rcu-read-lock
> section, so it's just not worth it to have a separate helper. So
> it's replaced with page_mem_cgroup() too.
page_memcg_rcu() does READ_ONCE(). We need to keep that for lockless
accesses.
> @@ -343,6 +343,72 @@ struct mem_cgroup {
>
> extern struct mem_cgroup *root_mem_cgroup;
>
> +/*
> + * page_mem_cgroup - get the memory cgroup associated with a page
> + * @page: a pointer to the page struct
> + *
> + * Returns a pointer to the memory cgroup associated with the page,
> + * or NULL. This function assumes that the page is known to have a
> + * proper memory cgroup pointer. It's not safe to call this function
> + * against some type of pages, e.g. slab pages or ex-slab pages.
> + */
> +static inline struct mem_cgroup *page_mem_cgroup(struct page *page)
> +{
> + VM_BUG_ON_PAGE(PageSlab(page), page);
> + return (struct mem_cgroup *)page->memcg_data;
> +}
This would also be a good place to mention what's required for the
function to be called safely, or in a way that produces a stable
result - i.e. the list of conditions in commit_charge().
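For example, something along these lines (rough sketch, wording up to you;
the conditions list is copied from the commit_charge() comment):

	/*
	 * page_mem_cgroup - get the memory cgroup associated with a page
	 * @page: a pointer to the page struct
	 *
	 * Returns a pointer to the memory cgroup associated with the page,
	 * or NULL. Any of the following ensures the result is stable:
	 *
	 * - the page lock
	 * - LRU isolation
	 * - lock_page_memcg()
	 * - exclusive reference
	 *
	 * It's not safe to call this function against some types of pages,
	 * e.g. slab pages or ex-slab pages.
	 */
	static inline struct mem_cgroup *page_mem_cgroup(struct page *page)
	{
		VM_BUG_ON_PAGE(PageSlab(page), page);
		return (struct mem_cgroup *)page->memcg_data;
	}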
> + * page_mem_cgroup_check - get the memory cgroup associated with a page
> + * @page: a pointer to the page struct
> + *
> + * Returns a pointer to the memory cgroup associated with the page,
> + * or NULL. This function unlike page_mem_cgroup() can take any page
> + * as an argument. It has to be used in cases when it's not known if a page
> + * has an associated memory cgroup pointer or an object cgroups vector.
> + */
> +static inline struct mem_cgroup *page_mem_cgroup_check(struct page *page)
> +{
> + unsigned long memcg_data = page->memcg_data;
> +
> + /*
> + * The lowest bit set means that memcg isn't a valid
> + * memcg pointer, but a obj_cgroups pointer.
> + * In this case the page is shared and doesn't belong
> + * to any specific memory cgroup.
> + */
> + if (memcg_data & 0x1UL)
> + return NULL;
> +
> + return (struct mem_cgroup *)memcg_data;
> +}
Here as well.
> +
> +/*
> + * set_page_mem_cgroup - associate a page with a memory cgroup
> + * @page: a pointer to the page struct
> + * @memcg: a pointer to the memory cgroup
> + *
> + * Associates a page with a memory cgroup.
> + */
> +static inline void set_page_mem_cgroup(struct page *page,
> + struct mem_cgroup *memcg)
> +{
> + VM_BUG_ON_PAGE(PageSlab(page), page);
> + page->memcg_data = (unsigned long)memcg;
> +}
> +
> +/*
> + * clear_page_mem_cgroup - clear an association of a page with a memory cgroup
> + * @page: a pointer to the page struct
> + *
> + * Clears an association of a page with a memory cgroup.
> + */
> +static inline void clear_page_mem_cgroup(struct page *page)
> +{
> + VM_BUG_ON_PAGE(PageSlab(page), page);
> + page->memcg_data = 0;
> +}
> +
> static __always_inline bool memcg_stat_item_in_bytes(int idx)
> {
> if (idx == MEMCG_PERCPU_B)
> @@ -743,15 +809,15 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
> static inline void __mod_memcg_page_state(struct page *page,
> int idx, int val)
> {
> - if (page->mem_cgroup)
> - __mod_memcg_state(page->mem_cgroup, idx, val);
> + if (page_mem_cgroup(page))
> + __mod_memcg_state(page_mem_cgroup(page), idx, val);
> }
>
> static inline void mod_memcg_page_state(struct page *page,
> int idx, int val)
> {
> - if (page->mem_cgroup)
> - mod_memcg_state(page->mem_cgroup, idx, val);
> + if (page_mem_cgroup(page))
> + mod_memcg_state(page_mem_cgroup(page), idx, val);
> }
>
> static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
> @@ -838,12 +904,12 @@ static inline void __mod_lruvec_page_state(struct page *page,
> struct lruvec *lruvec;
>
> /* Untracked pages have no memcg, no lruvec. Update only the node */
> - if (!head->mem_cgroup) {
> + if (!page_mem_cgroup(head)) {
> __mod_node_page_state(pgdat, idx, val);
> return;
> }
>
> - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
> + lruvec = mem_cgroup_lruvec(page_mem_cgroup(head), pgdat);
> __mod_lruvec_state(lruvec, idx, val);
The repetition of the function call is a bit jarring, especially in
configs with VM_BUG_ON() enabled (some distros use it for their beta
release kernels, so it's not just kernel developer test machines that
pay this cost). Can you please use a local variable when the function
needs the memcg more than once?
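E.g. for __mod_lruvec_page_state() above, something like (untested sketch):

	struct mem_cgroup *memcg = page_mem_cgroup(head);

	/* Untracked pages have no memcg, no lruvec. Update only the node */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);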
> @@ -878,8 +944,8 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
> static inline void count_memcg_page_event(struct page *page,
> enum vm_event_item idx)
> {
> - if (page->mem_cgroup)
> - count_memcg_events(page->mem_cgroup, idx, 1);
> + if (page_mem_cgroup(page))
> + count_memcg_events(page_mem_cgroup(page), idx, 1);
> }
>
> static inline void count_memcg_event_mm(struct mm_struct *mm,
> @@ -941,6 +1007,25 @@ void mem_cgroup_split_huge_fixup(struct page *head);
>
> struct mem_cgroup;
>
> +static inline struct mem_cgroup *page_mem_cgroup(struct page *page)
> +{
> + return NULL;
> +}
> +
> +static inline struct mem_cgroup *page_mem_cgroup_check(struct page *page)
> +{
> + return NULL;
> +}
> +
> +static inline void set_page_mem_cgroup(struct page *page,
> + struct mem_cgroup *memcg)
> +{
> +}
> +
> +static inline void clear_page_mem_cgroup(struct page *page)
> +{
> +}
> +
> static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
> {
> return true;
> @@ -1430,7 +1515,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page,
> if (mem_cgroup_disabled())
> return;
>
> - if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
> + if (unlikely(&page_mem_cgroup(page)->css != wb->memcg_css))
> mem_cgroup_track_foreign_dirty_slowpath(page, wb);
> }
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 17e712207d74..5e24ff2ffec9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1476,28 +1476,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
> #endif
> }
>
> -#ifdef CONFIG_MEMCG
> -static inline struct mem_cgroup *page_memcg(struct page *page)
> -{
> - return page->mem_cgroup;
> -}
> -static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> -{
> - WARN_ON_ONCE(!rcu_read_lock_held());
> - return READ_ONCE(page->mem_cgroup);
> -}
> -#else
> -static inline struct mem_cgroup *page_memcg(struct page *page)
> -{
> - return NULL;
> -}
> -static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> -{
> - WARN_ON_ONCE(!rcu_read_lock_held());
> - return NULL;
> -}
> -#endif
You essentially renamed these existing helpers, but I don't think
that's justified. Especially with the proliferation of callsites, the
original names are nicer. I'd prefer we keep them.
> @@ -560,16 +560,7 @@ ino_t page_cgroup_ino(struct page *page)
> unsigned long ino = 0;
>
> rcu_read_lock();
> - memcg = page->mem_cgroup;
> -
> - /*
> - * The lowest bit set means that memcg isn't a valid
> - * memcg pointer, but a obj_cgroups pointer.
> - * In this case the page is shared and doesn't belong
> - * to any specific memory cgroup.
> - */
> - if ((unsigned long) memcg & 0x1UL)
> - memcg = NULL;
> + memcg = page_mem_cgroup_check(page);
This should actually have been using READ_ONCE() all along. Otherwise
the compiler can issue multiple loads to page->mem_cgroup here and you
can end up with a pointer with the lowest bit set leaking out.
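I.e. page_mem_cgroup_check() should probably look something like this
(sketch, untested):

	static inline struct mem_cgroup *page_mem_cgroup_check(struct page *page)
	{
		unsigned long memcg_data = READ_ONCE(page->memcg_data);

		/*
		 * The lowest bit set means that memcg isn't a valid memcg
		 * pointer, but an obj_cgroups pointer. In this case the page
		 * is shared and doesn't belong to any specific memory cgroup.
		 */
		if (memcg_data & 0x1UL)
			return NULL;

		return (struct mem_cgroup *)memcg_data;
	}

so that the bit test and the returned value are guaranteed to come from a
single load.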
> @@ -2928,17 +2918,6 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
>
> page = virt_to_head_page(p);
>
> - /*
> - * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
> - * or a pointer to obj_cgroup vector. In the latter case the lowest
> - * bit of the pointer is set.
> - * The page->mem_cgroup pointer can be asynchronously changed
> - * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
> - * from a valid memcg pointer to objcg vector or back.
> - */
> - if (!page->mem_cgroup)
> - return NULL;
> -
> /*
> * Slab objects are accounted individually, not per-page.
> * Memcg membership data for each individual object is saved in
> @@ -2956,8 +2935,14 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
> return NULL;
> }
>
> - /* All other pages use page->mem_cgroup */
> - return page->mem_cgroup;
> + /*
> + * page_mem_cgroup_check() is used here, because page_has_obj_cgroups()
> + * check above could fail because the object cgroups vector wasn't set
> + * at that moment, but it can be set concurrently.
> + * page_mem_cgroup_check(page) will guarantee tat a proper memory
> + * cgroup pointer or NULL will be returned.
> + */
> + return page_mem_cgroup_check(page);
The code right now doesn't look quite safe. As per above, without the
READ_ONCE the compiler might issue multiple loads and we may get a
pointer with the low bit set.
Maybe slightly off-topic, but what are "all other pages" in general?
I don't see any callsites that ask for ownership on objects whose
backing pages may belong to a single memcg. That wouldn't seem to make
too much sense. Unless I'm missing something, this function should
probably tighten up its scope a bit and only work on stuff that is
actually following the obj_cgroup protocol.
I.e. either do the obj_cgroup lookup, or return root_mem_cgroup like
the other mem_cgroup_from_* functions.
On Thu, Sep 24, 2020 at 03:45:08PM -0400, Johannes Weiner wrote:
> On Tue, Sep 22, 2020 at 01:36:57PM -0700, Roman Gushchin wrote:
> > Currently there are many open-coded reads and writes of the
> > page->mem_cgroup pointer, as well as a couple of read helpers,
> > which are barely used.
> >
> > It creates an obstacle on a way to reuse some bits of the pointer
> > for storing additional bits of information. In fact, we already do
> > this for slab pages, where the last bit indicates that a pointer has
> > an attached vector of objcg pointers instead of a regular memcg
> > pointer.
> >
> > This commits introduces 4 new helper functions and converts all
> > raw accesses to page->mem_cgroup to calls of these helpers:
> > struct mem_cgroup *page_mem_cgroup(struct page *page);
> > struct mem_cgroup *page_mem_cgroup_check(struct page *page);
> > void set_page_mem_cgroup(struct page *page, struct mem_cgroup *memcg);
> > void clear_page_mem_cgroup(struct page *page);
>
> Sounds reasonable to me!
>
> > page_mem_cgroup_check() is intended to be used in cases when the page
> > can be a slab page and have a memcg pointer pointing at objcg vector.
> > It does check the lowest bit, and if set, returns NULL.
> > page_mem_cgroup() contains a VM_BUG_ON_PAGE() check for the page not
> > being a slab page. So do set_page_mem_cgroup() and clear_page_mem_cgroup().
> >
> > To make sure nobody uses a direct access, struct page's
> > mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
> > Only new helpers and a couple of slab-accounting related functions
> > access this field directly.
> >
> > page_memcg() and page_memcg_rcu() helpers defined in mm.h are removed.
> > New page_mem_cgroup() is a direct analog of page_memcg(), while
> > page_memcg_rcu() has a single call site in a small rcu-read-lock
> > section, so it's just not worth it to have a separate helper. So
> > it's replaced with page_mem_cgroup() too.
>
> page_memcg_rcu() does READ_ONCE(). We need to keep that for lockless
> accesses.
Ok, how about page_memcg() and page_objcgs() which always do READ_ONCE()?
Because page_memcg_rcu() has only a single call site, I would prefer to
have one helper instead of two.
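Roughly something like this (just a sketch to illustrate the idea):

	static inline struct mem_cgroup *page_memcg(struct page *page)
	{
		VM_BUG_ON_PAGE(PageSlab(page), page);
		return (struct mem_cgroup *)READ_ONCE(page->memcg_data);
	}

	static inline struct obj_cgroup **page_objcgs(struct page *page)
	{
		unsigned long memcg_data = READ_ONCE(page->memcg_data);

		/* Strip the lowest bit, which marks an obj_cgroups vector. */
		return (struct obj_cgroup **)(memcg_data & ~0x1UL);
	}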
>
> > @@ -343,6 +343,72 @@ struct mem_cgroup {
> >
> > extern struct mem_cgroup *root_mem_cgroup;
> >
> > +/*
> > + * page_mem_cgroup - get the memory cgroup associated with a page
> > + * @page: a pointer to the page struct
> > + *
> > + * Returns a pointer to the memory cgroup associated with the page,
> > + * or NULL. This function assumes that the page is known to have a
> > + * proper memory cgroup pointer. It's not safe to call this function
> > + * against some type of pages, e.g. slab pages or ex-slab pages.
> > + */
> > +static inline struct mem_cgroup *page_mem_cgroup(struct page *page)
> > +{
> > + VM_BUG_ON_PAGE(PageSlab(page), page);
> > + return (struct mem_cgroup *)page->memcg_data;
> > +}
>
> This would also be a good place to mention what's required for the
> function to be called safely, or in a way that produces a stable
> result - i.e. the list of conditions in commit_charge().
Makes sense.
>
> > + * page_mem_cgroup_check - get the memory cgroup associated with a page
> > + * @page: a pointer to the page struct
> > + *
> > + * Returns a pointer to the memory cgroup associated with the page,
> > + * or NULL. This function unlike page_mem_cgroup() can take any page
> > + * as an argument. It has to be used in cases when it's not known if a page
> > + * has an associated memory cgroup pointer or an object cgroups vector.
> > + */
> > +static inline struct mem_cgroup *page_mem_cgroup_check(struct page *page)
> > +{
> > + unsigned long memcg_data = page->memcg_data;
> > +
> > + /*
> > + * The lowest bit set means that memcg isn't a valid
> > + * memcg pointer, but a obj_cgroups pointer.
> > + * In this case the page is shared and doesn't belong
> > + * to any specific memory cgroup.
> > + */
> > + if (memcg_data & 0x1UL)
> > + return NULL;
> > +
> > + return (struct mem_cgroup *)memcg_data;
> > +}
>
> Here as well.
>
> > +
> > +/*
> > + * set_page_mem_cgroup - associate a page with a memory cgroup
> > + * @page: a pointer to the page struct
> > + * @memcg: a pointer to the memory cgroup
> > + *
> > + * Associates a page with a memory cgroup.
> > + */
> > +static inline void set_page_mem_cgroup(struct page *page,
> > + struct mem_cgroup *memcg)
> > +{
> > + VM_BUG_ON_PAGE(PageSlab(page), page);
> > + page->memcg_data = (unsigned long)memcg;
> > +}
> > +
> > +/*
> > + * clear_page_mem_cgroup - clear an association of a page with a memory cgroup
> > + * @page: a pointer to the page struct
> > + *
> > + * Clears an association of a page with a memory cgroup.
> > + */
> > +static inline void clear_page_mem_cgroup(struct page *page)
> > +{
> > + VM_BUG_ON_PAGE(PageSlab(page), page);
> > + page->memcg_data = 0;
> > +}
> > +
> > static __always_inline bool memcg_stat_item_in_bytes(int idx)
> > {
> > if (idx == MEMCG_PERCPU_B)
> > @@ -743,15 +809,15 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
> > static inline void __mod_memcg_page_state(struct page *page,
> > int idx, int val)
> > {
> > - if (page->mem_cgroup)
> > - __mod_memcg_state(page->mem_cgroup, idx, val);
> > + if (page_mem_cgroup(page))
> > + __mod_memcg_state(page_mem_cgroup(page), idx, val);
> > }
> >
> > static inline void mod_memcg_page_state(struct page *page,
> > int idx, int val)
> > {
> > - if (page->mem_cgroup)
> > - mod_memcg_state(page->mem_cgroup, idx, val);
> > + if (page_mem_cgroup(page))
> > + mod_memcg_state(page_mem_cgroup(page), idx, val);
> > }
> >
> > static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
> > @@ -838,12 +904,12 @@ static inline void __mod_lruvec_page_state(struct page *page,
> > struct lruvec *lruvec;
> >
> > /* Untracked pages have no memcg, no lruvec. Update only the node */
> > - if (!head->mem_cgroup) {
> > + if (!page_mem_cgroup(head)) {
> > __mod_node_page_state(pgdat, idx, val);
> > return;
> > }
> >
> > - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
> > + lruvec = mem_cgroup_lruvec(page_mem_cgroup(head), pgdat);
> > __mod_lruvec_state(lruvec, idx, val);
>
> The repetition of the function call is a bit jarring, especially in
> configs with VM_BUG_ON() enabled (some distros use it for their beta
> release kernels, so it's not just kernel developer test machines that
> pay this cost). Can you please use a local variable when the function
> needs the memcg more than once?
Sure.
>
> > @@ -878,8 +944,8 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
> > static inline void count_memcg_page_event(struct page *page,
> > enum vm_event_item idx)
> > {
> > - if (page->mem_cgroup)
> > - count_memcg_events(page->mem_cgroup, idx, 1);
> > + if (page_mem_cgroup(page))
> > + count_memcg_events(page_mem_cgroup(page), idx, 1);
> > }
> >
> > static inline void count_memcg_event_mm(struct mm_struct *mm,
> > @@ -941,6 +1007,25 @@ void mem_cgroup_split_huge_fixup(struct page *head);
> >
> > struct mem_cgroup;
> >
> > +static inline struct mem_cgroup *page_mem_cgroup(struct page *page)
> > +{
> > + return NULL;
> > +}
> > +
> > +static inline struct mem_cgroup *page_mem_cgroup_check(struct page *page)
> > +{
> > + return NULL;
> > +}
> > +
> > +static inline void set_page_mem_cgroup(struct page *page,
> > + struct mem_cgroup *memcg)
> > +{
> > +}
> > +
> > +static inline void clear_page_mem_cgroup(struct page *page)
> > +{
> > +}
> > +
> > static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
> > {
> > return true;
> > @@ -1430,7 +1515,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page,
> > if (mem_cgroup_disabled())
> > return;
> >
> > - if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
> > + if (unlikely(&page_mem_cgroup(page)->css != wb->memcg_css))
> > mem_cgroup_track_foreign_dirty_slowpath(page, wb);
> > }
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 17e712207d74..5e24ff2ffec9 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1476,28 +1476,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
> > #endif
> > }
> >
> > -#ifdef CONFIG_MEMCG
> > -static inline struct mem_cgroup *page_memcg(struct page *page)
> > -{
> > - return page->mem_cgroup;
> > -}
> > -static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> > -{
> > - WARN_ON_ONCE(!rcu_read_lock_held());
> > - return READ_ONCE(page->mem_cgroup);
> > -}
> > -#else
> > -static inline struct mem_cgroup *page_memcg(struct page *page)
> > -{
> > - return NULL;
> > -}
> > -static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> > -{
> > - WARN_ON_ONCE(!rcu_read_lock_held());
> > - return NULL;
> > -}
> > -#endif
>
> You essentially renamed these existing helpers, but I don't think
> that's justified. Especially with the proliferation of callsites, the
> original names are nicer. I'd prefer we keep them.
>
> > @@ -560,16 +560,7 @@ ino_t page_cgroup_ino(struct page *page)
> > unsigned long ino = 0;
> >
> > rcu_read_lock();
> > - memcg = page->mem_cgroup;
> > -
> > - /*
> > - * The lowest bit set means that memcg isn't a valid
> > - * memcg pointer, but a obj_cgroups pointer.
> > - * In this case the page is shared and doesn't belong
> > - * to any specific memory cgroup.
> > - */
> > - if ((unsigned long) memcg & 0x1UL)
> > - memcg = NULL;
> > + memcg = page_mem_cgroup_check(page);
>
> This should actually have been using READ_ONCE() all along. Otherwise
> the compiler can issue multiple loads to page->mem_cgroup here and you
> can end up with a pointer with the lowest bit set leaking out.
>
> > @@ -2928,17 +2918,6 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
> >
> > page = virt_to_head_page(p);
> >
> > - /*
> > - * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
> > - * or a pointer to obj_cgroup vector. In the latter case the lowest
> > - * bit of the pointer is set.
> > - * The page->mem_cgroup pointer can be asynchronously changed
> > - * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
> > - * from a valid memcg pointer to objcg vector or back.
> > - */
> > - if (!page->mem_cgroup)
> > - return NULL;
> > -
> > /*
> > * Slab objects are accounted individually, not per-page.
> > * Memcg membership data for each individual object is saved in
> > @@ -2956,8 +2935,14 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
> > return NULL;
> > }
> >
> > - /* All other pages use page->mem_cgroup */
> > - return page->mem_cgroup;
> > + /*
> > + * page_mem_cgroup_check() is used here, because page_has_obj_cgroups()
> > + * check above could fail because the object cgroups vector wasn't set
> > + * at that moment, but it can be set concurrently.
> > + * page_mem_cgroup_check(page) will guarantee tat a proper memory
> > + * cgroup pointer or NULL will be returned.
> > + */
> > + return page_mem_cgroup_check(page);
>
> The code right now doesn't look quite safe. As per above, without the
> READ_ONCE the compiler might issue multiple loads and we may get a
> pointer with the low bit set.
>
> Maybe slightly off-topic, but what are "all other pages" in general?
> I don't see any callsites that ask for ownership on objects whose
> backing pages may belong to a single memcg. That wouldn't seem to make
> too much sense. Unless I'm missing something, this function should
> probably tighten up its scope a bit and only work on stuff that is
> actually following the obj_cgroup protocol.
Kernel stacks can be slabs or generic pages/vmallocs. Also large kmallocs
are using the page allocator, so they don't follow the objcg protocol.
Thanks!
On Thu, Sep 24, 2020 at 01:27:00PM -0700, Roman Gushchin wrote:
> On Thu, Sep 24, 2020 at 03:45:08PM -0400, Johannes Weiner wrote:
> > On Tue, Sep 22, 2020 at 01:36:57PM -0700, Roman Gushchin wrote:
> > > Currently there are many open-coded reads and writes of the
> > > page->mem_cgroup pointer, as well as a couple of read helpers,
> > > which are barely used.
> > >
> > > It creates an obstacle on a way to reuse some bits of the pointer
> > > for storing additional bits of information. In fact, we already do
> > > this for slab pages, where the last bit indicates that a pointer has
> > > an attached vector of objcg pointers instead of a regular memcg
> > > pointer.
> > >
> > > This commits introduces 4 new helper functions and converts all
> > > raw accesses to page->mem_cgroup to calls of these helpers:
> > > struct mem_cgroup *page_mem_cgroup(struct page *page);
> > > struct mem_cgroup *page_mem_cgroup_check(struct page *page);
> > > void set_page_mem_cgroup(struct page *page, struct mem_cgroup *memcg);
> > > void clear_page_mem_cgroup(struct page *page);
> >
> > Sounds reasonable to me!
> >
> > > page_mem_cgroup_check() is intended to be used in cases when the page
> > > can be a slab page and have a memcg pointer pointing at objcg vector.
> > > It does check the lowest bit, and if set, returns NULL.
> > > page_mem_cgroup() contains a VM_BUG_ON_PAGE() check for the page not
> > > being a slab page. So do set_page_mem_cgroup() and clear_page_mem_cgroup().
> > >
> > > To make sure nobody uses a direct access, struct page's
> > > mem_cgroup/obj_cgroups is converted to unsigned long memcg_data.
> > > Only new helpers and a couple of slab-accounting related functions
> > > access this field directly.
> > >
> > > page_memcg() and page_memcg_rcu() helpers defined in mm.h are removed.
> > > New page_mem_cgroup() is a direct analog of page_memcg(), while
> > > page_memcg_rcu() has a single call site in a small rcu-read-lock
> > > section, so it's just not worth it to have a separate helper. So
> > > it's replaced with page_mem_cgroup() too.
> >
> > page_memcg_rcu() does READ_ONCE(). We need to keep that for lockless
> > accesses.
>
> Ok, how about page_memcg() and page_objcgs() which always do READ_ONCE()?
> Because page_memcg_rcu() has only a single call site, I would prefer to
> have one helper instead of two.
Well, page_objcgs() needs it everywhere because the pointer changes
locklessly, so that makes sense to me.
For page_memcg(), I'll have to NAK that approach. It's not justified
to impose ordering cost on a hundred callers that don't need it. And
it muddies the serialization rules of the interface.
> > > @@ -2956,8 +2935,14 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
> > > return NULL;
> > > }
> > >
> > > - /* All other pages use page->mem_cgroup */
> > > - return page->mem_cgroup;
> > > + /*
> > > + * page_mem_cgroup_check() is used here, because page_has_obj_cgroups()
> > > + * check above could fail because the object cgroups vector wasn't set
> > > + * at that moment, but it can be set concurrently.
> > > + * page_mem_cgroup_check(page) will guarantee tat a proper memory
> > > + * cgroup pointer or NULL will be returned.
> > > + */
> > > + return page_mem_cgroup_check(page);
> >
> > The code right now doesn't look quite safe. As per above, without the
> > READ_ONCE the compiler might issue multiple loads and we may get a
> > pointer with the low bit set.
> >
> > Maybe slightly off-topic, but what are "all other pages" in general?
> > I don't see any callsites that ask for ownership on objects whose
> > backing pages may belong to a single memcg. That wouldn't seem to make
> > too much sense. Unless I'm missing something, this function should
> > probably tighten up its scope a bit and only work on stuff that is
> > actually following the obj_cgroup protocol.
>
> Kernel stacks can be slabs or generic pages/vmallocs. Also large kmallocs
> are using the page allocator, so they don't follow the objcg protocol.
Ah, that's super valuable information! Instead of deleting the "all
other pages" comment, could you please elaborate on it?
Thanks!