Since Roman's series "The new cgroup slab memory controller" was applied, all
slab objects are charged via the new obj_cgroup APIs. The new APIs
introduce a struct obj_cgroup to charge slab objects, which prevents
long-lived objects from pinning the original memory cgroup in memory.
But there are still some corner-case objects (e.g. allocations larger than
an order-1 page on SLUB) which are not charged via the new APIs. Those
objects (including the pages which are allocated directly from the buddy
allocator) are charged as kmem pages and still hold a reference to
the memory cgroup.
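
For reference, a simplified sketch of the indirection behind obj_cgroup
(the real struct in include/linux/memcontrol.h has a few more fields; this
is illustrative only, not code from this series):

struct obj_cgroup {
	struct percpu_ref refcnt;	/* held by the charged objects */
	struct mem_cgroup *memcg;	/* can be swapped to the parent memcg */
	atomic_t nr_charged_bytes;
	/* ... */
};

Charged objects pin only the obj_cgroup. When a cgroup is removed,
objcg->memcg is atomically switched to the parent memcg, so the dying
memcg itself does not have to stay around.
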
For example, the kernel stack is charged as kmem pages because the size of
the kernel stack can be greater than 2 pages (e.g. 16KB on x86_64 or
arm64). If we create a thread (suppose the thread stack is charged to
memory cgroup A) and then move it from memory cgroup A to memory cgroup
B, the kernel stack of the thread still holds a reference to memory
cgroup A, so the thread can pin memory cgroup A in memory even after we
remove cgroup A. The scenario can be reproduced with the following
script, after which the system has accumulated 500 dying cgroups
(this is not a real-world issue, just a script to show that large
kmallocs are charged as kmem pages which can pin the memory cgroup in
memory).
#!/bin/bash

# Number of memory cgroups before the test.
cat /proc/cgroups | grep memory

cd /sys/fs/cgroup/memory
echo 1 > memory.move_charge_at_immigrate

for i in {1..500}
do
	mkdir kmem_test
	# Move the shell into kmem_test, then fork a long-lived child;
	# the child's kernel stack is charged to kmem_test.
	echo $$ > kmem_test/cgroup.procs
	sleep 3600 &
	# Move the shell and the child back out of kmem_test.
	echo $$ > cgroup.procs
	echo `cat kmem_test/cgroup.procs` > cgroup.procs
	# kmem_test is now empty, but its memcg stays pinned by the
	# child's kernel stack after the rmdir.
	rmdir kmem_test
done

# Number of memory cgroups afterwards: ~500 dying cgroups remain.
cat /proc/cgroups | grep memory
This patchset aims to make those kmem pages drop the reference to the
memory cgroup by using the obj_cgroup APIs. With the patchset applied,
the number of dying cgroups no longer increases when we run the above
test script.

Patches 1-3 use obj_cgroup APIs to charge kmem pages.
Patch 4 introduces remote objcg charging APIs.
Patch 5 uses remote objcg charging APIs to charge kernel memory.
Changelog in v2:
1. Fix some typos in the commit log (Thanks to Roman).
2. Do not introduce the page_memcg_kmem helper (Thanks to Johannes and Shakeel).
3. Reduce the CC list to mm/memcg folks (Thanks to Johannes).
4. Introduce remote objcg charging APIs instead of converting the "remote memcg
   charging APIs" to "remote objcg charging APIs".
Muchun Song (5):
mm: memcontrol: introduce obj_cgroup_{un}charge_page
mm: memcontrol: make page_memcg{_rcu} only applicable for non-kmem
page
mm: memcontrol: charge kmem pages by using obj_cgroup APIs
mm: memcontrol: introduce remote objcg charging API
mm: memcontrol: use remote objcg charging APIs to charge kernel memory
fs/buffer.c | 10 +-
fs/notify/fanotify/fanotify.c | 6 +-
fs/notify/fanotify/fanotify_user.c | 2 +-
fs/notify/group.c | 3 +-
fs/notify/inotify/inotify_fsnotify.c | 8 +-
fs/notify/inotify/inotify_user.c | 2 +-
include/linux/bpf.h | 2 +-
include/linux/fsnotify_backend.h | 2 +-
include/linux/memcontrol.h | 114 +++++++++++++---
include/linux/sched.h | 4 +
include/linux/sched/mm.h | 38 ++++++
kernel/bpf/syscall.c | 35 ++---
kernel/fork.c | 3 +
mm/memcontrol.c | 257 ++++++++++++++++++++++++++---------
mm/page_alloc.c | 4 +-
15 files changed, 372 insertions(+), 118 deletions(-)
--
2.11.0
The remote memcg charging API is a mechanism to charge pages to a given
memcg. Since all kernel memory is now charged via the obj_cgroup APIs,
we actually want to charge kernel memory to a remote object cgroup
instead of a memory cgroup. So introduce remote objcg charging APIs to
charge the kmem pages via the obj_cgroup APIs. If a remote memcg
and an objcg are both set, the objcg takes precedence over the memcg
when charging the kmem pages.

In a later patch, we will use these APIs to charge kernel memory to
the remote objcg.
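
As an illustration of the intended calling convention, a remote charging
scope looks roughly like the sketch below (the helper and its caller are
hypothetical and only for illustration; they are not added by this patch):

static void *alloc_charged_to_objcg(struct obj_cgroup *objcg, size_t size)
{
	struct obj_cgroup *old;
	void *ptr;

	/* Begin the remote objcg charging scope. */
	old = set_active_objcg(objcg);

	/* __GFP_ACCOUNT kmem allocations in this scope go to @objcg. */
	ptr = kmalloc(size, GFP_KERNEL_ACCOUNT);

	/* Restore the previous scope; the API can nest. */
	set_active_objcg(old);

	return ptr;
}
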
Signed-off-by: Muchun Song <[email protected]>
---
include/linux/sched.h | 4 ++++
include/linux/sched/mm.h | 38 ++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 3 +++
mm/memcontrol.c | 44 ++++++++++++++++++++++++++++++++++++++++----
4 files changed, 85 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ee46f5cab95b..8edcc71a0a1d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,6 +1318,10 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
#endif
+#ifdef CONFIG_MEMCG_KMEM
+ /* Used by memcontrol for targeted objcg charge: */
+ struct obj_cgroup *active_objcg;
+#endif
#ifdef CONFIG_BLK_CGROUP
struct request_queue *throttle_queue;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 1ae08b8462a4..be1189598b09 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -330,6 +330,44 @@ set_active_memcg(struct mem_cgroup *memcg)
}
#endif
+#ifdef CONFIG_MEMCG_KMEM
+DECLARE_PER_CPU(struct obj_cgroup *, int_active_objcg);
+
+/**
+ * set_active_objcg - Starts the remote objcg kmem pages charging scope.
+ * @objcg: objcg to charge.
+ *
+ * This function marks the beginning of the remote objcg charging scope. All the
+ * __GFP_ACCOUNT kmem page allocations till the end of the scope will be charged
+ * to the given objcg.
+ *
+ * NOTE: This function can nest. Users must save the return value and
+ * reset the previous value after their own charging scope is over.
+ *
+ * If remote memcg and objcg are both set, objcg takes precedence over memcg
+ * to charge the kmem pages.
+ */
+static inline struct obj_cgroup *set_active_objcg(struct obj_cgroup *objcg)
+{
+ struct obj_cgroup *old;
+
+ if (in_interrupt()) {
+ old = this_cpu_read(int_active_objcg);
+ this_cpu_write(int_active_objcg, objcg);
+ } else {
+ old = current->active_objcg;
+ current->active_objcg = objcg;
+ }
+
+ return old;
+}
+#else
+static inline struct obj_cgroup *set_active_objcg(struct obj_cgroup *objcg)
+{
+ return NULL;
+}
+#endif
+
#ifdef CONFIG_MEMBARRIER
enum {
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
diff --git a/kernel/fork.c b/kernel/fork.c
index d66cd1014211..b4b9dd5d122f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -945,6 +945,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
+#ifdef CONFIG_MEMCG_KMEM
+ tsk->active_objcg = NULL;
+#endif
return tsk;
free_stack:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0cf342d22547..e48d4ab0af76 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -79,6 +79,11 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+#ifdef CONFIG_MEMCG_KMEM
+/* Active object cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct obj_cgroup *, int_active_objcg);
+#endif
+
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
@@ -1076,7 +1081,7 @@ static __always_inline struct mem_cgroup *get_active_memcg(void)
return memcg;
}
-static __always_inline bool memcg_kmem_bypass(void)
+static __always_inline bool memcg_charge_bypass(void)
{
/* Allow remote memcg charging from any context. */
if (unlikely(active_memcg()))
@@ -1094,7 +1099,7 @@ static __always_inline bool memcg_kmem_bypass(void)
*/
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
- if (memcg_kmem_bypass())
+ if (memcg_charge_bypass())
return NULL;
if (unlikely(active_memcg()))
@@ -1103,6 +1108,29 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
return get_mem_cgroup_from_mm(current->mm);
}
+#ifdef CONFIG_MEMCG_KMEM
+static __always_inline struct obj_cgroup *active_objcg(void)
+{
+ if (in_interrupt())
+ return this_cpu_read(int_active_objcg);
+ else
+ return current->active_objcg;
+}
+
+static __always_inline bool kmem_charge_bypass(void)
+{
+ /* Allow remote charging from any context. */
+ if (unlikely(active_objcg() || active_memcg()))
+ return false;
+
+ /* Memcg to charge can't be determined. */
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ return true;
+
+ return false;
+}
+#endif
+
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -2997,13 +3025,20 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
- struct obj_cgroup *objcg = NULL;
+ struct obj_cgroup *objcg;
struct mem_cgroup *memcg;
- if (memcg_kmem_bypass())
+ if (kmem_charge_bypass())
return NULL;
rcu_read_lock();
+ objcg = active_objcg();
+ if (unlikely(objcg)) {
+ /* remote object cgroup must hold a reference. */
+ obj_cgroup_get(objcg);
+ goto out;
+ }
+
if (unlikely(active_memcg()))
memcg = active_memcg();
else
@@ -3015,6 +3050,7 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
break;
objcg = NULL;
}
+out:
rcu_read_unlock();
return objcg;
--
2.11.0
We want to reuse the obj_cgroup APIs to charge the kmem pages. If we do
that, we should store an object cgroup pointer in page->memcg_data for
the kmem pages.
Finally, page->memcg_data can have 3 different meanings.
1) For the slab pages, page->memcg_data points to an object cgroups
vector.
2) For the kmem pages (exclude the slab pages), page->memcg_data
points to an object cgroup.
3) For the user pages (e.g. the LRU pages), page->memcg_data points
to a memory cgroup.
Currently we always get the memory cgroup associated with a page via
page_memcg() or page_memcg_rcu(). page_memcg_check() is special: it
has to be used in cases where it is not known whether a page has an
associated memory cgroup pointer or an object cgroups vector. Because
a later patch makes page->memcg_data of a kmem page point to something
other than a memory cgroup, page_memcg() and page_memcg_rcu() can no
longer be applied to kmem pages. So this patch makes page_memcg() and
page_memcg_rcu() no longer apply to the kmem pages. The behavior of
page_memcg_check() is unchanged; it remains applicable to the kmem
pages.
In the end, there are 3 helpers to get the memcg associated with a page.
Usage is as follows.
1) Get the memory cgroup associated with a non-kmem page (e.g. the LRU
pages).
- page_memcg()
- page_memcg_rcu()
2) Get the memory cgroup associated with a page. It has to be used in
cases when it's not known if a page has an associated memory cgroup
pointer or an object cgroups vector. Returns NULL for slab pages or
uncharged pages. Otherwise, returns memory cgroup for charged pages
(e.g. the kmem pages, the LRU pages).
- page_memcg_check()
In some places, we use page_memcg() to check whether a page is charged.
Now introduce the page_memcg_charged() helper to do that.
This is a preparation for reparenting the kmem pages.
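
To make the division of labor concrete, here is an illustrative-only
sketch (not code added by this patch) of which helper applies to which
kind of page:

static struct mem_cgroup *memcg_of_page_sketch(struct page *page)
{
	/* Slab pages carry an objcg vector; use page_objcgs() for those. */
	if (PageSlab(page))
		return NULL;

	/* Quick "is this page charged at all?" test. */
	if (!page_memcg_charged(page))
		return NULL;

	/*
	 * Kmem pages stop having a direct memcg pointer, so only the
	 * _check variant may be used for them.
	 */
	if (PageMemcgKmem(page))
		return page_memcg_check(page);

	/* User pages (e.g. the LRU pages): page_memcg()/page_memcg_rcu(). */
	return page_memcg(page);
}
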
Signed-off-by: Muchun Song <[email protected]>
---
include/linux/memcontrol.h | 36 ++++++++++++++++++++++++++++--------
mm/memcontrol.c | 23 +++++++++++++----------
mm/page_alloc.c | 4 ++--
3 files changed, 43 insertions(+), 20 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e6dc793d587d..049b80246cbf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -358,14 +358,26 @@ enum page_memcg_data_flags {
#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
+/* Return true for charged page, otherwise false. */
+static inline bool page_memcg_charged(struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ VM_BUG_ON_PAGE(PageSlab(page), page);
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+
+ return !!memcg_data;
+}
+
/*
- * page_memcg - get the memory cgroup associated with a page
+ * page_memcg - get the memory cgroup associated with a non-kmem page
* @page: a pointer to the page struct
*
* Returns a pointer to the memory cgroup associated with the page,
* or NULL. This function assumes that the page is known to have a
* proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages.
+ * against some type of pages, e.g. slab pages, kmem pages or ex-slab
+ * pages.
*
* Any of the following ensures page and memcg binding stability:
* - the page lock
@@ -378,27 +390,30 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
unsigned long memcg_data = page->memcg_data;
VM_BUG_ON_PAGE(PageSlab(page), page);
- VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_FLAGS_MASK, page);
- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return (struct mem_cgroup *)memcg_data;
}
/*
- * page_memcg_rcu - locklessly get the memory cgroup associated with a page
+ * page_memcg_rcu - locklessly get the memory cgroup associated with a non-kmem page
* @page: a pointer to the page struct
*
* Returns a pointer to the memory cgroup associated with the page,
* or NULL. This function assumes that the page is known to have a
* proper memory cgroup pointer. It's not safe to call this function
- * against some type of pages, e.g. slab pages or ex-slab pages.
+ * against some type of pages, e.g. slab pages, kmem pages or ex-slab
+ * pages.
*/
static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
{
+ unsigned long memcg_data = READ_ONCE(page->memcg_data);
+
VM_BUG_ON_PAGE(PageSlab(page), page);
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_FLAGS_MASK, page);
WARN_ON_ONCE(!rcu_read_lock_held());
- return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) &
- ~MEMCG_DATA_FLAGS_MASK);
+ return (struct mem_cgroup *)memcg_data;
}
/*
@@ -1072,6 +1087,11 @@ void mem_cgroup_split_huge_fixup(struct page *head);
struct mem_cgroup;
+static inline bool page_memcg_charged(struct page *page)
+{
+ return false;
+}
+
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return NULL;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index faae16def127..86a8db937ec6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -855,10 +855,11 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
int val)
{
struct page *head = compound_head(page); /* rmap on tail pages */
- struct mem_cgroup *memcg = page_memcg(head);
+ struct mem_cgroup *memcg;
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
+ memcg = page_memcg_check(head);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
__mod_node_page_state(pgdat, idx, val);
@@ -3166,12 +3167,13 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg = page_memcg(page);
+ struct mem_cgroup *memcg;
unsigned int nr_pages = 1 << order;
- if (!memcg)
+ if (!page_memcg_charged(page))
return;
+ memcg = page_memcg_check(page);
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
__memcg_kmem_uncharge(memcg, nr_pages);
page->memcg_data = 0;
@@ -6827,24 +6829,25 @@ static void uncharge_batch(const struct uncharge_gather *ug)
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
unsigned long nr_pages;
+ struct mem_cgroup *memcg;
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (!page_memcg(page))
+ if (!page_memcg_charged(page))
return;
/*
* Nobody should be changing or seriously looking at
- * page_memcg(page) at this point, we have fully
- * exclusive access to the page.
+ * page memcg at this point, we have fully exclusive
+ * access to the page.
*/
-
- if (ug->memcg != page_memcg(page)) {
+ memcg = page_memcg_check(page);
+ if (ug->memcg != memcg) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = page_memcg(page);
+ ug->memcg = memcg;
/* pairs with css_put in uncharge_batch */
css_get(&ug->memcg->css);
@@ -6877,7 +6880,7 @@ void mem_cgroup_uncharge(struct page *page)
return;
/* Don't touch page->lru of any random page, pre-check: */
- if (!page_memcg(page))
+ if (!page_memcg_charged(page))
return;
uncharge_gather_clear(&ug);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f10966e3b4a5..bcb58ae15e24 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1124,7 +1124,7 @@ static inline bool page_expected_state(struct page *page,
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
- (unsigned long)page_memcg(page) |
+ page_memcg_charged(page) |
#endif
(page->flags & check_flags)))
return false;
@@ -1149,7 +1149,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
- if (unlikely(page_memcg(page)))
+ if (unlikely(page_memcg_charged(page)))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
--
2.11.0
Since Roman's series "The new cgroup slab memory controller" was applied, all
slab objects are charged via the new obj_cgroup APIs. The new APIs
introduce a struct obj_cgroup to charge slab objects, which prevents
long-lived objects from pinning the original memory cgroup in memory.
But there are still some corner-case objects (e.g. allocations larger than
an order-1 page on SLUB) which are not charged via the new APIs. Those
objects (including the pages which are allocated directly from the buddy
allocator) are charged as kmem pages and still hold a reference to
the memory cgroup.
This patch aims to charge the kmem pages by using the new obj_cgroup
APIs. Finally, the page->memcg_data of a kmem page points to an object
cgroup. We can use page_objcg() to get the object cgroup associated
with a kmem page, or we can use page_memcg_check() to get the memory
cgroup associated with a kmem page, but the caller must ensure that the
returned memcg won't be released (e.g. by acquiring the rcu_read_lock
or css_set_lock).
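
For example (illustrative only, not part of the patch), a reader of a
kmem page's memcg after this change would look roughly like:

static void inspect_kmem_page_memcg(struct page *page)
{
	struct mem_cgroup *memcg;

	/*
	 * The memcg is reached through the objcg and can be reparented
	 * at any time, so the lookup must be protected, e.g. by RCU.
	 */
	rcu_read_lock();
	memcg = page_memcg_check(page);
	if (memcg) {
		/* use memcg only while the RCU read-side lock is held */
	}
	rcu_read_unlock();
}
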
Signed-off-by: Muchun Song <[email protected]>
---
include/linux/memcontrol.h | 63 +++++++++++++++++------
mm/memcontrol.c | 123 +++++++++++++++++++++++++++++++--------------
2 files changed, 133 insertions(+), 53 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 049b80246cbf..5911b9d107b0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -370,6 +370,18 @@ static inline bool page_memcg_charged(struct page *page)
}
/*
+ * After the initialization objcg->memcg is always pointing at
+ * a valid memcg, but can be atomically swapped to the parent memcg.
+ *
+ * The caller must ensure that the returned memcg won't be released:
+ * e.g. acquire the rcu_read_lock or css_set_lock.
+ */
+static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+{
+ return READ_ONCE(objcg->memcg);
+}
+
+/*
* page_memcg - get the memory cgroup associated with a non-kmem page
* @page: a pointer to the page struct
*
@@ -421,9 +433,10 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
* @page: a pointer to the page struct
*
* Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function unlike page_memcg() can take any page
+ * or NULL. This function unlike page_memcg() can take any non-kmem page
* as an argument. It has to be used in cases when it's not known if a page
- * has an associated memory cgroup pointer or an object cgroups vector.
+ * has an associated memory cgroup pointer or an object cgroups vector or
+ * an object cgroup.
*
* Any of the following ensures page and memcg binding stability:
* - the page lock
@@ -442,6 +455,17 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
if (memcg_data & MEMCG_DATA_OBJCGS)
return NULL;
+ if (memcg_data & MEMCG_DATA_KMEM) {
+ struct obj_cgroup *objcg;
+
+ /*
+ * The caller must ensure that the returned memcg won't be
+ * released: e.g. acquire the rcu_read_lock or css_set_lock.
+ */
+ objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+ return obj_cgroup_memcg(objcg);
+ }
+
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
@@ -500,6 +524,24 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
+/*
+ * page_objcg - get the object cgroup associated with a kmem page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the object cgroup associated with the kmem page,
+ * or NULL. This function assumes that the page is known to have an
+ * associated object cgroup. It's only safe to call this function
+ * against kmem pages (PageMemcgKmem() returns true).
+ */
+static inline struct obj_cgroup *page_objcg(struct page *page)
+{
+ unsigned long memcg_data = page->memcg_data;
+
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+ VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+
+ return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
#else
static inline struct obj_cgroup **page_objcgs(struct page *page)
{
@@ -510,6 +552,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
{
return NULL;
}
+
+static inline struct obj_cgroup *page_objcg(struct page *page)
+{
+ return NULL;
+}
#endif
static __always_inline bool memcg_stat_item_in_bytes(int idx)
@@ -728,18 +775,6 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
percpu_ref_put(&objcg->refcnt);
}
-/*
- * After the initialization objcg->memcg is always pointing at
- * a valid memcg, but can be atomically swapped to the parent memcg.
- *
- * The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
- */
-static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
-{
- return READ_ONCE(objcg->memcg);
-}
-
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 86a8db937ec6..0cf342d22547 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -856,10 +856,16 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
{
struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
- pg_data_t *pgdat = page_pgdat(page);
+ pg_data_t *pgdat;
struct lruvec *lruvec;
- memcg = page_memcg_check(head);
+ if (PageMemcgKmem(head)) {
+ __mod_lruvec_kmem_state(page_to_virt(head), idx, val);
+ return;
+ }
+
+ pgdat = page_pgdat(head);
+ memcg = page_memcg(head);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
__mod_node_page_state(pgdat, idx, val);
@@ -3144,18 +3150,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
*/
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
int ret = 0;
- memcg = get_mem_cgroup_from_current();
- if (memcg && !mem_cgroup_is_root(memcg)) {
- ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+ objcg = get_obj_cgroup_from_current();
+ if (objcg) {
+ ret = obj_cgroup_charge_page(objcg, gfp, 1 << order);
if (!ret) {
- page->memcg_data = (unsigned long)memcg |
+ page->memcg_data = (unsigned long)objcg |
MEMCG_DATA_KMEM;
return 0;
}
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
return ret;
}
@@ -3167,17 +3173,18 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
unsigned int nr_pages = 1 << order;
if (!page_memcg_charged(page))
return;
- memcg = page_memcg_check(page);
- VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- __memcg_kmem_uncharge(memcg, nr_pages);
+ VM_BUG_ON_PAGE(!PageMemcgKmem(page), page);
+
+ objcg = page_objcg(page);
+ obj_cgroup_uncharge_page(objcg, nr_pages);
page->memcg_data = 0;
- css_put(&memcg->css);
+ obj_cgroup_put(objcg);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -6794,8 +6801,12 @@ struct uncharge_gather {
struct mem_cgroup *memcg;
unsigned long nr_pages;
unsigned long pgpgout;
- unsigned long nr_kmem;
struct page *dummy_page;
+
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *objcg;
+ unsigned long nr_kmem;
+#endif
};
static inline void uncharge_gather_clear(struct uncharge_gather *ug)
@@ -6807,12 +6818,21 @@ static void uncharge_batch(const struct uncharge_gather *ug)
{
unsigned long flags;
+#ifdef CONFIG_MEMCG_KMEM
+ if (ug->objcg) {
+ obj_cgroup_uncharge_page(ug->objcg, ug->nr_kmem);
+ /* drop reference from uncharge_kmem_page */
+ obj_cgroup_put(ug->objcg);
+ }
+#endif
+
+ if (!ug->memcg)
+ return;
+
if (!mem_cgroup_is_root(ug->memcg)) {
page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
if (do_memsw_account())
page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
- page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}
@@ -6822,26 +6842,40 @@ static void uncharge_batch(const struct uncharge_gather *ug)
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
- /* drop reference from uncharge_page */
+ /* drop reference from uncharge_user_page */
css_put(&ug->memcg->css);
}
-static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+#ifdef CONFIG_MEMCG_KMEM
+static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
{
- unsigned long nr_pages;
- struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg = page_objcg(page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (ug->objcg != objcg) {
+ if (ug->objcg) {
+ uncharge_batch(ug);
+ uncharge_gather_clear(ug);
+ }
+ ug->objcg = objcg;
- if (!page_memcg_charged(page))
- return;
+ /* pairs with obj_cgroup_put in uncharge_batch */
+ obj_cgroup_get(ug->objcg);
+ }
+
+ ug->nr_kmem += compound_nr(page);
+ page->memcg_data = 0;
+ obj_cgroup_put(ug->objcg);
+}
+#else
+static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
+{
+}
+#endif
+
+static void uncharge_user_page(struct page *page, struct uncharge_gather *ug)
+{
+ struct mem_cgroup *memcg = page_memcg(page);
- /*
- * Nobody should be changing or seriously looking at
- * page memcg at this point, we have fully exclusive
- * access to the page.
- */
- memcg = page_memcg_check(page);
if (ug->memcg != memcg) {
if (ug->memcg) {
uncharge_batch(ug);
@@ -6852,18 +6886,30 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
/* pairs with css_put in uncharge_batch */
css_get(&ug->memcg->css);
}
+ ug->pgpgout++;
+ ug->dummy_page = page;
+
+ ug->nr_pages += compound_nr(page);
+ page->memcg_data = 0;
+ css_put(&ug->memcg->css);
+}
- nr_pages = compound_nr(page);
- ug->nr_pages += nr_pages;
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (!page_memcg_charged(page))
+ return;
+
+ /*
+ * Nobody should be changing or seriously looking at
+ * page memcg at this point, we have fully exclusive
+ * access to the page.
+ */
if (PageMemcgKmem(page))
- ug->nr_kmem += nr_pages;
+ uncharge_kmem_page(page, ug);
else
- ug->pgpgout++;
-
- ug->dummy_page = page;
- page->memcg_data = 0;
- css_put(&ug->memcg->css);
+ uncharge_user_page(page, ug);
}
/**
@@ -6906,8 +6952,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
uncharge_gather_clear(&ug);
list_for_each_entry(page, page_list, lru)
uncharge_page(page, &ug);
- if (ug.memcg)
- uncharge_batch(&ug);
+ uncharge_batch(&ug);
}
/**
--
2.11.0
On Wed, Mar 03, 2021 at 01:59:14PM +0800, Muchun Song wrote:
> We want to reuse the obj_cgroup APIs to charge the kmem pages when
> If we do that, we should store an object cgroup pointer to
> page->memcg_data for the kmem pages.
>
> Finally, page->memcg_data can have 3 different meanings.
>
> 1) For the slab pages, page->memcg_data points to an object cgroups
> vector.
>
> 2) For the kmem pages (exclude the slab pages), page->memcg_data
> points to an object cgroup.
>
> 3) For the user pages (e.g. the LRU pages), page->memcg_data points
> to a memory cgroup.
>
> Currently we always get the memory cgroup associated with a page via
> page_memcg() or page_memcg_rcu(). page_memcg_check() is special, it
> has to be used in cases when it's not known if a page has an
> associated memory cgroup pointer or an object cgroups vector. Because
> the page->memcg_data of the kmem page is not pointing to a memory
> cgroup in the later patch, the page_memcg() and page_memcg_rcu()
> cannot be applicable for the kmem pages. In this patch, make
> page_memcg() and page_memcg_rcu() no longer apply to the kmem pages.
> We do not change the behavior of the page_memcg_check(), it is also
> applicable for the kmem pages.
>
> In the end, there are 3 helpers to get the memcg associated with a page.
> Usage is as follows.
>
> 1) Get the memory cgroup associated with a non-kmem page (e.g. the LRU
> pages).
>
> - page_memcg()
> - page_memcg_rcu()
>
> 2) Get the memory cgroup associated with a page. It has to be used in
> cases when it's not known if a page has an associated memory cgroup
> pointer or an object cgroups vector. Returns NULL for slab pages or
> uncharged pages. Otherwise, returns memory cgroup for charged pages
> (e.g. the kmem pages, the LRU pages).
>
> - page_memcg_check()
>
> In some place, we use page_memcg() to check whether the page is charged.
> Now introduce page_memcg_charged() helper to do that.
>
> This is a preparation for reparenting the kmem pages.
>
> Signed-off-by: Muchun Song <[email protected]>
This patch also looks good to me, but, please, make it safe for adding
new memcg_data flags. E.g. if someone will add a new flag with a completely
new meaning, it shouldn't break the code.
I'll ack it after another look at the final version, but overall it
looks good.
Thanks!
> ---
> include/linux/memcontrol.h | 36 ++++++++++++++++++++++++++++--------
> mm/memcontrol.c | 23 +++++++++++++----------
> mm/page_alloc.c | 4 ++--
> 3 files changed, 43 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e6dc793d587d..049b80246cbf 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -358,14 +358,26 @@ enum page_memcg_data_flags {
>
> #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
>
> +/* Return true for charged page, otherwise false. */
> +static inline bool page_memcg_charged(struct page *page)
> +{
> + unsigned long memcg_data = page->memcg_data;
> +
> + VM_BUG_ON_PAGE(PageSlab(page), page);
> + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
> +
> + return !!memcg_data;
> +}
> +
> /*
> - * page_memcg - get the memory cgroup associated with a page
> + * page_memcg - get the memory cgroup associated with a non-kmem page
> * @page: a pointer to the page struct
> *
> * Returns a pointer to the memory cgroup associated with the page,
> * or NULL. This function assumes that the page is known to have a
> * proper memory cgroup pointer. It's not safe to call this function
> - * against some type of pages, e.g. slab pages or ex-slab pages.
> + * against some type of pages, e.g. slab pages, kmem pages or ex-slab
> + * pages.
> *
> * Any of the following ensures page and memcg binding stability:
> * - the page lock
> @@ -378,27 +390,30 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
> unsigned long memcg_data = page->memcg_data;
>
> VM_BUG_ON_PAGE(PageSlab(page), page);
> - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
> + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_FLAGS_MASK, page);
>
> - return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> + return (struct mem_cgroup *)memcg_data;
> }
>
> /*
> - * page_memcg_rcu - locklessly get the memory cgroup associated with a page
> + * page_memcg_rcu - locklessly get the memory cgroup associated with a non-kmem page
> * @page: a pointer to the page struct
> *
> * Returns a pointer to the memory cgroup associated with the page,
> * or NULL. This function assumes that the page is known to have a
> * proper memory cgroup pointer. It's not safe to call this function
> - * against some type of pages, e.g. slab pages or ex-slab pages.
> + * against some type of pages, e.g. slab pages, kmem pages or ex-slab
> + * pages.
> */
> static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> {
> + unsigned long memcg_data = READ_ONCE(page->memcg_data);
> +
> VM_BUG_ON_PAGE(PageSlab(page), page);
> + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_FLAGS_MASK, page);
> WARN_ON_ONCE(!rcu_read_lock_held());
>
> - return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) &
> - ~MEMCG_DATA_FLAGS_MASK);
> + return (struct mem_cgroup *)memcg_data;
> }
>
> /*
> @@ -1072,6 +1087,11 @@ void mem_cgroup_split_huge_fixup(struct page *head);
>
> struct mem_cgroup;
>
> +static inline bool page_memcg_charged(struct page *page)
> +{
> + return false;
> +}
> +
> static inline struct mem_cgroup *page_memcg(struct page *page)
> {
> return NULL;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index faae16def127..86a8db937ec6 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -855,10 +855,11 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
> int val)
> {
> struct page *head = compound_head(page); /* rmap on tail pages */
> - struct mem_cgroup *memcg = page_memcg(head);
> + struct mem_cgroup *memcg;
> pg_data_t *pgdat = page_pgdat(page);
> struct lruvec *lruvec;
>
> + memcg = page_memcg_check(head);
> /* Untracked pages have no memcg, no lruvec. Update only the node */
> if (!memcg) {
> __mod_node_page_state(pgdat, idx, val);
> @@ -3166,12 +3167,13 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
> */
> void __memcg_kmem_uncharge_page(struct page *page, int order)
> {
> - struct mem_cgroup *memcg = page_memcg(page);
> + struct mem_cgroup *memcg;
> unsigned int nr_pages = 1 << order;
>
> - if (!memcg)
> + if (!page_memcg_charged(page))
> return;
>
> + memcg = page_memcg_check(page);
> VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
> __memcg_kmem_uncharge(memcg, nr_pages);
> page->memcg_data = 0;
> @@ -6827,24 +6829,25 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> {
> unsigned long nr_pages;
> + struct mem_cgroup *memcg;
>
> VM_BUG_ON_PAGE(PageLRU(page), page);
>
> - if (!page_memcg(page))
> + if (!page_memcg_charged(page))
> return;
>
> /*
> * Nobody should be changing or seriously looking at
> - * page_memcg(page) at this point, we have fully
> - * exclusive access to the page.
> + * page memcg at this point, we have fully exclusive
> + * access to the page.
> */
> -
> - if (ug->memcg != page_memcg(page)) {
> + memcg = page_memcg_check(page);
> + if (ug->memcg != memcg) {
> if (ug->memcg) {
> uncharge_batch(ug);
> uncharge_gather_clear(ug);
> }
> - ug->memcg = page_memcg(page);
> + ug->memcg = memcg;
>
> /* pairs with css_put in uncharge_batch */
> css_get(&ug->memcg->css);
> @@ -6877,7 +6880,7 @@ void mem_cgroup_uncharge(struct page *page)
> return;
>
> /* Don't touch page->lru of any random page, pre-check: */
> - if (!page_memcg(page))
> + if (!page_memcg_charged(page))
> return;
>
> uncharge_gather_clear(&ug);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f10966e3b4a5..bcb58ae15e24 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1124,7 +1124,7 @@ static inline bool page_expected_state(struct page *page,
> if (unlikely((unsigned long)page->mapping |
> page_ref_count(page) |
> #ifdef CONFIG_MEMCG
> - (unsigned long)page_memcg(page) |
> + page_memcg_charged(page) |
> #endif
> (page->flags & check_flags)))
> return false;
> @@ -1149,7 +1149,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
> bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
> }
> #ifdef CONFIG_MEMCG
> - if (unlikely(page_memcg(page)))
> + if (unlikely(page_memcg_charged(page)))
> bad_reason = "page still charged to cgroup";
> #endif
> return bad_reason;
> --
> 2.11.0
>
On Wed, Mar 03, 2021 at 01:59:15PM +0800, Muchun Song wrote:
> Since Roman series "The new cgroup slab memory controller" applied. All
> slab objects are charged via the new APIs of obj_cgroup. The new APIs
> introduce a struct obj_cgroup to charge slab objects. It prevents
> long-living objects from pinning the original memory cgroup in the memory.
> But there are still some corner objects (e.g. allocations larger than
> order-1 page on SLUB) which are not charged via the new APIs. Those
> objects (include the pages which are allocated from buddy allocator
> directly) are charged as kmem pages which still hold a reference to
> the memory cgroup.
>
> This patch aims to charge the kmem pages by using the new APIs of
> obj_cgroup. Finally, the page->memcg_data of the kmem page points to
> an object cgroup. We can use the page_objcg() to get the object
> cgroup associated with a kmem page. Or we can use page_memcg_check()
> to get the memory cgroup associated with a kmem page, but caller must
> ensure that the returned memcg won't be released (e.g. acquire the
> rcu_read_lock or css_set_lock).
I believe it's a good direction, but there are still things which
need to be figured out first.
>
> Signed-off-by: Muchun Song <[email protected]>
> ---
> include/linux/memcontrol.h | 63 +++++++++++++++++------
> mm/memcontrol.c | 123 +++++++++++++++++++++++++++++++--------------
> 2 files changed, 133 insertions(+), 53 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 049b80246cbf..5911b9d107b0 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -370,6 +370,18 @@ static inline bool page_memcg_charged(struct page *page)
> }
>
> /*
> + * After the initialization objcg->memcg is always pointing at
> + * a valid memcg, but can be atomically swapped to the parent memcg.
> + *
> + * The caller must ensure that the returned memcg won't be released:
> + * e.g. acquire the rcu_read_lock or css_set_lock.
> + */
> +static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> +{
> + return READ_ONCE(objcg->memcg);
> +}
> +
> +/*
> * page_memcg - get the memory cgroup associated with a non-kmem page
> * @page: a pointer to the page struct
> *
> @@ -421,9 +433,10 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> * @page: a pointer to the page struct
> *
> * Returns a pointer to the memory cgroup associated with the page,
> - * or NULL. This function unlike page_memcg() can take any page
> + * or NULL. This function unlike page_memcg() can take any non-kmem page
> * as an argument. It has to be used in cases when it's not known if a page
> - * has an associated memory cgroup pointer or an object cgroups vector.
> + * has an associated memory cgroup pointer or an object cgroups vector or
> + * an object cgroup.
> *
> * Any of the following ensures page and memcg binding stability:
> * - the page lock
> @@ -442,6 +455,17 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
> if (memcg_data & MEMCG_DATA_OBJCGS)
> return NULL;
>
> + if (memcg_data & MEMCG_DATA_KMEM) {
This is confusing: the comment above says it can't take kmem pages?
> + struct obj_cgroup *objcg;
> +
> + /*
> + * The caller must ensure that the returned memcg won't be
> + * released: e.g. acquire the rcu_read_lock or css_set_lock.
> + */
> + objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> + return obj_cgroup_memcg(objcg);
> + }
> +
> return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
Also, the comment about page<->memcg binding stability is not correct anymore.
Taking page_lock, for example, won't protect memcg from being released,
if this is a kmem page.
_Maybe_ it's ok to just say that page_memcg_check() requires a rcu lock,
but I'm not yet quite sure. The calling convention is already complicated,
we should avoid making it even more complicated, if we can.
> }
>
> @@ -500,6 +524,24 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
> return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> }
>
> +/*
> + * page_objcg - get the object cgroup associated with a kmem page
> + * @page: a pointer to the page struct
> + *
> + * Returns a pointer to the object cgroup associated with the kmem page,
> + * or NULL. This function assumes that the page is known to have an
> + * associated object cgroup. It's only safe to call this function
> + * against kmem pages (PageMemcgKmem() returns true).
> + */
> +static inline struct obj_cgroup *page_objcg(struct page *page)
> +{
> + unsigned long memcg_data = page->memcg_data;
> +
> + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
> + VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
> +
> + return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> +}
> #else
> static inline struct obj_cgroup **page_objcgs(struct page *page)
> {
> @@ -510,6 +552,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
> {
> return NULL;
> }
> +
> +static inline struct obj_cgroup *page_objcg(struct page *page)
> +{
> + return NULL;
> +}
> #endif
>
> static __always_inline bool memcg_stat_item_in_bytes(int idx)
> @@ -728,18 +775,6 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
> percpu_ref_put(&objcg->refcnt);
> }
>
> -/*
> - * After the initialization objcg->memcg is always pointing at
> - * a valid memcg, but can be atomically swapped to the parent memcg.
> - *
> - * The caller must ensure that the returned memcg won't be released:
> - * e.g. acquire the rcu_read_lock or css_set_lock.
> - */
> -static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> -{
> - return READ_ONCE(objcg->memcg);
> -}
> -
> static inline void mem_cgroup_put(struct mem_cgroup *memcg)
> {
> if (memcg)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 86a8db937ec6..0cf342d22547 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -856,10 +856,16 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
> {
> struct page *head = compound_head(page); /* rmap on tail pages */
> struct mem_cgroup *memcg;
> - pg_data_t *pgdat = page_pgdat(page);
> + pg_data_t *pgdat;
> struct lruvec *lruvec;
>
> - memcg = page_memcg_check(head);
> + if (PageMemcgKmem(head)) {
> + __mod_lruvec_kmem_state(page_to_virt(head), idx, val);
> + return;
> + }
This is a very confusing part: we're converting the page to the virtual address
to run mem_cgroup_from_obj() inside __mod_lruvec_kmem_state() to get back the page.
> +
> + pgdat = page_pgdat(head);
> + memcg = page_memcg(head);
> /* Untracked pages have no memcg, no lruvec. Update only the node */
> if (!memcg) {
> __mod_node_page_state(pgdat, idx, val);
> @@ -3144,18 +3150,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
> */
> int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
> {
> - struct mem_cgroup *memcg;
> + struct obj_cgroup *objcg;
> int ret = 0;
>
> - memcg = get_mem_cgroup_from_current();
> - if (memcg && !mem_cgroup_is_root(memcg)) {
> - ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
> + objcg = get_obj_cgroup_from_current();
> + if (objcg) {
> + ret = obj_cgroup_charge_page(objcg, gfp, 1 << order);
> if (!ret) {
> - page->memcg_data = (unsigned long)memcg |
> + page->memcg_data = (unsigned long)objcg |
> MEMCG_DATA_KMEM;
> return 0;
> }
> - css_put(&memcg->css);
> + obj_cgroup_put(objcg);
> }
> return ret;
> }
> @@ -3167,17 +3173,18 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
> */
> void __memcg_kmem_uncharge_page(struct page *page, int order)
> {
> - struct mem_cgroup *memcg;
> + struct obj_cgroup *objcg;
> unsigned int nr_pages = 1 << order;
>
> if (!page_memcg_charged(page))
> return;
>
> - memcg = page_memcg_check(page);
> - VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
> - __memcg_kmem_uncharge(memcg, nr_pages);
> + VM_BUG_ON_PAGE(!PageMemcgKmem(page), page);
> +
> + objcg = page_objcg(page);
> + obj_cgroup_uncharge_page(objcg, nr_pages);
> page->memcg_data = 0;
> - css_put(&memcg->css);
> + obj_cgroup_put(objcg);
> }
>
> static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> @@ -6794,8 +6801,12 @@ struct uncharge_gather {
> struct mem_cgroup *memcg;
> unsigned long nr_pages;
> unsigned long pgpgout;
> - unsigned long nr_kmem;
> struct page *dummy_page;
> +
> +#ifdef CONFIG_MEMCG_KMEM
> + struct obj_cgroup *objcg;
> + unsigned long nr_kmem;
> +#endif
> };
>
> static inline void uncharge_gather_clear(struct uncharge_gather *ug)
> @@ -6807,12 +6818,21 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> {
> unsigned long flags;
>
> +#ifdef CONFIG_MEMCG_KMEM
> + if (ug->objcg) {
> + obj_cgroup_uncharge_page(ug->objcg, ug->nr_kmem);
> + /* drop reference from uncharge_kmem_page */
> + obj_cgroup_put(ug->objcg);
> + }
> +#endif
Hm, an obvious question here is why do we need to double the ug infrastructure
if we can just get kmem page's memcg and use the infra for user pages?
Because ug is holding a reference to memcg, it will not go away.
Maybe I'm missing something, but it seems that there is a simpler implementation.
Thanks!
> +
> + if (!ug->memcg)
> + return;
> +
> if (!mem_cgroup_is_root(ug->memcg)) {
> page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
> if (do_memsw_account())
> page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
> - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
> - page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
> memcg_oom_recover(ug->memcg);
> }
>
> @@ -6822,26 +6842,40 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> memcg_check_events(ug->memcg, ug->dummy_page);
> local_irq_restore(flags);
>
> - /* drop reference from uncharge_page */
> + /* drop reference from uncharge_user_page */
> css_put(&ug->memcg->css);
> }
>
> -static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> +#ifdef CONFIG_MEMCG_KMEM
> +static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
> {
> - unsigned long nr_pages;
> - struct mem_cgroup *memcg;
> + struct obj_cgroup *objcg = page_objcg(page);
>
> - VM_BUG_ON_PAGE(PageLRU(page), page);
> + if (ug->objcg != objcg) {
> + if (ug->objcg) {
> + uncharge_batch(ug);
> + uncharge_gather_clear(ug);
> + }
> + ug->objcg = objcg;
>
> - if (!page_memcg_charged(page))
> - return;
> + /* pairs with obj_cgroup_put in uncharge_batch */
> + obj_cgroup_get(ug->objcg);
> + }
> +
> + ug->nr_kmem += compound_nr(page);
> + page->memcg_data = 0;
> + obj_cgroup_put(ug->objcg);
> +}
> +#else
> +static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
> +{
> +}
> +#endif
> +
> +static void uncharge_user_page(struct page *page, struct uncharge_gather *ug)
> +{
> + struct mem_cgroup *memcg = page_memcg(page);
>
> - /*
> - * Nobody should be changing or seriously looking at
> - * page memcg at this point, we have fully exclusive
> - * access to the page.
> - */
> - memcg = page_memcg_check(page);
> if (ug->memcg != memcg) {
> if (ug->memcg) {
> uncharge_batch(ug);
> @@ -6852,18 +6886,30 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> /* pairs with css_put in uncharge_batch */
> css_get(&ug->memcg->css);
> }
> + ug->pgpgout++;
> + ug->dummy_page = page;
> +
> + ug->nr_pages += compound_nr(page);
> + page->memcg_data = 0;
> + css_put(&ug->memcg->css);
> +}
>
> - nr_pages = compound_nr(page);
> - ug->nr_pages += nr_pages;
> +static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> +{
> + VM_BUG_ON_PAGE(PageLRU(page), page);
>
> + if (!page_memcg_charged(page))
> + return;
> +
> + /*
> + * Nobody should be changing or seriously looking at
> + * page memcg at this point, we have fully exclusive
> + * access to the page.
> + */
> if (PageMemcgKmem(page))
> - ug->nr_kmem += nr_pages;
> + uncharge_kmem_page(page, ug);
> else
> - ug->pgpgout++;
> -
> - ug->dummy_page = page;
> - page->memcg_data = 0;
> - css_put(&ug->memcg->css);
> + uncharge_user_page(page, ug);
> }
>
> /**
> @@ -6906,8 +6952,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
> uncharge_gather_clear(&ug);
> list_for_each_entry(page, page_list, lru)
> uncharge_page(page, &ug);
> - if (ug.memcg)
> - uncharge_batch(&ug);
> + uncharge_batch(&ug);
> }
>
> /**
> --
> 2.11.0
>
On Wed, Mar 03, 2021 at 01:59:16PM +0800, Muchun Song wrote:
> The remote memcg charing APIs is a mechanism to charge pages to a given
> memcg. Since all kernel memory are charged by using obj_cgroup APIs.
> Actually, we want to charge kernel memory to the remote object cgroup
> instead of memory cgroup. So introduce remote objcg charging APIs to
> charge the kmem pages by using objcg_cgroup APIs. And if remote memcg
> and objcg are both set, objcg takes precedence over memcg to charge
> the kmem pages.
>
> In the later patch, we will use those API to charge kernel memory to
> the remote objcg.
I'd abandon/postpone the rest of the patchset (patches 4 and 5) for now.
They add a lot of new code to solve a theoretical problem (please correct
me if I'm wrong), which is not a panic or data corruption, but
a sub-optimal garbage collection behavior. I think we need a better
motivation or/and an implementation which makes the code simpler
and smaller.
>
> Signed-off-by: Muchun Song <[email protected]>
> ---
> include/linux/sched.h | 4 ++++
> include/linux/sched/mm.h | 38 ++++++++++++++++++++++++++++++++++++++
> kernel/fork.c | 3 +++
> mm/memcontrol.c | 44 ++++++++++++++++++++++++++++++++++++++++----
> 4 files changed, 85 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ee46f5cab95b..8edcc71a0a1d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1318,6 +1318,10 @@ struct task_struct {
> /* Used by memcontrol for targeted memcg charge: */
> struct mem_cgroup *active_memcg;
> #endif
> +#ifdef CONFIG_MEMCG_KMEM
> + /* Used by memcontrol for targeted objcg charge: */
> + struct obj_cgroup *active_objcg;
> +#endif
>
> #ifdef CONFIG_BLK_CGROUP
> struct request_queue *throttle_queue;
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index 1ae08b8462a4..be1189598b09 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -330,6 +330,44 @@ set_active_memcg(struct mem_cgroup *memcg)
> }
> #endif
>
> +#ifdef CONFIG_MEMCG_KMEM
> +DECLARE_PER_CPU(struct obj_cgroup *, int_active_objcg);
> +
> +/**
> + * set_active_objcg - Starts the remote objcg kmem pages charging scope.
> + * @objcg: objcg to charge.
> + *
> + * This function marks the beginning of the remote objcg charging scope. All the
> + * __GFP_ACCOUNT kmem page allocations till the end of the scope will be charged
> + * to the given objcg.
> + *
> + * NOTE: This function can nest. Users must save the return value and
> + * reset the previous value after their own charging scope is over.
> + *
> + * If remote memcg and objcg are both set, objcg takes precedence over memcg
> + * to charge the kmem pages.
> + */
> +static inline struct obj_cgroup *set_active_objcg(struct obj_cgroup *objcg)
> +{
> + struct obj_cgroup *old;
> +
> + if (in_interrupt()) {
> + old = this_cpu_read(int_active_objcg);
> + this_cpu_write(int_active_objcg, objcg);
> + } else {
> + old = current->active_objcg;
> + current->active_objcg = objcg;
> + }
> +
> + return old;
> +}
> +#else
> +static inline struct obj_cgroup *set_active_objcg(struct obj_cgroup *objcg)
> +{
> + return NULL;
> +}
> +#endif
> +
> #ifdef CONFIG_MEMBARRIER
> enum {
> MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d66cd1014211..b4b9dd5d122f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -945,6 +945,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
> #ifdef CONFIG_MEMCG
> tsk->active_memcg = NULL;
> #endif
> +#ifdef CONFIG_MEMCG_KMEM
> + tsk->active_objcg = NULL;
> +#endif
> return tsk;
>
> free_stack:
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 0cf342d22547..e48d4ab0af76 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -79,6 +79,11 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
> /* Active memory cgroup to use from an interrupt context */
> DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
>
> +#ifdef CONFIG_MEMCG_KMEM
> +/* Active object cgroup to use from an interrupt context */
> +DEFINE_PER_CPU(struct obj_cgroup *, int_active_objcg);
> +#endif
> +
> /* Socket memory accounting disabled? */
> static bool cgroup_memory_nosocket;
>
> @@ -1076,7 +1081,7 @@ static __always_inline struct mem_cgroup *get_active_memcg(void)
> return memcg;
> }
>
> -static __always_inline bool memcg_kmem_bypass(void)
> +static __always_inline bool memcg_charge_bypass(void)
> {
> /* Allow remote memcg charging from any context. */
> if (unlikely(active_memcg()))
> @@ -1094,7 +1099,7 @@ static __always_inline bool memcg_kmem_bypass(void)
> */
> static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
> {
> - if (memcg_kmem_bypass())
> + if (memcg_charge_bypass())
> return NULL;
>
> if (unlikely(active_memcg()))
> @@ -1103,6 +1108,29 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
> return get_mem_cgroup_from_mm(current->mm);
> }
>
> +#ifdef CONFIG_MEMCG_KMEM
> +static __always_inline struct obj_cgroup *active_objcg(void)
> +{
> + if (in_interrupt())
> + return this_cpu_read(int_active_objcg);
> + else
> + return current->active_objcg;
> +}
> +
> +static __always_inline bool kmem_charge_bypass(void)
> +{
> + /* Allow remote charging from any context. */
> + if (unlikely(active_objcg() || active_memcg()))
> + return false;
> +
> + /* Memcg to charge can't be determined. */
> + if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
> + return true;
> +
> + return false;
> +}
> +#endif
> +
> /**
> * mem_cgroup_iter - iterate over memory cgroup hierarchy
> * @root: hierarchy root
> @@ -2997,13 +3025,20 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
>
> __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
> {
> - struct obj_cgroup *objcg = NULL;
> + struct obj_cgroup *objcg;
> struct mem_cgroup *memcg;
>
> - if (memcg_kmem_bypass())
> + if (kmem_charge_bypass())
> return NULL;
>
> rcu_read_lock();
> + objcg = active_objcg();
> + if (unlikely(objcg)) {
> + /* remote object cgroup must hold a reference. */
> + obj_cgroup_get(objcg);
> + goto out;
> + }
> +
> if (unlikely(active_memcg()))
> memcg = active_memcg();
> else
> @@ -3015,6 +3050,7 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
> break;
> objcg = NULL;
> }
> +out:
> rcu_read_unlock();
>
> return objcg;
> --
> 2.11.0
>
On Sat, Mar 6, 2021 at 3:00 AM Roman Gushchin <[email protected]> wrote:
>
> On Wed, Mar 03, 2021 at 01:59:14PM +0800, Muchun Song wrote:
> > We want to reuse the obj_cgroup APIs to charge the kmem pages when
> > If we do that, we should store an object cgroup pointer to
> > page->memcg_data for the kmem pages.
> >
> > Finally, page->memcg_data can have 3 different meanings.
> >
> > 1) For the slab pages, page->memcg_data points to an object cgroups
> > vector.
> >
> > 2) For the kmem pages (exclude the slab pages), page->memcg_data
> > points to an object cgroup.
> >
> > 3) For the user pages (e.g. the LRU pages), page->memcg_data points
> > to a memory cgroup.
> >
> > Currently we always get the memory cgroup associated with a page via
> > page_memcg() or page_memcg_rcu(). page_memcg_check() is special, it
> > has to be used in cases when it's not known if a page has an
> > associated memory cgroup pointer or an object cgroups vector. Because
> > the page->memcg_data of the kmem page is not pointing to a memory
> > cgroup in the later patch, the page_memcg() and page_memcg_rcu()
> > cannot be applicable for the kmem pages. In this patch, make
> > page_memcg() and page_memcg_rcu() no longer apply to the kmem pages.
> > We do not change the behavior of the page_memcg_check(), it is also
> > applicable for the kmem pages.
> >
> > In the end, there are 3 helpers to get the memcg associated with a page.
> > Usage is as follows.
> >
> > 1) Get the memory cgroup associated with a non-kmem page (e.g. the LRU
> > pages).
> >
> > - page_memcg()
> > - page_memcg_rcu()
> >
> > 2) Get the memory cgroup associated with a page. It has to be used in
> > cases when it's not known if a page has an associated memory cgroup
> > pointer or an object cgroups vector. Returns NULL for slab pages or
> > uncharged pages. Otherwise, returns memory cgroup for charged pages
> > (e.g. the kmem pages, the LRU pages).
> >
> > - page_memcg_check()
> >
> > In some place, we use page_memcg() to check whether the page is charged.
> > Now introduce page_memcg_charged() helper to do that.
> >
> > This is a preparation for reparenting the kmem pages.
> >
> > Signed-off-by: Muchun Song <[email protected]>
>
> This patch also looks good to me, but, please, make it safe for adding
> new memcg_data flags. E.g. if someone will add a new flag with a completely
> new meaning, it shouldn't break the code.
I'm not thoughtful enough. Will do that in the next version.
Thanks for your suggestions.
>
> I'll ack it after another look at the final version, but overall it
> looks good.
>
> Thanks!
>
> > ---
> > include/linux/memcontrol.h | 36 ++++++++++++++++++++++++++++--------
> > mm/memcontrol.c | 23 +++++++++++++----------
> > mm/page_alloc.c | 4 ++--
> > 3 files changed, 43 insertions(+), 20 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index e6dc793d587d..049b80246cbf 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -358,14 +358,26 @@ enum page_memcg_data_flags {
> >
> > #define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
> >
> > +/* Return true for charged page, otherwise false. */
> > +static inline bool page_memcg_charged(struct page *page)
> > +{
> > + unsigned long memcg_data = page->memcg_data;
> > +
> > + VM_BUG_ON_PAGE(PageSlab(page), page);
> > + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
> > +
> > + return !!memcg_data;
> > +}
> > +
> > /*
> > - * page_memcg - get the memory cgroup associated with a page
> > + * page_memcg - get the memory cgroup associated with a non-kmem page
> > * @page: a pointer to the page struct
> > *
> > * Returns a pointer to the memory cgroup associated with the page,
> > * or NULL. This function assumes that the page is known to have a
> > * proper memory cgroup pointer. It's not safe to call this function
> > - * against some type of pages, e.g. slab pages or ex-slab pages.
> > + * against some type of pages, e.g. slab pages, kmem pages or ex-slab
> > + * pages.
> > *
> > * Any of the following ensures page and memcg binding stability:
> > * - the page lock
> > @@ -378,27 +390,30 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
> > unsigned long memcg_data = page->memcg_data;
> >
> > VM_BUG_ON_PAGE(PageSlab(page), page);
> > - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
> > + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_FLAGS_MASK, page);
> >
> > - return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> > + return (struct mem_cgroup *)memcg_data;
> > }
> >
> > /*
> > - * page_memcg_rcu - locklessly get the memory cgroup associated with a page
> > + * page_memcg_rcu - locklessly get the memory cgroup associated with a non-kmem page
> > * @page: a pointer to the page struct
> > *
> > * Returns a pointer to the memory cgroup associated with the page,
> > * or NULL. This function assumes that the page is known to have a
> > * proper memory cgroup pointer. It's not safe to call this function
> > - * against some type of pages, e.g. slab pages or ex-slab pages.
> > + * against some type of pages, e.g. slab pages, kmem pages or ex-slab
> > + * pages.
> > */
> > static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> > {
> > + unsigned long memcg_data = READ_ONCE(page->memcg_data);
> > +
> > VM_BUG_ON_PAGE(PageSlab(page), page);
> > + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_FLAGS_MASK, page);
> > WARN_ON_ONCE(!rcu_read_lock_held());
> >
> > - return (struct mem_cgroup *)(READ_ONCE(page->memcg_data) &
> > - ~MEMCG_DATA_FLAGS_MASK);
> > + return (struct mem_cgroup *)memcg_data;
> > }
> >
> > /*
> > @@ -1072,6 +1087,11 @@ void mem_cgroup_split_huge_fixup(struct page *head);
> >
> > struct mem_cgroup;
> >
> > +static inline bool page_memcg_charged(struct page *page)
> > +{
> > + return false;
> > +}
> > +
> > static inline struct mem_cgroup *page_memcg(struct page *page)
> > {
> > return NULL;
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index faae16def127..86a8db937ec6 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -855,10 +855,11 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
> > int val)
> > {
> > struct page *head = compound_head(page); /* rmap on tail pages */
> > - struct mem_cgroup *memcg = page_memcg(head);
> > + struct mem_cgroup *memcg;
> > pg_data_t *pgdat = page_pgdat(page);
> > struct lruvec *lruvec;
> >
> > + memcg = page_memcg_check(head);
> > /* Untracked pages have no memcg, no lruvec. Update only the node */
> > if (!memcg) {
> > __mod_node_page_state(pgdat, idx, val);
> > @@ -3166,12 +3167,13 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
> > */
> > void __memcg_kmem_uncharge_page(struct page *page, int order)
> > {
> > - struct mem_cgroup *memcg = page_memcg(page);
> > + struct mem_cgroup *memcg;
> > unsigned int nr_pages = 1 << order;
> >
> > - if (!memcg)
> > + if (!page_memcg_charged(page))
> > return;
> >
> > + memcg = page_memcg_check(page);
> > VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
> > __memcg_kmem_uncharge(memcg, nr_pages);
> > page->memcg_data = 0;
> > @@ -6827,24 +6829,25 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> > static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> > {
> > unsigned long nr_pages;
> > + struct mem_cgroup *memcg;
> >
> > VM_BUG_ON_PAGE(PageLRU(page), page);
> >
> > - if (!page_memcg(page))
> > + if (!page_memcg_charged(page))
> > return;
> >
> > /*
> > * Nobody should be changing or seriously looking at
> > - * page_memcg(page) at this point, we have fully
> > - * exclusive access to the page.
> > + * page memcg at this point, we have fully exclusive
> > + * access to the page.
> > */
> > -
> > - if (ug->memcg != page_memcg(page)) {
> > + memcg = page_memcg_check(page);
> > + if (ug->memcg != memcg) {
> > if (ug->memcg) {
> > uncharge_batch(ug);
> > uncharge_gather_clear(ug);
> > }
> > - ug->memcg = page_memcg(page);
> > + ug->memcg = memcg;
> >
> > /* pairs with css_put in uncharge_batch */
> > css_get(&ug->memcg->css);
> > @@ -6877,7 +6880,7 @@ void mem_cgroup_uncharge(struct page *page)
> > return;
> >
> > /* Don't touch page->lru of any random page, pre-check: */
> > - if (!page_memcg(page))
> > + if (!page_memcg_charged(page))
> > return;
> >
> > uncharge_gather_clear(&ug);
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index f10966e3b4a5..bcb58ae15e24 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1124,7 +1124,7 @@ static inline bool page_expected_state(struct page *page,
> > if (unlikely((unsigned long)page->mapping |
> > page_ref_count(page) |
> > #ifdef CONFIG_MEMCG
> > - (unsigned long)page_memcg(page) |
> > + page_memcg_charged(page) |
> > #endif
> > (page->flags & check_flags)))
> > return false;
> > @@ -1149,7 +1149,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
> > bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
> > }
> > #ifdef CONFIG_MEMCG
> > - if (unlikely(page_memcg(page)))
> > + if (unlikely(page_memcg_charged(page)))
> > bad_reason = "page still charged to cgroup";
> > #endif
> > return bad_reason;
> > --
> > 2.11.0
> >
On Sat, Mar 6, 2021 at 3:41 AM Roman Gushchin <[email protected]> wrote:
>
> On Wed, Mar 03, 2021 at 01:59:15PM +0800, Muchun Song wrote:
> > Since Roman series "The new cgroup slab memory controller" applied. All
> > slab objects are charged via the new APIs of obj_cgroup. The new APIs
> > introduce a struct obj_cgroup to charge slab objects. It prevents
> > long-living objects from pinning the original memory cgroup in the memory.
> > But there are still some corner objects (e.g. allocations larger than
> > order-1 page on SLUB) which are not charged via the new APIs. Those
> > objects (include the pages which are allocated from buddy allocator
> > directly) are charged as kmem pages which still hold a reference to
> > the memory cgroup.
> >
> > This patch aims to charge the kmem pages by using the new APIs of
> > obj_cgroup. Finally, the page->memcg_data of the kmem page points to
> > an object cgroup. We can use page_objcg() to get the object cgroup
> > associated with a kmem page, or page_memcg_check() to get the memory
> > cgroup associated with a kmem page, but the caller must ensure that
> > the returned memcg won't be released (e.g. by acquiring the
> > rcu_read_lock or css_set_lock).
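For example, a caller that really needs a stable reference on the memcg of
a kmem page would have to pin it, roughly like this (just a sketch; such a
helper does not exist in this series and the name is made up):

static struct mem_cgroup *example_get_memcg_from_kmem_page(struct page *page)
{
        struct obj_cgroup *objcg = page_objcg(page);
        struct mem_cgroup *memcg;

        rcu_read_lock();
        do {
                /* objcg->memcg can be swapped to the parent at any time. */
                memcg = obj_cgroup_memcg(objcg);
        } while (!css_tryget(&memcg->css));
        rcu_read_unlock();

        return memcg;   /* the caller must css_put() it when done */
}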
>
> I believe it's a good direction, but there are still things which
> need to be figured out first.
>
> >
> > Signed-off-by: Muchun Song <[email protected]>
> > ---
> > include/linux/memcontrol.h | 63 +++++++++++++++++------
> > mm/memcontrol.c | 123 +++++++++++++++++++++++++++++++--------------
> > 2 files changed, 133 insertions(+), 53 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 049b80246cbf..5911b9d107b0 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -370,6 +370,18 @@ static inline bool page_memcg_charged(struct page *page)
> > }
> >
> > /*
> > + * After the initialization objcg->memcg is always pointing at
> > + * a valid memcg, but can be atomically swapped to the parent memcg.
> > + *
> > + * The caller must ensure that the returned memcg won't be released:
> > + * e.g. acquire the rcu_read_lock or css_set_lock.
> > + */
> > +static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> > +{
> > + return READ_ONCE(objcg->memcg);
> > +}
> > +
> > +/*
> > * page_memcg - get the memory cgroup associated with a non-kmem page
> > * @page: a pointer to the page struct
> > *
> > @@ -421,9 +433,10 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> > * @page: a pointer to the page struct
> > *
> > * Returns a pointer to the memory cgroup associated with the page,
> > - * or NULL. This function unlike page_memcg() can take any page
> > + * or NULL. This function unlike page_memcg() can take any non-kmem page
> > * as an argument. It has to be used in cases when it's not known if a page
> > - * has an associated memory cgroup pointer or an object cgroups vector.
> > + * has an associated memory cgroup pointer or an object cgroups vector or
> > + * an object cgroup.
> > *
> > * Any of the following ensures page and memcg binding stability:
> > * - the page lock
> > @@ -442,6 +455,17 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
> > if (memcg_data & MEMCG_DATA_OBJCGS)
> > return NULL;
> >
> > + if (memcg_data & MEMCG_DATA_KMEM) {
>
> This is confusing: the comment above says it can't take kmem pages?
It is my fault. I will fix the comment.
>
> > + struct obj_cgroup *objcg;
> > +
> > + /*
> > + * The caller must ensure that the returned memcg won't be
> > + * released: e.g. acquire the rcu_read_lock or css_set_lock.
> > + */
> > + objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> > + return obj_cgroup_memcg(objcg);
> > + }
> > +
> > return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
>
> Also, the comment about page<->memcg binding stability is not correct anymore.
> Taking page_lock, for example, won't protect memcg from being released,
> if this is a kmem page.
I also have a question: Will the page lock of the kmem page be taken
somewhere?
>
> _Maybe_ it's ok to just say that page_memcg_check() requires a rcu lock,
> but I'm not yet quite sure. The calling convention is already complicated,
> we should avoid making it even more complicated, if we can.
In most scenarios we can tell what kind of page we are dealing with, so
there end up being only 2 users of page_memcg_check(). I think it may be
acceptable to just complicate page_memcg_check() and document that it
requires the rcu lock.
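Maybe wording along these lines in the comment of page_memcg_check() (draft
only, to be refined in the next version):

 * If the page is a kmem page, the returned memcg is only stable as long
 * as the caller holds the rcu read lock (or css_set_lock): the page is
 * bound to an object cgroup, and objcg->memcg can change under the
 * caller due to reparenting.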
>
> > }
> >
> > @@ -500,6 +524,24 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
> > return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> > }
> >
> > +/*
> > + * page_objcg - get the object cgroup associated with a kmem page
> > + * @page: a pointer to the page struct
> > + *
> > + * Returns a pointer to the object cgroup associated with the kmem page,
> > + * or NULL. This function assumes that the page is known to have an
> > + * associated object cgroup. It's only safe to call this function
> > + * against kmem pages (PageMemcgKmem() returns true).
> > + */
> > +static inline struct obj_cgroup *page_objcg(struct page *page)
> > +{
> > + unsigned long memcg_data = page->memcg_data;
> > +
> > + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
> > + VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
> > +
> > + return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> > +}
> > #else
> > static inline struct obj_cgroup **page_objcgs(struct page *page)
> > {
> > @@ -510,6 +552,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
> > {
> > return NULL;
> > }
> > +
> > +static inline struct obj_cgroup *page_objcg(struct page *page)
> > +{
> > + return NULL;
> > +}
> > #endif
> >
> > static __always_inline bool memcg_stat_item_in_bytes(int idx)
> > @@ -728,18 +775,6 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
> > percpu_ref_put(&objcg->refcnt);
> > }
> >
> > -/*
> > - * After the initialization objcg->memcg is always pointing at
> > - * a valid memcg, but can be atomically swapped to the parent memcg.
> > - *
> > - * The caller must ensure that the returned memcg won't be released:
> > - * e.g. acquire the rcu_read_lock or css_set_lock.
> > - */
> > -static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> > -{
> > - return READ_ONCE(objcg->memcg);
> > -}
> > -
> > static inline void mem_cgroup_put(struct mem_cgroup *memcg)
> > {
> > if (memcg)
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 86a8db937ec6..0cf342d22547 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -856,10 +856,16 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
> > {
> > struct page *head = compound_head(page); /* rmap on tail pages */
> > struct mem_cgroup *memcg;
> > - pg_data_t *pgdat = page_pgdat(page);
> > + pg_data_t *pgdat;
> > struct lruvec *lruvec;
> >
> > - memcg = page_memcg_check(head);
> > + if (PageMemcgKmem(head)) {
> > + __mod_lruvec_kmem_state(page_to_virt(head), idx, val);
> > + return;
> > + }
>
> This is a very confusing part: we're converting the page to the virtual address
> to run mem_cgroup_from_obj() inside __mod_lruvec_kmem_state() to get back the page.
OK. I will rework the code here.
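Perhaps along these lines, reusing page_memcg_check() under rcu instead of
bouncing through the virtual address (untested sketch of the direction):

void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
                             int val)
{
        struct page *head = compound_head(page);       /* rmap on tail pages */
        pg_data_t *pgdat = page_pgdat(head);
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;

        rcu_read_lock();
        memcg = page_memcg_check(head);
        /* Untracked pages have no memcg, no lruvec. Update only the node. */
        if (!memcg) {
                rcu_read_unlock();
                __mod_node_page_state(pgdat, idx, val);
                return;
        }

        lruvec = mem_cgroup_lruvec(memcg, pgdat);
        __mod_lruvec_state(lruvec, idx, val);
        rcu_read_unlock();
}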
>
> > +
> > + pgdat = page_pgdat(head);
> > + memcg = page_memcg(head);
> > /* Untracked pages have no memcg, no lruvec. Update only the node */
> > if (!memcg) {
> > __mod_node_page_state(pgdat, idx, val);
> > @@ -3144,18 +3150,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
> > */
> > int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
> > {
> > - struct mem_cgroup *memcg;
> > + struct obj_cgroup *objcg;
> > int ret = 0;
> >
> > - memcg = get_mem_cgroup_from_current();
> > - if (memcg && !mem_cgroup_is_root(memcg)) {
> > - ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
> > + objcg = get_obj_cgroup_from_current();
> > + if (objcg) {
> > + ret = obj_cgroup_charge_page(objcg, gfp, 1 << order);
> > if (!ret) {
> > - page->memcg_data = (unsigned long)memcg |
> > + page->memcg_data = (unsigned long)objcg |
> > MEMCG_DATA_KMEM;
> > return 0;
> > }
> > - css_put(&memcg->css);
> > + obj_cgroup_put(objcg);
> > }
> > return ret;
> > }
> > @@ -3167,17 +3173,18 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
> > */
> > void __memcg_kmem_uncharge_page(struct page *page, int order)
> > {
> > - struct mem_cgroup *memcg;
> > + struct obj_cgroup *objcg;
> > unsigned int nr_pages = 1 << order;
> >
> > if (!page_memcg_charged(page))
> > return;
> >
> > - memcg = page_memcg_check(page);
> > - VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
> > - __memcg_kmem_uncharge(memcg, nr_pages);
> > + VM_BUG_ON_PAGE(!PageMemcgKmem(page), page);
> > +
> > + objcg = page_objcg(page);
> > + obj_cgroup_uncharge_page(objcg, nr_pages);
> > page->memcg_data = 0;
> > - css_put(&memcg->css);
> > + obj_cgroup_put(objcg);
> > }
> >
> > static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> > @@ -6794,8 +6801,12 @@ struct uncharge_gather {
> > struct mem_cgroup *memcg;
> > unsigned long nr_pages;
> > unsigned long pgpgout;
> > - unsigned long nr_kmem;
> > struct page *dummy_page;
> > +
> > +#ifdef CONFIG_MEMCG_KMEM
> > + struct obj_cgroup *objcg;
> > + unsigned long nr_kmem;
> > +#endif
> > };
> >
> > static inline void uncharge_gather_clear(struct uncharge_gather *ug)
> > @@ -6807,12 +6818,21 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> > {
> > unsigned long flags;
> >
> > +#ifdef CONFIG_MEMCG_KMEM
> > +     if (ug->objcg) {
> > +             obj_cgroup_uncharge_page(ug->objcg, ug->nr_kmem);
> > +             /* drop reference from uncharge_kmem_page */
> > +             obj_cgroup_put(ug->objcg);
> > +     }
> > +#endif
>
> Hm, an obvious question here is why do we need to double the ug infrastructure
> if we can just get kmem page's memcg and use the infra for user pages?
> Because ug is holding a reference to memcg, it will not go away.
> Maybe I'm missing something, but it seems that there is a simpler implementation.
Yeah, there is a simpler implementation. I will give it a try. Thanks for
your suggestions.
>
> Thanks!
>
> > +
> > + if (!ug->memcg)
> > + return;
> > +
> > if (!mem_cgroup_is_root(ug->memcg)) {
> > page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
> > if (do_memsw_account())
> > page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
> > - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
> > - page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
> > memcg_oom_recover(ug->memcg);
> > }
> >
> > @@ -6822,26 +6842,40 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> > memcg_check_events(ug->memcg, ug->dummy_page);
> > local_irq_restore(flags);
> >
> > - /* drop reference from uncharge_page */
> > + /* drop reference from uncharge_user_page */
> > css_put(&ug->memcg->css);
> > }
> >
> > -static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> > +#ifdef CONFIG_MEMCG_KMEM
> > +static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
> > {
> > - unsigned long nr_pages;
> > - struct mem_cgroup *memcg;
> > + struct obj_cgroup *objcg = page_objcg(page);
> >
> > - VM_BUG_ON_PAGE(PageLRU(page), page);
> > + if (ug->objcg != objcg) {
> > + if (ug->objcg) {
> > + uncharge_batch(ug);
> > + uncharge_gather_clear(ug);
> > + }
> > + ug->objcg = objcg;
> >
> > - if (!page_memcg_charged(page))
> > - return;
> > + /* pairs with obj_cgroup_put in uncharge_batch */
> > + obj_cgroup_get(ug->objcg);
> > + }
> > +
> > + ug->nr_kmem += compound_nr(page);
> > + page->memcg_data = 0;
> > + obj_cgroup_put(ug->objcg);
> > +}
> > +#else
> > +static void uncharge_kmem_page(struct page *page, struct uncharge_gather *ug)
> > +{
> > +}
> > +#endif
> > +
> > +static void uncharge_user_page(struct page *page, struct uncharge_gather *ug)
> > +{
> > + struct mem_cgroup *memcg = page_memcg(page);
> >
> > - /*
> > - * Nobody should be changing or seriously looking at
> > - * page memcg at this point, we have fully exclusive
> > - * access to the page.
> > - */
> > - memcg = page_memcg_check(page);
> > if (ug->memcg != memcg) {
> > if (ug->memcg) {
> > uncharge_batch(ug);
> > @@ -6852,18 +6886,30 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> > /* pairs with css_put in uncharge_batch */
> > css_get(&ug->memcg->css);
> > }
> > + ug->pgpgout++;
> > + ug->dummy_page = page;
> > +
> > + ug->nr_pages += compound_nr(page);
> > + page->memcg_data = 0;
> > + css_put(&ug->memcg->css);
> > +}
> >
> > - nr_pages = compound_nr(page);
> > - ug->nr_pages += nr_pages;
> > +static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> > +{
> > + VM_BUG_ON_PAGE(PageLRU(page), page);
> >
> > + if (!page_memcg_charged(page))
> > + return;
> > +
> > + /*
> > + * Nobody should be changing or seriously looking at
> > + * page memcg at this point, we have fully exclusive
> > + * access to the page.
> > + */
> > if (PageMemcgKmem(page))
> > - ug->nr_kmem += nr_pages;
> > + uncharge_kmem_page(page, ug);
> > else
> > - ug->pgpgout++;
> > -
> > - ug->dummy_page = page;
> > - page->memcg_data = 0;
> > - css_put(&ug->memcg->css);
> > + uncharge_user_page(page, ug);
> > }
> >
> > /**
> > @@ -6906,8 +6952,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
> > uncharge_gather_clear(&ug);
> > list_for_each_entry(page, page_list, lru)
> > uncharge_page(page, &ug);
> > - if (ug.memcg)
> > - uncharge_batch(&ug);
> > + uncharge_batch(&ug);
> > }
> >
> > /**
> > --
> > 2.11.0
> >