LinuxLists.cc - [PATCH 0/7] memcontrol code cleanup and simplification

[permalink] [raw]

Subject: [PATCH 4/7] mm: memcontrol: simplify lruvec_holds_page_lru_lock

We already have a helper lruvec_memcg() to get the memcg from lruvec, we
do not need to do it ourselves in the lruvec_holds_page_lru_lock(). So use
lruvec_memcg() instead. And if mem_cgroup_disabled() returns false, the
page_memcg(page) (the LRU pages) cannot be NULL. So remove the odd logic
of "memcg = page_memcg(page) ? : root_mem_cgroup". And use lruvec_pgdat
to simplify the code. We can have a single definition for this function
that works for !CONFIG_MEMCG, CONFIG_MEMCG + mem_cgroup_disabled() and
CONFIG_MEMCG.

Signed-off-by: Muchun Song <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 31 +++++++------------------------
1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4f49865c9958..38b8d3fb24ff 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -755,22 +755,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
return mem_cgroup_lruvec(memcg, pgdat);
}

-static inline bool lruvec_holds_page_lru_lock(struct page *page,
- struct lruvec *lruvec)
-{
- pg_data_t *pgdat = page_pgdat(page);
- const struct mem_cgroup *memcg;
- struct mem_cgroup_per_node *mz;
-
- if (mem_cgroup_disabled())
- return lruvec == &pgdat->__lruvec;
-
- mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- memcg = page_memcg(page) ? : root_mem_cgroup;
-
- return lruvec->pgdat == pgdat && mz->memcg == memcg;
-}
-
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);

struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
@@ -1229,14 +1213,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
return &pgdat->__lruvec;
}

-static inline bool lruvec_holds_page_lru_lock(struct page *page,
- struct lruvec *lruvec)
-{
- pg_data_t *pgdat = page_pgdat(page);
-
- return lruvec == &pgdat->__lruvec;
-}
-
static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
{
}
@@ -1518,6 +1494,13 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
}

+static inline bool lruvec_holds_page_lru_lock(struct page *page,
+ struct lruvec *lruvec)
+{
+ return lruvec_pgdat(lruvec) == page_pgdat(page) &&
+ lruvec_memcg(lruvec) == page_memcg(page);
+}
+
/* Don't lock again iff page's lruvec locked */
static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
struct lruvec *locked_lruvec)
--
2.11.0

2021-04-13 12:46:55

[permalink] [raw]

Subject: [PATCH 3/7] mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec

All the callers of mem_cgroup_page_lruvec() just pass page_pgdat(page)
as the 2nd parameter to it (except isolate_migratepages_block()). But
for isolate_migratepages_block(), the page_pgdat(page) is also equal
to the local variable of @pgdat. So mem_cgroup_page_lruvec() do not
need the pgdat parameter. Just remove it to simplify the code.

Signed-off-by: Muchun Song <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 10 +++++-----
mm/compaction.c | 2 +-
mm/memcontrol.c | 9 +++------
mm/swap.c | 2 +-
mm/workingset.c | 2 +-
5 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c960fd49c3e8..4f49865c9958 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -743,13 +743,12 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
/**
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
* @page: the page
- * @pgdat: pgdat of the page
*
* This function relies on page->mem_cgroup being stable.
*/
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
- struct pglist_data *pgdat)
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
{
+ pg_data_t *pgdat = page_pgdat(page);
struct mem_cgroup *memcg = page_memcg(page);

VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
@@ -1223,9 +1222,10 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
return &pgdat->__lruvec;
}

-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
- struct pglist_data *pgdat)
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
{
+ pg_data_t *pgdat = page_pgdat(page);
+
return &pgdat->__lruvec;
}

diff --git a/mm/compaction.c b/mm/compaction.c
index caa4c36c1db3..e7da342003dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1033,7 +1033,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!TestClearPageLRU(page))
goto isolate_fail_put;

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = mem_cgroup_page_lruvec(page);

/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9cbfff59b171..1f807448233e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1177,9 +1177,8 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
struct lruvec *lock_page_lruvec(struct page *page)
{
struct lruvec *lruvec;
- struct pglist_data *pgdat = page_pgdat(page);

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = mem_cgroup_page_lruvec(page);
spin_lock(&lruvec->lru_lock);

lruvec_memcg_debug(lruvec, page);
@@ -1190,9 +1189,8 @@ struct lruvec *lock_page_lruvec(struct page *page)
struct lruvec *lock_page_lruvec_irq(struct page *page)
{
struct lruvec *lruvec;
- struct pglist_data *pgdat = page_pgdat(page);

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irq(&lruvec->lru_lock);

lruvec_memcg_debug(lruvec, page);
@@ -1203,9 +1201,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page)
struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
{
struct lruvec *lruvec;
- struct pglist_data *pgdat = page_pgdat(page);

- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irqsave(&lruvec->lru_lock, *flags);

lruvec_memcg_debug(lruvec, page);
diff --git a/mm/swap.c b/mm/swap.c
index a75a8265302b..e0d5699213cc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -313,7 +313,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)

void lru_note_cost_page(struct page *page)
{
- lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
+ lru_note_cost(mem_cgroup_page_lruvec(page),
page_is_file_lru(page), thp_nr_pages(page));
}

diff --git a/mm/workingset.c b/mm/workingset.c
index b7cdeca5a76d..4f7a306ce75a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -408,7 +408,7 @@ void workingset_activation(struct page *page)
memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ lruvec = mem_cgroup_page_lruvec(page);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
out:
rcu_read_unlock();
--
2.11.0

2021-04-13 12:47:20

[permalink] [raw]

Subject: [PATCH 2/7] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm

When mm is NULL, we do not need to hold rcu lock and call css_tryget for
the root memcg. And we also do not need to check !mm in every loop of
while. So bail out early when !mm.

Signed-off-by: Muchun Song <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Reviewed-by: Shakeel Butt <[email protected]>
---
mm/memcontrol.c | 21 ++++++++++-----------
1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f229de925aa5..9cbfff59b171 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -901,20 +901,19 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
if (mem_cgroup_disabled())
return NULL;

+ /*
+ * Page cache insertions can happen without an
+ * actual mm context, e.g. during disk probing
+ * on boot, loopback IO, acct() writes etc.
+ */
+ if (unlikely(!mm))
+ return root_mem_cgroup;
+
rcu_read_lock();
do {
- /*
- * Page cache insertions can happen without an
- * actual mm context, e.g. during disk probing
- * on boot, loopback IO, acct() writes etc.
- */
- if (unlikely(!mm))
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
memcg = root_mem_cgroup;
- else {
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- memcg = root_mem_cgroup;
- }
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
--
2.11.0

2021-04-13 13:44:53

[permalink] [raw]

Subject: [PATCH 7/7] mm: vmscan: remove noinline_for_stack

The noinline_for_stack is introduced by commit 666356297ec4 ("vmscan:
set up pagevec as late as possible in shrink_inactive_list()"), its
purpose is to delay the allocation of pagevec as late as possible to
save stack memory. But the commit 2bcf88796381 ("mm: take pagevecs off
reclaim stack") replace pagevecs by lists of pages_to_free. So we do
not need noinline_for_stack, just remove it (let the compiler decide
whether to inline).

Signed-off-by: Muchun Song <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Acked-by: Roman Gushchin <[email protected]>
Reviewed-by: Shakeel Butt <[email protected]>
---
mm/vmscan.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 64bf07cc20f2..e40b21298d77 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2015,8 +2015,8 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
*
* Returns the number of pages moved to the given lruvec.
*/
-static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list)
+static unsigned int move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
@@ -2096,7 +2096,7 @@ static int current_may_throttle(void)
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
*/
-static noinline_for_stack unsigned long
+static unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, enum lru_list lru)
{
--
2.11.0

2021-04-13 13:44:53

[permalink] [raw]

Subject: [PATCH 5/7] mm: memcontrol: simplify the logic of objcg pinning memcg

The obj_cgroup_release() and memcg_reparent_objcgs() are serialized by
the css_set_lock. We do not need to care about objcg->memcg being
released in the process of obj_cgroup_release(). So there is no need
to pin memcg before releasing objcg. Remove those pinning logic to
simplfy the code.

There are only two places that modifies the objcg->memcg. One is the
initialization to objcg->memcg in the memcg_online_kmem(), another
is objcgs reparenting in the memcg_reparent_objcgs(). It is also
impossible for the two to run in parallel. So xchg() is unnecessary
and it is enough to use WRITE_ONCE().

Signed-off-by: Muchun Song <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
---
mm/memcontrol.c | 20 ++++++--------------
1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1f807448233e..42d8c0f4ab1d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -261,7 +261,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
static void obj_cgroup_release(struct percpu_ref *ref)
{
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
- struct mem_cgroup *memcg;
unsigned int nr_bytes;
unsigned int nr_pages;
unsigned long flags;
@@ -291,11 +290,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
nr_pages = nr_bytes >> PAGE_SHIFT;

spin_lock_irqsave(&css_set_lock, flags);
- memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
list_del(&objcg->list);
- mem_cgroup_put(memcg);
spin_unlock_irqrestore(&css_set_lock, flags);

percpu_ref_exit(ref);
@@ -330,17 +327,12 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,

spin_lock_irq(&css_set_lock);

- /* Move active objcg to the parent's list */
- xchg(&objcg->memcg, parent);
- css_get(&parent->css);
- list_add(&objcg->list, &parent->objcg_list);
-
- /* Move already reparented objcgs to the parent's list */
- list_for_each_entry(iter, &memcg->objcg_list, list) {
- css_get(&parent->css);
- xchg(&iter->memcg, parent);
- css_put(&memcg->css);
- }
+ /* 1) Ready to reparent active objcg. */
+ list_add(&objcg->list, &memcg->objcg_list);
+ /* 2) Reparent active objcg and already reparented objcgs to parent. */
+ list_for_each_entry(iter, &memcg->objcg_list, list)
+ WRITE_ONCE(iter->memcg, parent);
+ /* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);

spin_unlock_irq(&css_set_lock);
--
2.11.0

2021-04-13 15:26:15

by Shakeel Butt

[permalink] [raw]

Subject: Re: [PATCH 4/7] mm: memcontrol: simplify lruvec_holds_page_lru_lock

On Mon, Apr 12, 2021 at 11:57 PM Muchun Song <[email protected]> wrote:
>
> We already have a helper lruvec_memcg() to get the memcg from lruvec, we
> do not need to do it ourselves in the lruvec_holds_page_lru_lock(). So use
> lruvec_memcg() instead. And if mem_cgroup_disabled() returns false, the
> page_memcg(page) (the LRU pages) cannot be NULL. So remove the odd logic
> of "memcg = page_memcg(page) ? : root_mem_cgroup". And use lruvec_pgdat
> to simplify the code. We can have a single definition for this function
> that works for !CONFIG_MEMCG, CONFIG_MEMCG + mem_cgroup_disabled() and
> CONFIG_MEMCG.
>
> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>

Reviewed-by: Shakeel Butt <[email protected]>

2021-04-13 17:17:08

by Shakeel Butt

[permalink] [raw]

Subject: Re: [PATCH 3/7] mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec

On Mon, Apr 12, 2021 at 11:57 PM Muchun Song <[email protected]> wrote:
>
> All the callers of mem_cgroup_page_lruvec() just pass page_pgdat(page)
> as the 2nd parameter to it (except isolate_migratepages_block()). But
> for isolate_migratepages_block(), the page_pgdat(page) is also equal
> to the local variable of @pgdat. So mem_cgroup_page_lruvec() do not
> need the pgdat parameter. Just remove it to simplify the code.
>
> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>

Reviewed-by: Shakeel Butt <[email protected]>

2021-04-13 20:28:58

by Shakeel Butt

[permalink] [raw]

Subject: Re: [PATCH 5/7] mm: memcontrol: simplify the logic of objcg pinning memcg

On Mon, Apr 12, 2021 at 11:58 PM Muchun Song <[email protected]> wrote:
>
> The obj_cgroup_release() and memcg_reparent_objcgs() are serialized by
> the css_set_lock. We do not need to care about objcg->memcg being
> released in the process of obj_cgroup_release(). So there is no need
> to pin memcg before releasing objcg. Remove those pinning logic to
> simplfy the code.
>
> There are only two places that modifies the objcg->memcg. One is the
> initialization to objcg->memcg in the memcg_online_kmem(), another
> is objcgs reparenting in the memcg_reparent_objcgs(). It is also
> impossible for the two to run in parallel. So xchg() is unnecessary
> and it is enough to use WRITE_ONCE().
>
> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>

Reviewed-by: Shakeel Butt <[email protected]>

2021-04-13 22:22:52

[permalink] [raw]

Subject: Re: [PATCH 5/7] mm: memcontrol: simplify the logic of objcg pinning memcg

On Tue, Apr 13, 2021 at 02:51:51PM +0800, Muchun Song wrote:
> The obj_cgroup_release() and memcg_reparent_objcgs() are serialized by
> the css_set_lock. We do not need to care about objcg->memcg being
> released in the process of obj_cgroup_release(). So there is no need
> to pin memcg before releasing objcg. Remove those pinning logic to
> simplfy the code.
>
> There are only two places that modifies the objcg->memcg. One is the
> initialization to objcg->memcg in the memcg_online_kmem(), another
> is objcgs reparenting in the memcg_reparent_objcgs(). It is also
> impossible for the two to run in parallel. So xchg() is unnecessary
> and it is enough to use WRITE_ONCE().
>
> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>

It's a good one! It took me some time to realize that it's safe.
Thanks!

Acked-by: Roman Gushchin <[email protected]>

> ---
> mm/memcontrol.c | 20 ++++++--------------
> 1 file changed, 6 insertions(+), 14 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1f807448233e..42d8c0f4ab1d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -261,7 +261,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
> static void obj_cgroup_release(struct percpu_ref *ref)
> {
> struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
> - struct mem_cgroup *memcg;
> unsigned int nr_bytes;
> unsigned int nr_pages;
> unsigned long flags;
> @@ -291,11 +290,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
> nr_pages = nr_bytes >> PAGE_SHIFT;
>
> spin_lock_irqsave(&css_set_lock, flags);
> - memcg = obj_cgroup_memcg(objcg);
> if (nr_pages)
> obj_cgroup_uncharge_pages(objcg, nr_pages);
> list_del(&objcg->list);
> - mem_cgroup_put(memcg);
> spin_unlock_irqrestore(&css_set_lock, flags);
>
> percpu_ref_exit(ref);
> @@ -330,17 +327,12 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
>
> spin_lock_irq(&css_set_lock);
>
> - /* Move active objcg to the parent's list */
> - xchg(&objcg->memcg, parent);
> - css_get(&parent->css);
> - list_add(&objcg->list, &parent->objcg_list);
> -
> - /* Move already reparented objcgs to the parent's list */
> - list_for_each_entry(iter, &memcg->objcg_list, list) {
> - css_get(&parent->css);
> - xchg(&iter->memcg, parent);
> - css_put(&memcg->css);
> - }
> + /* 1) Ready to reparent active objcg. */
> + list_add(&objcg->list, &memcg->objcg_list);
> + /* 2) Reparent active objcg and already reparented objcgs to parent. */
> + list_for_each_entry(iter, &memcg->objcg_list, list)
> + WRITE_ONCE(iter->memcg, parent);
> + /* 3) Move already reparented objcgs to the parent's list */
> list_splice(&memcg->objcg_list, &parent->objcg_list);
>
> spin_unlock_irq(&css_set_lock);
> --
> 2.11.0
>

2021-04-13 22:30:27

[permalink] [raw]

Subject: Re: [PATCH 4/7] mm: memcontrol: simplify lruvec_holds_page_lru_lock

On Tue, Apr 13, 2021 at 02:51:50PM +0800, Muchun Song wrote:
> We already have a helper lruvec_memcg() to get the memcg from lruvec, we
> do not need to do it ourselves in the lruvec_holds_page_lru_lock(). So use
> lruvec_memcg() instead. And if mem_cgroup_disabled() returns false, the
> page_memcg(page) (the LRU pages) cannot be NULL. So remove the odd logic
> of "memcg = page_memcg(page) ? : root_mem_cgroup". And use lruvec_pgdat
> to simplify the code. We can have a single definition for this function
> that works for !CONFIG_MEMCG, CONFIG_MEMCG + mem_cgroup_disabled() and
> CONFIG_MEMCG.
>
> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>

Acked-by: Roman Gushchin <[email protected]>

> ---
> include/linux/memcontrol.h | 31 +++++++------------------------
> 1 file changed, 7 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 4f49865c9958..38b8d3fb24ff 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -755,22 +755,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
> return mem_cgroup_lruvec(memcg, pgdat);
> }
>
> -static inline bool lruvec_holds_page_lru_lock(struct page *page,
> - struct lruvec *lruvec)
> -{
> - pg_data_t *pgdat = page_pgdat(page);
> - const struct mem_cgroup *memcg;
> - struct mem_cgroup_per_node *mz;
> -
> - if (mem_cgroup_disabled())
> - return lruvec == &pgdat->__lruvec;
> -
> - mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> - memcg = page_memcg(page) ? : root_mem_cgroup;
> -
> - return lruvec->pgdat == pgdat && mz->memcg == memcg;
> -}
> -
> struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
>
> struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
> @@ -1229,14 +1213,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
> return &pgdat->__lruvec;
> }
>
> -static inline bool lruvec_holds_page_lru_lock(struct page *page,
> - struct lruvec *lruvec)
> -{
> - pg_data_t *pgdat = page_pgdat(page);
> -
> - return lruvec == &pgdat->__lruvec;
> -}
> -
> static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
> {
> }
> @@ -1518,6 +1494,13 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
> spin_unlock_irqrestore(&lruvec->lru_lock, flags);
> }
>
> +static inline bool lruvec_holds_page_lru_lock(struct page *page,
> + struct lruvec *lruvec)
> +{
> + return lruvec_pgdat(lruvec) == page_pgdat(page) &&
> + lruvec_memcg(lruvec) == page_memcg(page);
> +}
> +
> /* Don't lock again iff page's lruvec locked */
> static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
> struct lruvec *locked_lruvec)
> --
> 2.11.0
>

2021-04-14 05:09:45

[permalink] [raw]

Subject: Re: [PATCH 2/7] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm

On Tue, Apr 13, 2021 at 02:51:48PM +0800, Muchun Song wrote:
> When mm is NULL, we do not need to hold rcu lock and call css_tryget for
> the root memcg. And we also do not need to check !mm in every loop of
> while. So bail out early when !mm.
>
> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>
> Reviewed-by: Shakeel Butt <[email protected]>

Acked-by: Roman Gushchin <[email protected]>

Nice!

> ---
> mm/memcontrol.c | 21 ++++++++++-----------
> 1 file changed, 10 insertions(+), 11 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f229de925aa5..9cbfff59b171 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -901,20 +901,19 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
> if (mem_cgroup_disabled())
> return NULL;
>
> + /*
> + * Page cache insertions can happen without an
> + * actual mm context, e.g. during disk probing
> + * on boot, loopback IO, acct() writes etc.
> + */
> + if (unlikely(!mm))
> + return root_mem_cgroup;
> +
> rcu_read_lock();
> do {
> - /*
> - * Page cache insertions can happen without an
> - * actual mm context, e.g. during disk probing
> - * on boot, loopback IO, acct() writes etc.
> - */
> - if (unlikely(!mm))
> + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> + if (unlikely(!memcg))
> memcg = root_mem_cgroup;
> - else {
> - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> - if (unlikely(!memcg))
> - memcg = root_mem_cgroup;
> - }
> } while (!css_tryget(&memcg->css));
> rcu_read_unlock();
> return memcg;
> --
> 2.11.0
>

2021-04-14 05:15:46

[permalink] [raw]

Subject: Re: [PATCH 3/7] mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec

On Tue, Apr 13, 2021 at 02:51:49PM +0800, Muchun Song wrote:
> All the callers of mem_cgroup_page_lruvec() just pass page_pgdat(page)
> as the 2nd parameter to it (except isolate_migratepages_block()). But
> for isolate_migratepages_block(), the page_pgdat(page) is also equal
> to the local variable of @pgdat. So mem_cgroup_page_lruvec() do not
> need the pgdat parameter. Just remove it to simplify the code.
>
> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>

Acked-by: Roman Gushchin <[email protected]>

> ---
> include/linux/memcontrol.h | 10 +++++-----
> mm/compaction.c | 2 +-
> mm/memcontrol.c | 9 +++------
> mm/swap.c | 2 +-
> mm/workingset.c | 2 +-
> 5 files changed, 11 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index c960fd49c3e8..4f49865c9958 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -743,13 +743,12 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
> /**
> * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
> * @page: the page
> - * @pgdat: pgdat of the page
> *
> * This function relies on page->mem_cgroup being stable.
> */
> -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
> - struct pglist_data *pgdat)
> +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
> {
> + pg_data_t *pgdat = page_pgdat(page);
> struct mem_cgroup *memcg = page_memcg(page);
>
> VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
> @@ -1223,9 +1222,10 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
> return &pgdat->__lruvec;
> }
>
> -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
> - struct pglist_data *pgdat)
> +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
> {
> + pg_data_t *pgdat = page_pgdat(page);
> +
> return &pgdat->__lruvec;
> }
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> index caa4c36c1db3..e7da342003dd 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1033,7 +1033,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
> if (!TestClearPageLRU(page))
> goto isolate_fail_put;
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + lruvec = mem_cgroup_page_lruvec(page);
>
> /* If we already hold the lock, we can skip some rechecking */
> if (lruvec != locked) {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 9cbfff59b171..1f807448233e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1177,9 +1177,8 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
> struct lruvec *lock_page_lruvec(struct page *page)
> {
> struct lruvec *lruvec;
> - struct pglist_data *pgdat = page_pgdat(page);
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + lruvec = mem_cgroup_page_lruvec(page);
> spin_lock(&lruvec->lru_lock);
>
> lruvec_memcg_debug(lruvec, page);
> @@ -1190,9 +1189,8 @@ struct lruvec *lock_page_lruvec(struct page *page)
> struct lruvec *lock_page_lruvec_irq(struct page *page)
> {
> struct lruvec *lruvec;
> - struct pglist_data *pgdat = page_pgdat(page);
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + lruvec = mem_cgroup_page_lruvec(page);
> spin_lock_irq(&lruvec->lru_lock);
>
> lruvec_memcg_debug(lruvec, page);
> @@ -1203,9 +1201,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page)
> struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
> {
> struct lruvec *lruvec;
> - struct pglist_data *pgdat = page_pgdat(page);
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + lruvec = mem_cgroup_page_lruvec(page);
> spin_lock_irqsave(&lruvec->lru_lock, *flags);
>
> lruvec_memcg_debug(lruvec, page);
> diff --git a/mm/swap.c b/mm/swap.c
> index a75a8265302b..e0d5699213cc 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -313,7 +313,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
>
> void lru_note_cost_page(struct page *page)
> {
> - lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
> + lru_note_cost(mem_cgroup_page_lruvec(page),
> page_is_file_lru(page), thp_nr_pages(page));
> }
>
> diff --git a/mm/workingset.c b/mm/workingset.c
> index b7cdeca5a76d..4f7a306ce75a 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -408,7 +408,7 @@ void workingset_activation(struct page *page)
> memcg = page_memcg_rcu(page);
> if (!mem_cgroup_disabled() && !memcg)
> goto out;
> - lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> + lruvec = mem_cgroup_page_lruvec(page);
> workingset_age_nonresident(lruvec, thp_nr_pages(page));
> out:
> rcu_read_unlock();
> --
> 2.11.0
>

2021-04-14 15:01:46

[permalink] [raw]

Subject: Re: [PATCH 2/7] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm

On Tue 13-04-21 14:51:48, Muchun Song wrote:
> When mm is NULL, we do not need to hold rcu lock and call css_tryget for
> the root memcg. And we also do not need to check !mm in every loop of
> while. So bail out early when !mm.

mem_cgroup_charge and other callers unconditionally drop the reference
so how come this does not underflow reference count?

> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>
> Reviewed-by: Shakeel Butt <[email protected]>
> ---
> mm/memcontrol.c | 21 ++++++++++-----------
> 1 file changed, 10 insertions(+), 11 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f229de925aa5..9cbfff59b171 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -901,20 +901,19 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
> if (mem_cgroup_disabled())
> return NULL;
>
> + /*
> + * Page cache insertions can happen without an
> + * actual mm context, e.g. during disk probing
> + * on boot, loopback IO, acct() writes etc.
> + */
> + if (unlikely(!mm))
> + return root_mem_cgroup;
> +
> rcu_read_lock();
> do {
> - /*
> - * Page cache insertions can happen without an
> - * actual mm context, e.g. during disk probing
> - * on boot, loopback IO, acct() writes etc.
> - */
> - if (unlikely(!mm))
> + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> + if (unlikely(!memcg))
> memcg = root_mem_cgroup;
> - else {
> - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> - if (unlikely(!memcg))
> - memcg = root_mem_cgroup;
> - }
> } while (!css_tryget(&memcg->css));
> rcu_read_unlock();
> return memcg;
> --
> 2.11.0

--
Michal Hocko
SUSE Labs

2021-04-14 15:05:47

[permalink] [raw]

Subject: Re: [PATCH 7/7] mm: vmscan: remove noinline_for_stack

On Tue 13-04-21 14:51:53, Muchun Song wrote:
> The noinline_for_stack is introduced by commit 666356297ec4 ("vmscan:
> set up pagevec as late as possible in shrink_inactive_list()"), its
> purpose is to delay the allocation of pagevec as late as possible to
> save stack memory. But the commit 2bcf88796381 ("mm: take pagevecs off
> reclaim stack") replace pagevecs by lists of pages_to_free. So we do
> not need noinline_for_stack, just remove it (let the compiler decide
> whether to inline).
>
> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>
> Acked-by: Roman Gushchin <[email protected]>
> Reviewed-by: Shakeel Butt <[email protected]>

Acked-by: Michal Hocko <[email protected]>

> ---
> mm/vmscan.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 64bf07cc20f2..e40b21298d77 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2015,8 +2015,8 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
> *
> * Returns the number of pages moved to the given lruvec.
> */
> -static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
> - struct list_head *list)
> +static unsigned int move_pages_to_lru(struct lruvec *lruvec,
> + struct list_head *list)
> {
> int nr_pages, nr_moved = 0;
> LIST_HEAD(pages_to_free);
> @@ -2096,7 +2096,7 @@ static int current_may_throttle(void)
> * shrink_inactive_list() is a helper for shrink_node(). It returns the number
> * of reclaimed pages
> */
> -static noinline_for_stack unsigned long
> +static unsigned long
> shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> struct scan_control *sc, enum lru_list lru)
> {
> --
> 2.11.0

--
Michal Hocko
SUSE Labs

2021-04-14 22:31:06

[permalink] [raw]

Subject: Re: [PATCH 3/7] mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec

On Tue 13-04-21 14:51:49, Muchun Song wrote:
> All the callers of mem_cgroup_page_lruvec() just pass page_pgdat(page)
> as the 2nd parameter to it (except isolate_migratepages_block()). But
> for isolate_migratepages_block(), the page_pgdat(page) is also equal
> to the local variable of @pgdat. So mem_cgroup_page_lruvec() do not
> need the pgdat parameter. Just remove it to simplify the code.
>
> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>

I like this. Two arguments where one can be directly inferred from the
first one can just lead to subtle bugs. In this case it even doesn't
give any advantage for most callers.

Acked-by: Michal Hocko <[email protected]>

> ---
> include/linux/memcontrol.h | 10 +++++-----
> mm/compaction.c | 2 +-
> mm/memcontrol.c | 9 +++------
> mm/swap.c | 2 +-
> mm/workingset.c | 2 +-
> 5 files changed, 11 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index c960fd49c3e8..4f49865c9958 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -743,13 +743,12 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
> /**
> * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
> * @page: the page
> - * @pgdat: pgdat of the page
> *
> * This function relies on page->mem_cgroup being stable.
> */
> -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
> - struct pglist_data *pgdat)
> +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
> {
> + pg_data_t *pgdat = page_pgdat(page);
> struct mem_cgroup *memcg = page_memcg(page);
>
> VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
> @@ -1223,9 +1222,10 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
> return &pgdat->__lruvec;
> }
>
> -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
> - struct pglist_data *pgdat)
> +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
> {
> + pg_data_t *pgdat = page_pgdat(page);
> +
> return &pgdat->__lruvec;
> }
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> index caa4c36c1db3..e7da342003dd 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1033,7 +1033,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
> if (!TestClearPageLRU(page))
> goto isolate_fail_put;
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + lruvec = mem_cgroup_page_lruvec(page);
>
> /* If we already hold the lock, we can skip some rechecking */
> if (lruvec != locked) {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 9cbfff59b171..1f807448233e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1177,9 +1177,8 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
> struct lruvec *lock_page_lruvec(struct page *page)
> {
> struct lruvec *lruvec;
> - struct pglist_data *pgdat = page_pgdat(page);
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + lruvec = mem_cgroup_page_lruvec(page);
> spin_lock(&lruvec->lru_lock);
>
> lruvec_memcg_debug(lruvec, page);
> @@ -1190,9 +1189,8 @@ struct lruvec *lock_page_lruvec(struct page *page)
> struct lruvec *lock_page_lruvec_irq(struct page *page)
> {
> struct lruvec *lruvec;
> - struct pglist_data *pgdat = page_pgdat(page);
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + lruvec = mem_cgroup_page_lruvec(page);
> spin_lock_irq(&lruvec->lru_lock);
>
> lruvec_memcg_debug(lruvec, page);
> @@ -1203,9 +1201,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page)
> struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
> {
> struct lruvec *lruvec;
> - struct pglist_data *pgdat = page_pgdat(page);
>
> - lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + lruvec = mem_cgroup_page_lruvec(page);
> spin_lock_irqsave(&lruvec->lru_lock, *flags);
>
> lruvec_memcg_debug(lruvec, page);
> diff --git a/mm/swap.c b/mm/swap.c
> index a75a8265302b..e0d5699213cc 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -313,7 +313,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
>
> void lru_note_cost_page(struct page *page)
> {
> - lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
> + lru_note_cost(mem_cgroup_page_lruvec(page),
> page_is_file_lru(page), thp_nr_pages(page));
> }
>
> diff --git a/mm/workingset.c b/mm/workingset.c
> index b7cdeca5a76d..4f7a306ce75a 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -408,7 +408,7 @@ void workingset_activation(struct page *page)
> memcg = page_memcg_rcu(page);
> if (!mem_cgroup_disabled() && !memcg)
> goto out;
> - lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
> + lruvec = mem_cgroup_page_lruvec(page);
> workingset_age_nonresident(lruvec, thp_nr_pages(page));
> out:
> rcu_read_unlock();
> --
> 2.11.0

--
Michal Hocko
SUSE Labs

2021-04-14 22:37:42

[permalink] [raw]

Subject: Re: [PATCH 4/7] mm: memcontrol: simplify lruvec_holds_page_lru_lock

On Tue 13-04-21 14:51:50, Muchun Song wrote:
> We already have a helper lruvec_memcg() to get the memcg from lruvec, we
> do not need to do it ourselves in the lruvec_holds_page_lru_lock(). So use
> lruvec_memcg() instead. And if mem_cgroup_disabled() returns false, the
> page_memcg(page) (the LRU pages) cannot be NULL. So remove the odd logic
> of "memcg = page_memcg(page) ? : root_mem_cgroup". And use lruvec_pgdat
> to simplify the code. We can have a single definition for this function
> that works for !CONFIG_MEMCG, CONFIG_MEMCG + mem_cgroup_disabled() and
> CONFIG_MEMCG.

Neat. While you are at it wouldn't it make sesne to rename the function
as well. I do not want to bikeshed but this is really a misnomer. it
doesn't check anything about locking. page_belongs_lruvec?

> Signed-off-by: Muchun Song <[email protected]>
> Acked-by: Johannes Weiner <[email protected]>

Acked-by: Michal Hocko <[email protected]>

> ---
> include/linux/memcontrol.h | 31 +++++++------------------------
> 1 file changed, 7 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 4f49865c9958..38b8d3fb24ff 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -755,22 +755,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
> return mem_cgroup_lruvec(memcg, pgdat);
> }
>
> -static inline bool lruvec_holds_page_lru_lock(struct page *page,
> - struct lruvec *lruvec)
> -{
> - pg_data_t *pgdat = page_pgdat(page);
> - const struct mem_cgroup *memcg;
> - struct mem_cgroup_per_node *mz;
> -
> - if (mem_cgroup_disabled())
> - return lruvec == &pgdat->__lruvec;
> -
> - mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> - memcg = page_memcg(page) ? : root_mem_cgroup;
> -
> - return lruvec->pgdat == pgdat && mz->memcg == memcg;
> -}
> -
> struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
>
> struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
> @@ -1229,14 +1213,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
> return &pgdat->__lruvec;
> }
>
> -static inline bool lruvec_holds_page_lru_lock(struct page *page,
> - struct lruvec *lruvec)
> -{
> - pg_data_t *pgdat = page_pgdat(page);
> -
> - return lruvec == &pgdat->__lruvec;
> -}
> -
> static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
> {
> }
> @@ -1518,6 +1494,13 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
> spin_unlock_irqrestore(&lruvec->lru_lock, flags);
> }
>
> +static inline bool lruvec_holds_page_lru_lock(struct page *page,
> + struct lruvec *lruvec)
> +{
> + return lruvec_pgdat(lruvec) == page_pgdat(page) &&
> + lruvec_memcg(lruvec) == page_memcg(page);
> +}
> +
> /* Don't lock again iff page's lruvec locked */
> static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
> struct lruvec *locked_lruvec)
> --
> 2.11.0

--
Michal Hocko
SUSE Labs

2021-04-14 23:09:49

[permalink] [raw]

Subject: Re: [External] Re: [PATCH 2/7] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm

On Wed, Apr 14, 2021 at 5:24 PM Michal Hocko <[email protected]> wrote:
>
> On Tue 13-04-21 14:51:48, Muchun Song wrote:
> > When mm is NULL, we do not need to hold rcu lock and call css_tryget for
> > the root memcg. And we also do not need to check !mm in every loop of
> > while. So bail out early when !mm.
>
> mem_cgroup_charge and other callers unconditionally drop the reference
> so how come this does not underflow reference count?

For the root memcg, the CSS_NO_REF flag is set, so css_get
and css_put do not get or put reference.

Thanks.

>
> > Signed-off-by: Muchun Song <[email protected]>
> > Acked-by: Johannes Weiner <[email protected]>
> > Reviewed-by: Shakeel Butt <[email protected]>
> > ---
> > mm/memcontrol.c | 21 ++++++++++-----------
> > 1 file changed, 10 insertions(+), 11 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index f229de925aa5..9cbfff59b171 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -901,20 +901,19 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
> > if (mem_cgroup_disabled())
> > return NULL;
> >
> > + /*
> > + * Page cache insertions can happen without an
> > + * actual mm context, e.g. during disk probing
> > + * on boot, loopback IO, acct() writes etc.
> > + */
> > + if (unlikely(!mm))
> > + return root_mem_cgroup;
> > +
> > rcu_read_lock();
> > do {
> > - /*
> > - * Page cache insertions can happen without an
> > - * actual mm context, e.g. during disk probing
> > - * on boot, loopback IO, acct() writes etc.
> > - */
> > - if (unlikely(!mm))
> > + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> > + if (unlikely(!memcg))
> > memcg = root_mem_cgroup;
> > - else {
> > - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> > - if (unlikely(!memcg))
> > - memcg = root_mem_cgroup;
> > - }
> > } while (!css_tryget(&memcg->css));
> > rcu_read_unlock();
> > return memcg;
> > --
> > 2.11.0
>
> --
> Michal Hocko
> SUSE Labs

2021-04-14 23:09:51

[permalink] [raw]

Subject: Re: [External] Re: [PATCH 2/7] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm

On Wed 14-04-21 18:04:35, Muchun Song wrote:
> On Wed, Apr 14, 2021 at 5:24 PM Michal Hocko <[email protected]> wrote:
> >
> > On Tue 13-04-21 14:51:48, Muchun Song wrote:
> > > When mm is NULL, we do not need to hold rcu lock and call css_tryget for
> > > the root memcg. And we also do not need to check !mm in every loop of
> > > while. So bail out early when !mm.
> >
> > mem_cgroup_charge and other callers unconditionally drop the reference
> > so how come this does not underflow reference count?
>
> For the root memcg, the CSS_NO_REF flag is set, so css_get
> and css_put do not get or put reference.

Ohh, right you are. I must have forgot about that special case. I am
pretty sure I (and likely few more) will stumble over that in the future
again. A small comment explaining that the reference can be safely
ignore would be helpful.

Anyway
Acked-by: Michal Hocko <[email protected]>

Thanks!

--
Michal Hocko
SUSE Labs

2021-04-15 03:15:25

[permalink] [raw]

Subject: Re: [External] Re: [PATCH 2/7] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm

On Wed, Apr 14, 2021 at 6:15 PM Michal Hocko <[email protected]> wrote:
>
> On Wed 14-04-21 18:04:35, Muchun Song wrote:
> > On Wed, Apr 14, 2021 at 5:24 PM Michal Hocko <[email protected]> wrote:
> > >
> > > On Tue 13-04-21 14:51:48, Muchun Song wrote:
> > > > When mm is NULL, we do not need to hold rcu lock and call css_tryget for
> > > > the root memcg. And we also do not need to check !mm in every loop of
> > > > while. So bail out early when !mm.
> > >
> > > mem_cgroup_charge and other callers unconditionally drop the reference
> > > so how come this does not underflow reference count?
> >
> > For the root memcg, the CSS_NO_REF flag is set, so css_get
> > and css_put do not get or put reference.
>
> Ohh, right you are. I must have forgot about that special case. I am
> pretty sure I (and likely few more) will stumble over that in the future
> again. A small comment explaining that the reference can be safely
> ignore would be helpful.

OK. Will do.

>
> Anyway
> Acked-by: Michal Hocko <[email protected]>

Thanks.

>
> Thanks!
>
> --
> Michal Hocko
> SUSE Labs

2021-04-15 07:10:52