2019-02-28 18:22:02

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 0/6] mm: memcontrol: clean up the LRU counts tracking

[ Resend #2: Sorry about the spam, I mixed up the header fields in
git-send-email and I don't know who did and didn't receive the
garbled previous attempt.

Resend #1: Rebased on top of the latest mmots. ]

The memcg LRU stats usage is currently a bit messy. Memcg has private
per-zone counters because reclaim needs zone granularity sometimes,
but we also have plenty of users that need to awkwardly sum them up to
node or memcg granularity. Meanwhile the canonical per-memcg vmstats
do not track the LRU counts (NR_INACTIVE_ANON etc.) as you'd expect.

This series enables LRU count tracking in the per-memcg vmstats array
such that lruvec_page_state() and memcg_page_state() work on the enum
node_stat_item items for the LRU counters. Then it converts all the
callers that don't specifically need per-zone numbers over to that.

include/linux/memcontrol.h | 28 ---------------
include/linux/mm_inline.h | 2 +-
include/linux/mmzone.h | 5 ---
mm/memcontrol.c | 85 +++++++++++++++++++++++++-------------------
mm/vmscan.c | 2 +-
mm/workingset.c | 5 +--
6 files changed, 54 insertions(+), 73 deletions(-)




2019-02-28 18:21:12

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 1/6] mm: memcontrol: track LRU counts in the vmstats array

The memcg code currently maintains private per-zone breakdowns of the
LRU counters. This is necessary for reclaim decisions which are still
zone-based, but there are a variety of users of these counters that
only want the aggregate per-lruvec or per-memcg LRU counts, and they
need to painfully sum up the zone counters on each request for that.

These would be better served using the memcg vmstats arrays, which
track VM statistics at the desired scope already. They just don't have
the LRU counts right now.

So to kick off the conversion, begin tracking LRU counts in those.

Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/mm_inline.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 04ec454d44ce..6f2fef7b0784 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -29,7 +29,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);

- __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
+ __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
}
--
2.20.1


2019-02-28 18:22:03

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 5/6] mm: memcontrol: push down mem_cgroup_nr_lru_pages()

mem_cgroup_nr_lru_pages() is just a convenience wrapper around
memcg_page_state() that takes bitmasks of lru indexes and aggregates
the counts for those.

Replace callsites where the bitmask is simple enough with direct
memcg_page_state() call(s).

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/memcontrol.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ad6214b3d20b..76f599fbbbe8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1361,7 +1361,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)

for (i = 0; i < NR_LRU_LISTS; i++)
pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+ K(memcg_page_state(iter, NR_LRU_BASE + i)));

pr_cont("\n");
}
@@ -3016,8 +3016,8 @@ static void accumulate_vmstats(struct mem_cgroup *memcg,
? acc->vmevents_array[i] : i);

for (i = 0; i < NR_LRU_LISTS; i++)
- acc->lru_pages[i] +=
- mem_cgroup_nr_lru_pages(mi, BIT(i));
+ acc->lru_pages[i] += memcg_page_state(mi,
+ NR_LRU_BASE + i);
}
}

@@ -3447,7 +3447,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)

for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+ memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);

/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -3937,8 +3938,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,

/* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
+ *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_page_state(memcg, NR_ACTIVE_FILE);
*pheadroom = PAGE_COUNTER_MAX;

while ((parent = parent_mem_cgroup(memcg))) {
--
2.20.1


2019-02-28 18:22:38

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 2/6] mm: memcontrol: replace zone summing with lruvec_page_state()

Instead of adding up the zone counters, use lruvec_page_state() to get
the node state directly. This is a bit cheaper and more stream-lined.

Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 18 ------------------
mm/memcontrol.c | 2 +-
mm/vmscan.c | 2 +-
3 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 534267947664..5050d281f67d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -517,19 +517,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask);

-static inline
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- struct mem_cgroup_per_node *mz;
- unsigned long nr_pages = 0;
- int zid;
-
- mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- for (zid = 0; zid < MAX_NR_ZONES; zid++)
- nr_pages += mz->lru_zone_size[zid][lru];
- return nr_pages;
-}
-
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
enum lru_list lru, int zone_idx)
@@ -985,11 +972,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
return true;
}

-static inline unsigned long
-mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- return 0;
-}
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
enum lru_list lru, int zone_idx)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5fc2e1a7d4d2..d85a41cfee60 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -737,7 +737,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += mem_cgroup_get_lru_size(lruvec, lru);
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
}
return nr;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ac4806f0f332..cdf8d92e6b89 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
int zid;

if (!mem_cgroup_disabled())
- lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+ lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru);
else
lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);

--
2.20.1


2019-02-28 18:24:40

by Roman Gushchin

[permalink] [raw]
Subject: Re: [PATCH 0/6] mm: memcontrol: clean up the LRU counts tracking

On Thu, Feb 28, 2019 at 11:30:14AM -0500, Johannes Weiner wrote:
> [ Resend #2: Sorry about the spam, I mixed up the header fields in
> git-send-email and I don't know who did and didn't receive the
> garbled previous attempt.
>
> Resend #1: Rebased on top of the latest mmots. ]
>
> The memcg LRU stats usage is currently a bit messy. Memcg has private
> per-zone counters because reclaim needs zone granularity sometimes,
> but we also have plenty of users that need to awkwardly sum them up to
> node or memcg granularity. Meanwhile the canonical per-memcg vmstats
> do not track the LRU counts (NR_INACTIVE_ANON etc.) as you'd expect.
>
> This series enables LRU count tracking in the per-memcg vmstats array
> such that lruvec_page_state() and memcg_page_state() work on the enum
> node_stat_item items for the LRU counters. Then it converts all the
> callers that don't specifically need per-zone numbers over to that.
>
> include/linux/memcontrol.h | 28 ---------------
> include/linux/mm_inline.h | 2 +-
> include/linux/mmzone.h | 5 ---
> mm/memcontrol.c | 85 +++++++++++++++++++++++++-------------------
> mm/vmscan.c | 2 +-
> mm/workingset.c | 5 +--
> 6 files changed, 54 insertions(+), 73 deletions(-)
>
>


The patchset looks very good to me!

Reviewed-by: Roman Gushchin <[email protected]>

Thanks!

2019-02-28 19:40:02

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 4/6] mm: memcontrol: push down mem_cgroup_node_nr_lru_pages()

mem_cgroup_node_nr_lru_pages() is just a convenience wrapper around
lruvec_page_state() that takes bitmasks of lru indexes and aggregates
the counts for those.

Replace callsites where the bitmask is simple enough with direct
lruvec_page_state() calls.

This removes the last extern user of mem_cgroup_node_nr_lru_pages(),
so make that function private again, too.

Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/memcontrol.h | 10 ----------
mm/memcontrol.c | 10 +++++++---
mm/workingset.c | 5 +++--
3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5050d281f67d..57029eefd225 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -514,9 +514,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zid, int nr_pages);

-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask);
-
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
enum lru_list lru, int zone_idx)
@@ -979,13 +976,6 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
return 0;
}

-static inline unsigned long
-mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- return 0;
-}
-
static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e702b67cde41..ad6214b3d20b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -725,7 +725,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask)
{
struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
@@ -1430,11 +1430,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+
+ if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
+ if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON))
return true;
return false;

diff --git a/mm/workingset.c b/mm/workingset.c
index dcb994f2acc2..dbc333a21254 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -427,10 +427,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
#ifdef CONFIG_MEMCG
if (sc->memcg) {
struct lruvec *lruvec;
+ int i;

- pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL);
lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
+ for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
+ pages += lruvec_page_state(lruvec, NR_LRU_BASE + i);
pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
} else
--
2.20.1


2019-02-28 19:40:22

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 3/6] mm: memcontrol: replace node summing with memcg_page_state()

Instead of adding up the node counters, use memcg_page_state() to get
the memcg state directly. This is a bit cheaper and more stream-lined.

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/memcontrol.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d85a41cfee60..e702b67cde41 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -746,10 +746,13 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)
{
unsigned long nr = 0;
- int nid;
+ enum lru_list lru;

- for_each_node_state(nid, N_MEMORY)
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ }
return nr;
}

--
2.20.1


2019-02-28 19:40:31

by Johannes Weiner

[permalink] [raw]
Subject: [PATCH 6/6] mm: memcontrol: quarantine the mem_cgroup_[node_]nr_lru_pages() API

Only memcg_numa_stat_show() uses those wrappers and the lru bitmasks,
group them together.

Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/mmzone.h | 5 ----
mm/memcontrol.c | 67 +++++++++++++++++++++++-------------------
2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2fd4247262e9..4f92d32c26a7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -305,11 +305,6 @@ struct lruvec {
#endif
};

-/* Mask used at gathering information at once (see memcontrol.c) */
-#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
-#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
-#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
-
/* Isolate unmapped file */
#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
/* Isolate for asynchronous migration */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 76f599fbbbe8..84243831b738 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -725,37 +725,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}

-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
- unsigned long nr = 0;
- enum lru_list lru;
-
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
- }
- return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
-{
- unsigned long nr = 0;
- enum lru_list lru;
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
- }
- return nr;
-}
-
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
enum mem_cgroup_events_target target)
{
@@ -3357,6 +3326,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif

#ifdef CONFIG_NUMA
+
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+ unsigned int lru_mask)
+{
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
--
2.20.1


2019-03-22 01:57:40

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 3/6] mm: memcontrol: replace node summing with memcg_page_state()

On Thu, 28 Feb 2019 11:30:17 -0500 Johannes Weiner <[email protected]> wrote:

> Instead of adding up the node counters, use memcg_page_state() to get
> the memcg state directly. This is a bit cheaper and more stream-lined.
>
> ...
>
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -746,10 +746,13 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
> unsigned int lru_mask)
> {
> unsigned long nr = 0;
> - int nid;
> + enum lru_list lru;
>
> - for_each_node_state(nid, N_MEMORY)
> - nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
> + for_each_lru(lru) {
> + if (!(BIT(lru) & lru_mask))
> + continue;
> + nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
> + }

Might be able to use for_each_set_bit(&lru_mnask) here, but it's much
of a muchness.