Most of the memory overhead of a memcg object is due to the memcg stats
maintained by the kernel. Since stats updates happen in performance-critical
codepaths, the stats are maintained per-cpu, and NUMA-specific
stats are maintained per-node * per-cpu. This drastically increases the
overhead on large machines, i.e. those with many CPUs and multiple NUMA
nodes. This patch series tries to reduce the overhead by at least not
allocating the memory for stats which are not memcg specific.
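For a sense of scale (illustrative numbers, not a measurement from this
series): on a machine with 4 NUMA nodes and 256 CPUs, every per-node stat
item needs 4 * 256 = 1024 per-cpu long counters per memcg, doubled again
by the state_prev delta copies, i.e. at least 8 KiB per stat item per
memcg before counting the aggregate and pending copies.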
The main change from v1 is the indirection approach used in this
patchset instead of rearranging the members of node_stat_item.
Shakeel Butt (7):
memcg: reduce memory size of mem_cgroup_events_index
memcg: dynamically allocate lruvec_stats
memcg: reduce memory for the lruvec and memcg stats
memcg: cleanup __mod_memcg_lruvec_state
memcg: pr_warn_once for unexpected events and stats
memcg: use proper type for mod_memcg_state
mm: cleanup WORKINGSET_NODES in workingset
include/linux/memcontrol.h | 75 ++----------
mm/memcontrol.c | 245 ++++++++++++++++++++++++++++++++-----
mm/workingset.c | 7 +-
3 files changed, 230 insertions(+), 97 deletions(-)
--
2.43.0
To decouple the dependency of lruvec_stats on NR_VM_NODE_STAT_ITEMS, we
need to dynamically allocate lruvec_stats in the mem_cgroup_per_node
structure. Also move the definitions of lruvec_stats_percpu and
lruvec_stats and the related functions to memcontrol.c to facilitate
later patches. No functional changes in this patch.
Signed-off-by: Shakeel Butt <[email protected]>
---
include/linux/memcontrol.h | 62 +++------------------------
mm/memcontrol.c | 87 ++++++++++++++++++++++++++++++++------
2 files changed, 81 insertions(+), 68 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9aba0d0462ca..ab8a6e884375 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -83,6 +83,8 @@ enum mem_cgroup_events_target {
struct memcg_vmstats_percpu;
struct memcg_vmstats;
+struct lruvec_stats_percpu;
+struct lruvec_stats;
struct mem_cgroup_reclaim_iter {
struct mem_cgroup *position;
@@ -90,25 +92,6 @@ struct mem_cgroup_reclaim_iter {
unsigned int generation;
};
-struct lruvec_stats_percpu {
- /* Local (CPU and cgroup) state */
- long state[NR_VM_NODE_STAT_ITEMS];
-
- /* Delta calculation for lockless upward propagation */
- long state_prev[NR_VM_NODE_STAT_ITEMS];
-};
-
-struct lruvec_stats {
- /* Aggregated (CPU and subtree) state */
- long state[NR_VM_NODE_STAT_ITEMS];
-
- /* Non-hierarchical (CPU aggregated) state */
- long state_local[NR_VM_NODE_STAT_ITEMS];
-
- /* Pending child counts during tree propagation */
- long state_pending[NR_VM_NODE_STAT_ITEMS];
-};
-
/*
* per-node information in memory controller.
*/
@@ -116,7 +99,7 @@ struct mem_cgroup_per_node {
struct lruvec lruvec;
struct lruvec_stats_percpu __percpu *lruvec_stats_percpu;
- struct lruvec_stats lruvec_stats;
+ struct lruvec_stats *lruvec_stats;
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
@@ -1037,42 +1020,9 @@ static inline void mod_memcg_page_state(struct page *page,
}
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
-
-static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
- enum node_stat_item idx)
-{
- struct mem_cgroup_per_node *pn;
- long x;
-
- if (mem_cgroup_disabled())
- return node_page_state(lruvec_pgdat(lruvec), idx);
-
- pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = READ_ONCE(pn->lruvec_stats.state[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
-}
-
-static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
- enum node_stat_item idx)
-{
- struct mem_cgroup_per_node *pn;
- long x = 0;
-
- if (mem_cgroup_disabled())
- return node_page_state(lruvec_pgdat(lruvec), idx);
-
- pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
-}
+unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
+unsigned long lruvec_page_state_local(struct lruvec *lruvec,
+ enum node_stat_item idx);
void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53769d06053f..5e337ed6c6bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -576,6 +576,60 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
+struct lruvec_stats_percpu {
+ /* Local (CPU and cgroup) state */
+ long state[NR_VM_NODE_STAT_ITEMS];
+
+ /* Delta calculation for lockless upward propagation */
+ long state_prev[NR_VM_NODE_STAT_ITEMS];
+};
+
+struct lruvec_stats {
+ /* Aggregated (CPU and subtree) state */
+ long state[NR_VM_NODE_STAT_ITEMS];
+
+ /* Non-hierarchical (CPU aggregated) state */
+ long state_local[NR_VM_NODE_STAT_ITEMS];
+
+ /* Pending child counts during tree propagation */
+ long state_pending[NR_VM_NODE_STAT_ITEMS];
+};
+
+unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
+{
+ struct mem_cgroup_per_node *pn;
+ long x;
+
+ if (mem_cgroup_disabled())
+ return node_page_state(lruvec_pgdat(lruvec), idx);
+
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ x = READ_ONCE(pn->lruvec_stats->state[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
+unsigned long lruvec_page_state_local(struct lruvec *lruvec,
+ enum node_stat_item idx)
+{
+ struct mem_cgroup_per_node *pn;
+ long x = 0;
+
+ if (mem_cgroup_disabled())
+ return node_page_state(lruvec_pgdat(lruvec), idx);
+
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
PGPGIN,
@@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
+ pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
+ node);
+ if (!pn->lruvec_stats)
+ goto fail;
+
pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
GFP_KERNEL_ACCOUNT);
- if (!pn->lruvec_stats_percpu) {
- kfree(pn);
- return 1;
- }
+ if (!pn->lruvec_stats_percpu)
+ goto fail;
lruvec_init(&pn->lruvec);
pn->memcg = memcg;
memcg->nodeinfo[node] = pn;
return 0;
+fail:
+ kfree(pn->lruvec_stats);
+ kfree(pn);
+ return 1;
}
static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
@@ -5514,6 +5575,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
return;
free_percpu(pn->lruvec_stats_percpu);
+ kfree(pn->lruvec_stats);
kfree(pn);
}
@@ -5866,18 +5928,19 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
for_each_node_state(nid, N_MEMORY) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
- struct mem_cgroup_per_node *ppn = NULL;
+ struct lruvec_stats *lstats = pn->lruvec_stats;
+ struct lruvec_stats *plstats = NULL;
struct lruvec_stats_percpu *lstatc;
if (parent)
- ppn = parent->nodeinfo[nid];
+ plstats = parent->nodeinfo[nid]->lruvec_stats;
lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
- delta = pn->lruvec_stats.state_pending[i];
+ delta = lstats->state_pending[i];
if (delta)
- pn->lruvec_stats.state_pending[i] = 0;
+ lstats->state_pending[i] = 0;
delta_cpu = 0;
v = READ_ONCE(lstatc->state[i]);
@@ -5888,12 +5951,12 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
if (delta_cpu)
- pn->lruvec_stats.state_local[i] += delta_cpu;
+ lstats->state_local[i] += delta_cpu;
if (delta) {
- pn->lruvec_stats.state[i] += delta;
- if (ppn)
- ppn->lruvec_stats.state_pending[i] += delta;
+ lstats->state[i] += delta;
+ if (plstats)
+ plstats->state_pending[i] += delta;
}
}
}
--
2.43.0
mem_cgroup_events_index is a translation table to get the right index of
the memcg-relevant entry for the general vm_event_item. At the moment,
it is defined as an int array. However, on a typical system the max
entry of vm_event_item (NR_VM_EVENT_ITEMS) is 113, so we don't need to
use int as the storage type of the array. For now just use int8_t as the
type, add a BUILD_BUG_ON(), and switch to short once NR_VM_EVENT_ITEMS
touches 127.
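As an aside, a minimal sketch of the encoding the table uses (the lookup
helper mirrors the existing memcg_events_index() logic and is shown only
for illustration; it is not part of this patch):

	/* init_memcg_events() stores i + 1, so 0 means "not a memcg event" */
	for (i = 0; i < NR_MEMCG_EVENTS; ++i)
		mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;

	/* lookups subtract 1, so untracked items come back as -1 */
	static inline int memcg_events_index(enum vm_event_item idx)
	{
		return mem_cgroup_events_index[idx] - 1;
	}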
Signed-off-by: Shakeel Butt <[email protected]>
---
mm/memcontrol.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 602ad5faad4d..53769d06053f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -607,11 +607,14 @@ static const unsigned int memcg_vm_event_stat[] = {
};
#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
-static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
+static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
static void init_memcg_events(void)
{
- int i;
+ int8_t i;
+
+ /* Switch to short once this failure occurs. */
+ BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= 127 /* INT8_MAX */);
for (i = 0; i < NR_MEMCG_EVENTS; ++i)
mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
--
2.43.0
At the moment, the amount of memory allocated for stats-related structs
in the mem_cgroup corresponds to the size of enum node_stat_item.
However, not all fields in enum node_stat_item have corresponding memcg
stats. So, let's use an indirection mechanism similar to the one used
for memcg vmstats management.
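A compressed picture of the mechanism (names taken from the patch below;
the snippet is illustrative):

	/* sparse: state[NR_VM_NODE_STAT_ITEMS], indexed directly by idx  */
	/* dense:  state[NR_MEMCG_NODE_STAT_ITEMS], indexed via the table */
	int i = memcg_stats_index(idx);	/* -1 if idx has no memcg stat */

	if (i >= 0)
		x = READ_ONCE(pn->lruvec_stats->state[i]);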
For a given x86_64 config, the size of stats with and without the patch
is:

	structs size in bytes           w/o     with

	struct lruvec_stats             1128    648
	struct lruvec_stats_percpu      752     432
	struct memcg_vmstats            1832    1352
	struct memcg_vmstats_percpu     1280    960
The memory savings are further compounded by the fact that these structs
are allocated for each cpu and for each node. To be precise, for each
memcg the memory saved would be:

	Memory saved = ((21 * 3 * NR_NODES) + (21 * 2 * NR_NODES * NR_CPUS) +
			(21 * 3) + (21 * 2 * NR_CPUS)) * sizeof(long)

where 21 is the number of fields eliminated.
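As a worked example (hypothetical machine sizes, not from the patch),
with NR_NODES = 2 and NR_CPUS = 64 the formula gives:

	((21 * 3 * 2) + (21 * 2 * 2 * 64) + (21 * 3) + (21 * 2 * 64)) * 8
	= (126 + 5376 + 63 + 2688) * 8
	= 66024 bytes

i.e. roughly 64 KiB saved per memcg.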
Signed-off-by: Shakeel Butt <[email protected]>
---
mm/memcontrol.c | 138 ++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 115 insertions(+), 23 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5e337ed6c6bf..c164bc9b8ed6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -576,35 +576,105 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
+/* Subset of node_stat_item for memcg stats */
+static const unsigned int memcg_node_stat_items[] = {
+ NR_INACTIVE_ANON,
+ NR_ACTIVE_ANON,
+ NR_INACTIVE_FILE,
+ NR_ACTIVE_FILE,
+ NR_UNEVICTABLE,
+ NR_SLAB_RECLAIMABLE_B,
+ NR_SLAB_UNRECLAIMABLE_B,
+ WORKINGSET_REFAULT_ANON,
+ WORKINGSET_REFAULT_FILE,
+ WORKINGSET_ACTIVATE_ANON,
+ WORKINGSET_ACTIVATE_FILE,
+ WORKINGSET_RESTORE_ANON,
+ WORKINGSET_RESTORE_FILE,
+ WORKINGSET_NODERECLAIM,
+ NR_ANON_MAPPED,
+ NR_FILE_MAPPED,
+ NR_FILE_PAGES,
+ NR_FILE_DIRTY,
+ NR_WRITEBACK,
+ NR_SHMEM,
+ NR_SHMEM_THPS,
+ NR_FILE_THPS,
+ NR_ANON_THPS,
+ NR_KERNEL_STACK_KB,
+ NR_PAGETABLE,
+ NR_SECONDARY_PAGETABLE,
+#ifdef CONFIG_SWAP
+ NR_SWAPCACHE,
+#endif
+};
+
+static const unsigned int memcg_stat_items[] = {
+ MEMCG_SWAP,
+ MEMCG_SOCK,
+ MEMCG_PERCPU_B,
+ MEMCG_VMALLOC,
+ MEMCG_KMEM,
+ MEMCG_ZSWAP_B,
+ MEMCG_ZSWAPPED,
+};
+
+#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
+#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
+static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
+
+static void init_memcg_stats(void)
+{
+ int8_t i, j = 0;
+
+ /* Switch to short once this failure occurs. */
+ BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
+
+ for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
+ mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
+
+ for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
+ mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
+}
+
+static inline int memcg_stats_index(int idx)
+{
+ return mem_cgroup_stats_index[idx] - 1;
+}
+
struct lruvec_stats_percpu {
/* Local (CPU and cgroup) state */
- long state[NR_VM_NODE_STAT_ITEMS];
+ long state[NR_MEMCG_NODE_STAT_ITEMS];
/* Delta calculation for lockless upward propagation */
- long state_prev[NR_VM_NODE_STAT_ITEMS];
+ long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
};
struct lruvec_stats {
/* Aggregated (CPU and subtree) state */
- long state[NR_VM_NODE_STAT_ITEMS];
+ long state[NR_MEMCG_NODE_STAT_ITEMS];
/* Non-hierarchical (CPU aggregated) state */
- long state_local[NR_VM_NODE_STAT_ITEMS];
+ long state_local[NR_MEMCG_NODE_STAT_ITEMS];
/* Pending child counts during tree propagation */
- long state_pending[NR_VM_NODE_STAT_ITEMS];
+ long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
};
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
{
struct mem_cgroup_per_node *pn;
- long x;
+ long x = 0;
+ int i;
if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);
- pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = READ_ONCE(pn->lruvec_stats->state[idx]);
+ i = memcg_stats_index(idx);
+ if (i >= 0) {
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ x = READ_ONCE(pn->lruvec_stats->state[i]);
+ }
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -617,12 +687,16 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
{
struct mem_cgroup_per_node *pn;
long x = 0;
+ int i;
if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);
- pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
+ i = memcg_stats_index(idx);
+ if (i >= 0) {
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ x = READ_ONCE(pn->lruvec_stats->state_local[i]);
+ }
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -690,11 +764,11 @@ struct memcg_vmstats_percpu {
/* The above should fit a single cacheline for memcg_rstat_updated() */
/* Local (CPU and cgroup) page state & events */
- long state[MEMCG_NR_STAT];
+ long state[NR_MEMCG_STATS];
unsigned long events[NR_MEMCG_EVENTS];
/* Delta calculation for lockless upward propagation */
- long state_prev[MEMCG_NR_STAT];
+ long state_prev[NR_MEMCG_STATS];
unsigned long events_prev[NR_MEMCG_EVENTS];
/* Cgroup1: threshold notifications & softlimit tree updates */
@@ -704,15 +778,15 @@ struct memcg_vmstats_percpu {
struct memcg_vmstats {
/* Aggregated (CPU and subtree) page state & events */
- long state[MEMCG_NR_STAT];
+ long state[NR_MEMCG_STATS];
unsigned long events[NR_MEMCG_EVENTS];
/* Non-hierarchical (CPU aggregated) page state & events */
- long state_local[MEMCG_NR_STAT];
+ long state_local[NR_MEMCG_STATS];
unsigned long events_local[NR_MEMCG_EVENTS];
/* Pending child counts during tree propagation */
- long state_pending[MEMCG_NR_STAT];
+ long state_pending[NR_MEMCG_STATS];
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
@@ -845,7 +919,13 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
- long x = READ_ONCE(memcg->vmstats->state[idx]);
+ long x;
+ int i = memcg_stats_index(idx);
+
+ if (i < 0)
+ return 0;
+
+ x = READ_ONCE(memcg->vmstats->state[i]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -877,18 +957,25 @@ static int memcg_state_val_in_pages(int idx, int val)
*/
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- if (mem_cgroup_disabled())
+ int i = memcg_stats_index(idx);
+
+ if (mem_cgroup_disabled() || i < 0)
return;
- __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+ __this_cpu_add(memcg->vmstats_percpu->state[i], val);
memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
- long x = READ_ONCE(memcg->vmstats->state_local[idx]);
+ long x;
+ int i = memcg_stats_index(idx);
+
+ if (i < 0)
+ return 0;
+ x = READ_ONCE(memcg->vmstats->state_local[i]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -902,6 +989,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
+ int i = memcg_stats_index(idx);
+
+ if (i < 0)
+ return;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
@@ -931,10 +1022,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
}
/* Update memcg */
- __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+ __this_cpu_add(memcg->vmstats_percpu->state[i], val);
/* Update lruvec */
- __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+ __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
memcg_stats_unlock();
@@ -5702,6 +5793,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
+ init_memcg_stats();
init_memcg_events();
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
@@ -5873,7 +5965,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
- for (i = 0; i < MEMCG_NR_STAT; i++) {
+ for (i = 0; i < NR_MEMCG_STATS; i++) {
/*
* Collect the aggregated propagation counts of groups
* below us. We're in a per-cpu loop here and this is
@@ -5937,7 +6029,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
delta = lstats->state_pending[i];
if (delta)
lstats->state_pending[i] = 0;
--
2.43.0
To reduce the memory usage of the memcg events and stats, the kernel
uses an indirection table and only allocates the stats and events which
are actually used by the memcg code. To make this more robust, let's add
warnings where unexpected stat and event indexes are used.
Signed-off-by: Shakeel Butt <[email protected]>
---
mm/memcontrol.c | 43 ++++++++++++++++++++++++++++++++++---------
1 file changed, 34 insertions(+), 9 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 103e0e53e20a..36145089dcf5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -671,9 +671,11 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
return node_page_state(lruvec_pgdat(lruvec), idx);
i = memcg_stats_index(idx);
- if (i >= 0) {
+ if (likely(i >= 0)) {
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
x = READ_ONCE(pn->lruvec_stats->state[i]);
+ } else {
+ pr_warn_once("%s: stat item index: %d\n", __func__, idx);
}
#ifdef CONFIG_SMP
if (x < 0)
@@ -693,9 +695,11 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
i = memcg_stats_index(idx);
- if (i >= 0) {
+ if (likely(i >= 0)) {
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
x = READ_ONCE(pn->lruvec_stats->state_local[i]);
+ } else {
+ pr_warn_once("%s: stat item index: %d\n", __func__, idx);
}
#ifdef CONFIG_SMP
if (x < 0)
@@ -922,8 +926,10 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
long x;
int i = memcg_stats_index(idx);
- if (i < 0)
+ if (unlikely(i < 0)) {
+ pr_warn_once("%s: stat item index: %d\n", __func__, idx);
return 0;
+ }
x = READ_ONCE(memcg->vmstats->state[i]);
#ifdef CONFIG_SMP
@@ -959,8 +965,13 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
int i = memcg_stats_index(idx);
- if (mem_cgroup_disabled() || i < 0)
+ if (mem_cgroup_disabled())
+ return;
+
+ if (unlikely(i < 0)) {
+ pr_warn_once("%s: stat item index: %d\n", __func__, idx);
return;
+ }
__this_cpu_add(memcg->vmstats_percpu->state[i], val);
memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
@@ -972,8 +983,10 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
long x;
int i = memcg_stats_index(idx);
- if (i < 0)
+ if (unlikely(i < 0)) {
+ pr_warn_once("%s: stat item index: %d\n", __func__, idx);
return 0;
+ }
x = READ_ONCE(memcg->vmstats->state_local[i]);
#ifdef CONFIG_SMP
@@ -991,8 +1004,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
- if (i < 0)
+ if (unlikely(i < 0)) {
+ pr_warn_once("%s: stat item index: %d\n", __func__, idx);
return;
+ }
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
@@ -1107,8 +1122,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
{
int index = memcg_events_index(idx);
- if (mem_cgroup_disabled() || index < 0)
+ if (mem_cgroup_disabled())
+ return;
+
+ if (unlikely(index < 0)) {
+ pr_warn_once("%s: event item index: %d\n", __func__, idx);
return;
+ }
memcg_stats_lock();
__this_cpu_add(memcg->vmstats_percpu->events[index], count);
@@ -1120,8 +1140,11 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
int index = memcg_events_index(event);
- if (index < 0)
+ if (unlikely(index < 0)) {
+ pr_warn_once("%s: event item index: %d\n", __func__, event);
return 0;
+ }
+
return READ_ONCE(memcg->vmstats->events[index]);
}
@@ -1129,8 +1152,10 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
int index = memcg_events_index(event);
- if (index < 0)
+ if (unlikely(index < 0)) {
+ pr_warn_once("%s: event item index: %d\n", __func__, event);
return 0;
+ }
return READ_ONCE(memcg->vmstats->events_local[index]);
}
--
2.43.0
The memcg stats update functions can take an arbitrary integer, but the
only input which makes sense is enum memcg_stat_item, and we don't want
these functions to be called with arbitrary integers. So replace the
parameter type with enum memcg_stat_item, and the compiler will be able
to warn if memcg stat update functions are called with an incorrect
index value.
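A brief illustration of the check this buys (hypothetical call sites;
whether a diagnostic actually fires depends on the compiler and warning
flags, e.g. clang's -Wenum-conversion):

	mod_memcg_state(memcg, MEMCG_SOCK, 1);     /* ok: a memcg_stat_item */
	mod_memcg_state(memcg, NR_FILE_DIRTY, 1);  /* node_stat_item: can now
						    * be flagged as a mismatched
						    * enum type */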
Signed-off-by: Shakeel Butt <[email protected]>
---
include/linux/memcontrol.h | 13 +++++++------
mm/memcontrol.c | 3 ++-
2 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab8a6e884375..73cad69dfb5a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -974,7 +974,8 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
void folio_memcg_lock(struct folio *folio);
void folio_memcg_unlock(struct folio *folio);
-void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
+void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+ int val);
/* try to stablize folio_memcg() for all the pages in a memcg */
static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
@@ -995,7 +996,7 @@ static inline void mem_cgroup_unlock_pages(void)
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
- int idx, int val)
+ enum memcg_stat_item idx, int val)
{
unsigned long flags;
@@ -1005,7 +1006,7 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
}
static inline void mod_memcg_page_state(struct page *page,
- int idx, int val)
+ enum memcg_stat_item idx, int val)
{
struct mem_cgroup *memcg;
@@ -1491,19 +1492,19 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
- int idx,
+ enum memcg_stat_item idx,
int nr)
{
}
static inline void mod_memcg_state(struct mem_cgroup *memcg,
- int idx,
+ enum memcg_stat_item idx,
int nr)
{
}
static inline void mod_memcg_page_state(struct page *page,
- int idx, int val)
+ enum memcg_stat_item idx, int val)
{
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 36145089dcf5..d11536ef59ef 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -961,7 +961,8 @@ static int memcg_state_val_in_pages(int idx, int val)
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
* @val: delta to add to the counter, can be negative
*/
-void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
+void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+ int val)
{
int i = memcg_stats_index(idx);
--
2.43.0
WORKINGSET_NODES is not exposed in the memcg stats, so there is no need
to use the memcg-specific stat update functions for it. If we decide to
expose WORKINGSET_NODES in the memcg stats in the future, we can revert
this patch.
Signed-off-by: Shakeel Butt <[email protected]>
---
mm/workingset.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/workingset.c b/mm/workingset.c
index f2a0ecaf708d..c22adb93622a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -618,6 +618,7 @@ struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
struct address_space *mapping;
+ struct page *page = virt_to_page(node);
/*
* Track non-empty nodes that contain only shadow entries;
@@ -633,12 +634,12 @@ void workingset_update_node(struct xa_node *node)
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
list_lru_add_obj(&shadow_nodes, &node->private_list);
- __inc_lruvec_kmem_state(node, WORKINGSET_NODES);
+ __inc_node_page_state(page, WORKINGSET_NODES);
}
} else {
if (!list_empty(&node->private_list)) {
list_lru_del_obj(&shadow_nodes, &node->private_list);
- __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
+ __dec_node_page_state(page, WORKINGSET_NODES);
}
}
}
@@ -742,7 +743,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
}
list_lru_isolate(lru, item);
- __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
+ __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);
spin_unlock(lru_lock);
--
2.43.0
There are no memcg specific stats for NR_SHMEM_PMDMAPPED and
NR_FILE_PMDMAPPED. Let's remove them.
Signed-off-by: Shakeel Butt <[email protected]>
---
mm/memcontrol.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c164bc9b8ed6..103e0e53e20a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1009,8 +1009,6 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
case NR_ANON_MAPPED:
case NR_FILE_MAPPED:
case NR_ANON_THPS:
- case NR_SHMEM_PMDMAPPED:
- case NR_FILE_PMDMAPPED:
if (WARN_ON_ONCE(!in_task()))
pr_warn("stat item index: %d\n", idx);
break;
--
2.43.0
On Fri, Apr 26, 2024 at 5:37 PM Shakeel Butt <[email protected]> wrote:
>
> mem_cgroup_events_index is a translation table to get the right index of
> the memcg relevant entry for the general vm_event_item. At the moment,
> it is defined as integer array. However on a typical system the max
> entry of vm_event_item (NR_VM_EVENT_ITEMS) is 113, so we don't need to
> use int as storage type of the array. For now just use int8_t as type
> and add a BUILD_BUG_ON() and will switch to short once NR_VM_EVENT_ITEMS
> touches 127.
Any reason not to use uint8_t (or simply u8) and U8_MAX (instead of
the hardcoded 127)?
>
> Signed-off-by: Shakeel Butt <[email protected]>
> ---
> mm/memcontrol.c | 7 +++++--
> 1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 602ad5faad4d..53769d06053f 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -607,11 +607,14 @@ static const unsigned int memcg_vm_event_stat[] = {
> };
>
> #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
> -static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
> +static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
>
> static void init_memcg_events(void)
> {
> - int i;
> + int8_t i;
> +
> + /* Switch to short once this failure occurs. */
> + BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= 127 /* INT8_MAX */);
>
> for (i = 0; i < NR_MEMCG_EVENTS; ++i)
> mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
> --
> 2.43.0
>
>
On Fri, Apr 26, 2024 at 5:37 PM Shakeel Butt <[email protected]> wrote:
>
> At the moment, the amount of memory allocated for stats related structs
> in the mem_cgroup corresponds to the size of enum node_stat_item.
> However not all fields in enum node_stat_item has corresponding memcg
> stats. So, let's use indirection mechanism similar to the one used for
> memcg vmstats management.
Just curious, does the indirection table cause any noticeable
performance impact? My guess is no, but the update paths are usually
very performance sensitive.
I guess lkp will shout at us if there are any noticeable regressions.
On Fri, Apr 26, 2024 at 5:38 PM Shakeel Butt <[email protected]> wrote:
>
> There are no memcg specific stats for NR_SHMEM_PMDMAPPED and
> NR_FILE_PMDMAPPED. Let's remove them.
>
> Signed-off-by: Shakeel Butt <[email protected]>
Reviewed-by: Yosry Ahmed <[email protected]>
> ---
> mm/memcontrol.c | 2 --
> 1 file changed, 2 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c164bc9b8ed6..103e0e53e20a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1009,8 +1009,6 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
> case NR_ANON_MAPPED:
> case NR_FILE_MAPPED:
> case NR_ANON_THPS:
> - case NR_SHMEM_PMDMAPPED:
> - case NR_FILE_PMDMAPPED:
> if (WARN_ON_ONCE(!in_task()))
> pr_warn("stat item index: %d\n", idx);
> break;
> --
> 2.43.0
>
>
On Fri, Apr 26, 2024 at 5:38 PM Shakeel Butt <[email protected]> wrote:
>
> To reduce memory usage by the memcg events and stats, the kernel uses
> indirection table and only allocate stats and events which are being
> used by the memcg code. To make this more robust, let's add warnings
> where unexpected stats and events indexes are used.
>
> Signed-off-by: Shakeel Butt <[email protected]>
> ---
> mm/memcontrol.c | 43 ++++++++++++++++++++++++++++++++++---------
> 1 file changed, 34 insertions(+), 9 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 103e0e53e20a..36145089dcf5 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -671,9 +671,11 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
> return node_page_state(lruvec_pgdat(lruvec), idx);
>
> i = memcg_stats_index(idx);
> - if (i >= 0) {
> + if (likely(i >= 0)) {
> pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> x = READ_ONCE(pn->lruvec_stats->state[i]);
> + } else {
> + pr_warn_once("%s: stat item index: %d\n", __func__, idx);
> }
Can we make these more compact by using WARN_ON_ONCE() instead:
if (WARN_ON_ONCE(i < 0))
return 0;
I guess the advantage of using pr_warn_once() is that we get to print
the exact stat index, but the stack trace from WARN_ON_ONCE() should
make it obvious in most cases AFAICT.
No strong opinions either way.
On Fri, Apr 26, 2024 at 05:42:48PM -0700, Yosry Ahmed wrote:
> On Fri, Apr 26, 2024 at 5:37 PM Shakeel Butt <[email protected]> wrote:
> >
> > mem_cgroup_events_index is a translation table to get the right index of
> > the memcg relevant entry for the general vm_event_item. At the moment,
> > it is defined as integer array. However on a typical system the max
> > entry of vm_event_item (NR_VM_EVENT_ITEMS) is 113, so we don't need to
> > use int as storage type of the array. For now just use int8_t as type
> > and add a BUILD_BUG_ON() and will switch to short once NR_VM_EVENT_ITEMS
> > touches 127.
>
> Any reason not to use uint8_t (or simply u8) and U8_MAX (instead of
> the hardcoded 127)?
>
Just to keep the error check simple, i.e. (index < 0). If we hit 127,
then we can switch to uint8_t and S8_MAX as the error value. Though 127
should be replaced by S8_MAX; somehow I was grep'ing for INT8*MAX
variants. Anyway, if there is more support for uint8_t I will change it,
otherwise I will keep it as is.
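A sketch of the trade-off being described (assuming the existing
"index - 1" encoding; the sentinel choice for the uint8_t variant is
hypothetical):

	/* int8_t: absent entries decode to -1, one signed compare */
	if (index < 0)
		return;

	/* uint8_t: a distinct sentinel such as U8_MAX would be needed */
	if (index == U8_MAX)
		return;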
On Fri, Apr 26, 2024 at 05:51:23PM -0700, Yosry Ahmed wrote:
> On Fri, Apr 26, 2024 at 5:37 PM Shakeel Butt <[email protected]> wrote:
> >
> > At the moment, the amount of memory allocated for stats related structs
> > in the mem_cgroup corresponds to the size of enum node_stat_item.
> > However not all fields in enum node_stat_item has corresponding memcg
> > stats. So, let's use indirection mechanism similar to the one used for
> > memcg vmstats management.
>
> Just curious, does the indirection table cause any noticeable
> performance impact? My guess is no, but the update paths are usually
> very performance sensitive.
>
> I guess lkp will shout at us if there are any noticeable regressions.
Yeah, and that is the reason I made the indirection table smaller (i.e.
int8_t): to keep more entries in a single cacheline.
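For concreteness (back-of-the-envelope, assuming 64-byte cachelines and
NR_VM_EVENT_ITEMS = 113 as in the commit message): as an int array the
table spans 113 * 4 = 452 bytes, i.e. 8 cachelines, while as int8_t it
is 113 bytes, i.e. 2 cachelines.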
On Fri, Apr 26, 2024 at 05:58:16PM -0700, Yosry Ahmed wrote:
> On Fri, Apr 26, 2024 at 5:38 PM Shakeel Butt <[email protected]> wrote:
> >
> > To reduce memory usage by the memcg events and stats, the kernel uses
> > indirection table and only allocate stats and events which are being
> > used by the memcg code. To make this more robust, let's add warnings
> > where unexpected stats and events indexes are used.
> >
> > Signed-off-by: Shakeel Butt <[email protected]>
> > ---
> > mm/memcontrol.c | 43 ++++++++++++++++++++++++++++++++++---------
> > 1 file changed, 34 insertions(+), 9 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 103e0e53e20a..36145089dcf5 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -671,9 +671,11 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
> > return node_page_state(lruvec_pgdat(lruvec), idx);
> >
> > i = memcg_stats_index(idx);
> > - if (i >= 0) {
> > + if (likely(i >= 0)) {
> > pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > x = READ_ONCE(pn->lruvec_stats->state[i]);
> > + } else {
> > + pr_warn_once("%s: stat item index: %d\n", __func__, idx);
> > }
>
> Can we make these more compact by using WARN_ON_ONCE() instead:
>
> if (WARN_ON_ONCE(i < 0))
> return 0;
>
> I guess the advantage of using pr_warn_once() is that we get to print
> the exact stat index, but the stack trace from WARN_ON_ONCE() should
> make it obvious in most cases AFAICT.
>
> No strong opinions either way.
One reason I used pr_warn_once() over WARN_ON_ONCE() is the syzbot
trigger. No need to trip the bot over this error condition.
On Fri, Apr 26, 2024 at 6:16 PM Shakeel Butt <[email protected]> wrote:
>
> On Fri, Apr 26, 2024 at 05:51:23PM -0700, Yosry Ahmed wrote:
> > On Fri, Apr 26, 2024 at 5:37 PM Shakeel Butt <[email protected]> wrote:
> > >
> > > At the moment, the amount of memory allocated for stats related structs
> > > in the mem_cgroup corresponds to the size of enum node_stat_item.
> > > However not all fields in enum node_stat_item has corresponding memcg
> > > stats. So, let's use indirection mechanism similar to the one used for
> > > memcg vmstats management.
> >
> > Just curious, does the indirection table cause any noticeable
> > performance impact? My guess is no, but the update paths are usually
> > very performance sensitive.
> >
> > I guess lkp will shout at us if there are any noticeable regressions.
>
> Yeah and that is the reason I made the indirection table smaller (i.e.
> int8_t) to keep more entries in a single cacheline.
Thanks for pointing this out, probably worth including in the commit log :)
On Fri, Apr 26, 2024 at 5:37 PM Shakeel Butt <[email protected]> wrote:
>
> To decouple the dependency of lruvec_stats on NR_VM_NODE_STAT_ITEMS, we
> need to dynamically allocate lruvec_stats in the mem_cgroup_per_node
> structure. Also move the definition of lruvec_stats_percpu and
> lruvec_stats and related functions to the memcontrol.c to facilitate
> later patches. No functional changes in the patch.
>
> Signed-off-by: Shakeel Butt <[email protected]>
Reviewed-by: Yosry Ahmed <[email protected]>
On Fri, Apr 26, 2024 at 06:18:13PM -0700, Shakeel Butt wrote:
> On Fri, Apr 26, 2024 at 05:58:16PM -0700, Yosry Ahmed wrote:
> > On Fri, Apr 26, 2024 at 5:38 PM Shakeel Butt <[email protected]> wrote:
> > >
> > > To reduce memory usage by the memcg events and stats, the kernel uses
> > > indirection table and only allocate stats and events which are being
> > > used by the memcg code. To make this more robust, let's add warnings
> > > where unexpected stats and events indexes are used.
> > >
> > > Signed-off-by: Shakeel Butt <[email protected]>
> > > ---
> > > mm/memcontrol.c | 43 ++++++++++++++++++++++++++++++++++---------
> > > 1 file changed, 34 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 103e0e53e20a..36145089dcf5 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -671,9 +671,11 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
> > > return node_page_state(lruvec_pgdat(lruvec), idx);
> > >
> > > i = memcg_stats_index(idx);
> > > - if (i >= 0) {
> > > + if (likely(i >= 0)) {
> > > pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > > x = READ_ONCE(pn->lruvec_stats->state[i]);
> > > + } else {
> > > + pr_warn_once("%s: stat item index: %d\n", __func__, idx);
> > > }
> >
> > Can we make these more compact by using WARN_ON_ONCE() instead:
> >
> > if (WARN_ON_ONCE(i < 0))
> > return 0;
> >
> > I guess the advantage of using pr_warn_once() is that we get to print
> > the exact stat index, but the stack trace from WARN_ON_ONCE() should
> > make it obvious in most cases AFAICT.
if (WARN_ONCE(i < 0, "stat item %d not in memcg_node_stat_items\n", i))
return 0;
should work?
> > No strong opinions either way.
>
> One reason I used pr_warn_once() over WARN_ON_ONCE() is the syzbot
> trigger. No need to trip the bot over this error condition.
The warn splat is definitely quite verbose. But I think that would
only be annoying initially, in case a site was missed. Down the line,
it seems helpful to have this stand out to somebody who is trying to
add a new cgroup stat and forgets to update the right enums.
On Fri, Apr 26, 2024 at 05:37:27PM -0700, Shakeel Butt wrote:
> mem_cgroup_events_index is a translation table to get the right index of
> the memcg relevant entry for the general vm_event_item. At the moment,
> it is defined as integer array. However on a typical system the max
> entry of vm_event_item (NR_VM_EVENT_ITEMS) is 113, so we don't need to
> use int as storage type of the array. For now just use int8_t as type
> and add a BUILD_BUG_ON() and will switch to short once NR_VM_EVENT_ITEMS
> touches 127.
>
> Signed-off-by: Shakeel Butt <[email protected]>
Reviewed-by: Roman Gushchin <[email protected]>
On Fri, Apr 26, 2024 at 05:37:30PM -0700, Shakeel Butt wrote:
> There are no memcg specific stats for NR_SHMEM_PMDMAPPED and
> NR_FILE_PMDMAPPED. Let's remove them.
>
> Signed-off-by: Shakeel Butt <[email protected]>
Reviewed-by: Roman Gushchin <[email protected]>
On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote:
> To decouple the dependency of lruvec_stats on NR_VM_NODE_STAT_ITEMS, we
> need to dynamically allocate lruvec_stats in the mem_cgroup_per_node
> structure. Also move the definition of lruvec_stats_percpu and
> lruvec_stats and related functions to the memcontrol.c to facilitate
> later patches. No functional changes in the patch.
>
> Signed-off-by: Shakeel Butt <[email protected]>
> ---
> include/linux/memcontrol.h | 62 +++------------------------
> mm/memcontrol.c | 87 ++++++++++++++++++++++++++++++++------
> 2 files changed, 81 insertions(+), 68 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 9aba0d0462ca..ab8a6e884375 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -83,6 +83,8 @@ enum mem_cgroup_events_target {
>
> struct memcg_vmstats_percpu;
> struct memcg_vmstats;
> +struct lruvec_stats_percpu;
> +struct lruvec_stats;
>
> struct mem_cgroup_reclaim_iter {
> struct mem_cgroup *position;
> @@ -90,25 +92,6 @@ struct mem_cgroup_reclaim_iter {
> unsigned int generation;
> };
>
> -struct lruvec_stats_percpu {
> - /* Local (CPU and cgroup) state */
> - long state[NR_VM_NODE_STAT_ITEMS];
> -
> - /* Delta calculation for lockless upward propagation */
> - long state_prev[NR_VM_NODE_STAT_ITEMS];
> -};
> -
> -struct lruvec_stats {
> - /* Aggregated (CPU and subtree) state */
> - long state[NR_VM_NODE_STAT_ITEMS];
> -
> - /* Non-hierarchical (CPU aggregated) state */
> - long state_local[NR_VM_NODE_STAT_ITEMS];
> -
> - /* Pending child counts during tree propagation */
> - long state_pending[NR_VM_NODE_STAT_ITEMS];
> -};
> -
> /*
> * per-node information in memory controller.
> */
> @@ -116,7 +99,7 @@ struct mem_cgroup_per_node {
> struct lruvec lruvec;
>
> struct lruvec_stats_percpu __percpu *lruvec_stats_percpu;
> - struct lruvec_stats lruvec_stats;
> + struct lruvec_stats *lruvec_stats;
>
> unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
>
> @@ -1037,42 +1020,9 @@ static inline void mod_memcg_page_state(struct page *page,
> }
>
> unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
> -
> -static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
> - enum node_stat_item idx)
> -{
> - struct mem_cgroup_per_node *pn;
> - long x;
> -
> - if (mem_cgroup_disabled())
> - return node_page_state(lruvec_pgdat(lruvec), idx);
> -
> - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> - x = READ_ONCE(pn->lruvec_stats.state[idx]);
> -#ifdef CONFIG_SMP
> - if (x < 0)
> - x = 0;
> -#endif
> - return x;
> -}
> -
> -static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> - enum node_stat_item idx)
> -{
> - struct mem_cgroup_per_node *pn;
> - long x = 0;
> -
> - if (mem_cgroup_disabled())
> - return node_page_state(lruvec_pgdat(lruvec), idx);
> -
> - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> - x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
> -#ifdef CONFIG_SMP
> - if (x < 0)
> - x = 0;
> -#endif
> - return x;
> -}
> +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
> +unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> + enum node_stat_item idx);
>
> void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
> void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 53769d06053f..5e337ed6c6bf 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,6 +576,60 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
> return mz;
> }
>
> +struct lruvec_stats_percpu {
> + /* Local (CPU and cgroup) state */
> + long state[NR_VM_NODE_STAT_ITEMS];
> +
> + /* Delta calculation for lockless upward propagation */
> + long state_prev[NR_VM_NODE_STAT_ITEMS];
> +};
> +
> +struct lruvec_stats {
> + /* Aggregated (CPU and subtree) state */
> + long state[NR_VM_NODE_STAT_ITEMS];
> +
> + /* Non-hierarchical (CPU aggregated) state */
> + long state_local[NR_VM_NODE_STAT_ITEMS];
> +
> + /* Pending child counts during tree propagation */
> + long state_pending[NR_VM_NODE_STAT_ITEMS];
> +};
> +
> +unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
> +{
> + struct mem_cgroup_per_node *pn;
> + long x;
> +
> + if (mem_cgroup_disabled())
> + return node_page_state(lruvec_pgdat(lruvec), idx);
> +
> + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> + x = READ_ONCE(pn->lruvec_stats->state[idx]);
> +#ifdef CONFIG_SMP
> + if (x < 0)
> + x = 0;
> +#endif
> + return x;
> +}
> +
> +unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> + enum node_stat_item idx)
> +{
> + struct mem_cgroup_per_node *pn;
> + long x = 0;
> +
> + if (mem_cgroup_disabled())
> + return node_page_state(lruvec_pgdat(lruvec), idx);
> +
> + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> + x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> +#ifdef CONFIG_SMP
> + if (x < 0)
> + x = 0;
> +#endif
Not directly related to your change, but do we still need it? And if
yes, do we really care about the !CONFIG_SMP case enough to justify
these #ifdefs?
> + return x;
> +}
> +
> /* Subset of vm_event_item to report for memcg event stats */
> static const unsigned int memcg_vm_event_stat[] = {
> PGPGIN,
> @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> if (!pn)
> return 1;
>
> + pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> + node);
Why not GFP_KERNEL_ACCOUNT?
> + if (!pn->lruvec_stats)
> + goto fail;
> +
> pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
> GFP_KERNEL_ACCOUNT);
> - if (!pn->lruvec_stats_percpu) {
> - kfree(pn);
> - return 1;
> - }
> + if (!pn->lruvec_stats_percpu)
> + goto fail;
>
> lruvec_init(&pn->lruvec);
> pn->memcg = memcg;
>
> memcg->nodeinfo[node] = pn;
> return 0;
> +fail:
> + kfree(pn->lruvec_stats);
> + kfree(pn);
> + return 1;
> }
>
> static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> @@ -5514,6 +5575,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> return;
>
> free_percpu(pn->lruvec_stats_percpu);
> + kfree(pn->lruvec_stats);
> kfree(pn);
> }
>
> @@ -5866,18 +5928,19 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
> for_each_node_state(nid, N_MEMORY) {
> struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
> - struct mem_cgroup_per_node *ppn = NULL;
> + struct lruvec_stats *lstats = pn->lruvec_stats;
> + struct lruvec_stats *plstats = NULL;
> struct lruvec_stats_percpu *lstatc;
>
> if (parent)
> - ppn = parent->nodeinfo[nid];
> + plstats = parent->nodeinfo[nid]->lruvec_stats;
>
> lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
>
> for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> - delta = pn->lruvec_stats.state_pending[i];
> + delta = lstats->state_pending[i];
> if (delta)
> - pn->lruvec_stats.state_pending[i] = 0;
> + lstats->state_pending[i] = 0;
>
> delta_cpu = 0;
> v = READ_ONCE(lstatc->state[i]);
> @@ -5888,12 +5951,12 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
> }
>
> if (delta_cpu)
> - pn->lruvec_stats.state_local[i] += delta_cpu;
> + lstats->state_local[i] += delta_cpu;
>
> if (delta) {
> - pn->lruvec_stats.state[i] += delta;
> - if (ppn)
> - ppn->lruvec_stats.state_pending[i] += delta;
> + lstats->state[i] += delta;
> + if (plstats)
> + plstats->state_pending[i] += delta;
> }
> }
> }
> --
> 2.43.0
>
On Fri, Apr 26, 2024 at 05:37:29PM -0700, Shakeel Butt wrote:
> At the moment, the amount of memory allocated for stats related structs
> in the mem_cgroup corresponds to the size of enum node_stat_item.
> However not all fields in enum node_stat_item has corresponding memcg
> stats. So, let's use indirection mechanism similar to the one used for
> memcg vmstats management.
>
> For a given x86_64 config, the size of stats with and without patch is:
>
> structs size in bytes w/o with
>
> struct lruvec_stats 1128 648
> struct lruvec_stats_percpu 752 432
> struct memcg_vmstats 1832 1352
> struct memcg_vmstats_percpu 1280 960
>
> The memory savings is further compounded by the fact that these structs
> are allocated for each cpu and for each node. To be precise, for each
> memcg the memory saved would be:
>
> Memory saved = ((21 * 3 * NR_NODES) + (21 * 2 * NR_NODS * NR_CPUS) +
> (21 * 3) + (21 * 2 * NR_CPUS)) * sizeof(long)
>
> Where 21 is the number of fields eliminated.
Nice savings!
>
> Signed-off-by: Shakeel Butt <[email protected]>
> ---
> mm/memcontrol.c | 138 ++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 115 insertions(+), 23 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 5e337ed6c6bf..c164bc9b8ed6 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,35 +576,105 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
> return mz;
> }
>
> +/* Subset of node_stat_item for memcg stats */
> +static const unsigned int memcg_node_stat_items[] = {
> + NR_INACTIVE_ANON,
> + NR_ACTIVE_ANON,
> + NR_INACTIVE_FILE,
> + NR_ACTIVE_FILE,
> + NR_UNEVICTABLE,
> + NR_SLAB_RECLAIMABLE_B,
> + NR_SLAB_UNRECLAIMABLE_B,
> + WORKINGSET_REFAULT_ANON,
> + WORKINGSET_REFAULT_FILE,
> + WORKINGSET_ACTIVATE_ANON,
> + WORKINGSET_ACTIVATE_FILE,
> + WORKINGSET_RESTORE_ANON,
> + WORKINGSET_RESTORE_FILE,
> + WORKINGSET_NODERECLAIM,
> + NR_ANON_MAPPED,
> + NR_FILE_MAPPED,
> + NR_FILE_PAGES,
> + NR_FILE_DIRTY,
> + NR_WRITEBACK,
> + NR_SHMEM,
> + NR_SHMEM_THPS,
> + NR_FILE_THPS,
> + NR_ANON_THPS,
> + NR_KERNEL_STACK_KB,
> + NR_PAGETABLE,
> + NR_SECONDARY_PAGETABLE,
> +#ifdef CONFIG_SWAP
> + NR_SWAPCACHE,
> +#endif
> +};
> +
> +static const unsigned int memcg_stat_items[] = {
> + MEMCG_SWAP,
> + MEMCG_SOCK,
> + MEMCG_PERCPU_B,
> + MEMCG_VMALLOC,
> + MEMCG_KMEM,
> + MEMCG_ZSWAP_B,
> + MEMCG_ZSWAPPED,
> +};
> +
> +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> +
> +static void init_memcg_stats(void)
> +{
> + int8_t i, j = 0;
> +
> + /* Switch to short once this failure occurs. */
> + BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
> +
> + for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
> + mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
> +
> + for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
> + mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
> +}
> +
> +static inline int memcg_stats_index(int idx)
> +{
> + return mem_cgroup_stats_index[idx] - 1;
> +}
Hm, I'm slightly worried about the performance penalty due to the
increased cache footprint. Can't we have some formula to translate idx
to memcg_idx instead of a translation table?
If it requires a rearrangement of items, we can add a translation table
on the read side to preserve the visible order in procfs/sysfs.
Or am I overthinking it and the real difference is negligible?
Thanks!
On Fri, Apr 26, 2024 at 05:37:31PM -0700, Shakeel Butt wrote:
> To reduce memory usage by the memcg events and stats, the kernel uses
> indirection table and only allocate stats and events which are being
> used by the memcg code. To make this more robust, let's add warnings
> where unexpected stats and events indexes are used.
>
> Signed-off-by: Shakeel Butt <[email protected]>
> ---
> mm/memcontrol.c | 43 ++++++++++++++++++++++++++++++++++---------
> 1 file changed, 34 insertions(+), 9 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 103e0e53e20a..36145089dcf5 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -671,9 +671,11 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
> return node_page_state(lruvec_pgdat(lruvec), idx);
>
> i = memcg_stats_index(idx);
> - if (i >= 0) {
> + if (likely(i >= 0)) {
> pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> x = READ_ONCE(pn->lruvec_stats->state[i]);
> + } else {
> + pr_warn_once("%s: stat item index: %d\n", __func__, idx);
> }
I think it's generally CONFIG_DEBUG_VM material. Do we have some extra
concerns here?
Having pr_warn_on_once() would be nice here.
On Fri, Apr 26, 2024 at 05:37:33PM -0700, Shakeel Butt wrote:
> WORKINGSET_NODES is not exposed in the memcg stats and thus there is no
> need to use the memcg specific stat update functions for it. In future
> if we decide to expose WORKINGSET_NODES in the memcg stats, we can
> revert this patch.
>
> Signed-off-by: Shakeel Butt <[email protected]>
Reviewed-by: Roman Gushchin <[email protected]>
Thanks for the series, Shakeel!
On Fri, Apr 26, 2024 at 5:37 PM Shakeel Butt <[email protected]> wrote:
>
> At the moment, the amount of memory allocated for stats related structs
> in the mem_cgroup corresponds to the size of enum node_stat_item.
> However not all fields in enum node_stat_item has corresponding memcg
> stats. So, let's use indirection mechanism similar to the one used for
> memcg vmstats management.
>
> For a given x86_64 config, the size of stats with and without patch is:
>
> structs size in bytes w/o with
>
> struct lruvec_stats 1128 648
> struct lruvec_stats_percpu 752 432
> struct memcg_vmstats 1832 1352
> struct memcg_vmstats_percpu 1280 960
>
> The memory savings is further compounded by the fact that these structs
> are allocated for each cpu and for each node. To be precise, for each
> memcg the memory saved would be:
>
> Memory saved = ((21 * 3 * NR_NODES) + (21 * 2 * NR_NODS * NR_CPUS) +
> (21 * 3) + (21 * 2 * NR_CPUS)) * sizeof(long)
>
> Where 21 is the number of fields eliminated.
>
> Signed-off-by: Shakeel Butt <[email protected]>
> ---
> mm/memcontrol.c | 138 ++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 115 insertions(+), 23 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 5e337ed6c6bf..c164bc9b8ed6 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,35 +576,105 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
> return mz;
> }
>
> +/* Subset of node_stat_item for memcg stats */
> +static const unsigned int memcg_node_stat_items[] = {
> + NR_INACTIVE_ANON,
> + NR_ACTIVE_ANON,
> + NR_INACTIVE_FILE,
> + NR_ACTIVE_FILE,
> + NR_UNEVICTABLE,
> + NR_SLAB_RECLAIMABLE_B,
> + NR_SLAB_UNRECLAIMABLE_B,
> + WORKINGSET_REFAULT_ANON,
> + WORKINGSET_REFAULT_FILE,
> + WORKINGSET_ACTIVATE_ANON,
> + WORKINGSET_ACTIVATE_FILE,
> + WORKINGSET_RESTORE_ANON,
> + WORKINGSET_RESTORE_FILE,
> + WORKINGSET_NODERECLAIM,
> + NR_ANON_MAPPED,
> + NR_FILE_MAPPED,
> + NR_FILE_PAGES,
> + NR_FILE_DIRTY,
> + NR_WRITEBACK,
> + NR_SHMEM,
> + NR_SHMEM_THPS,
> + NR_FILE_THPS,
> + NR_ANON_THPS,
> + NR_KERNEL_STACK_KB,
> + NR_PAGETABLE,
> + NR_SECONDARY_PAGETABLE,
> +#ifdef CONFIG_SWAP
> + NR_SWAPCACHE,
> +#endif
> +};
> +
> +static const unsigned int memcg_stat_items[] = {
> + MEMCG_SWAP,
> + MEMCG_SOCK,
> + MEMCG_PERCPU_B,
> + MEMCG_VMALLOC,
> + MEMCG_KMEM,
> + MEMCG_ZSWAP_B,
> + MEMCG_ZSWAPPED,
> +};
Unsigned for these? All the values are positive now, but I don't think
we'll get a build warning if a negative one ever got added, just a
crash or corruption. BUG_ON in init_memcg_stats if
memcg_stat_items[i] < 0?
On Mon, Apr 29, 2024 at 08:50:11AM -0700, Roman Gushchin wrote:
> On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote:
[...]
> > +unsigned long lruvec_page_state_local(struct lruvec *lruvec,
> > + enum node_stat_item idx)
> > +{
> > + struct mem_cgroup_per_node *pn;
> > + long x = 0;
> > +
> > + if (mem_cgroup_disabled())
> > + return node_page_state(lruvec_pgdat(lruvec), idx);
> > +
> > + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > + x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> > +#ifdef CONFIG_SMP
> > + if (x < 0)
> > + x = 0;
> > +#endif
>
> Not directly related to your change, but do we still need it? And if yes,
> do we really care about !CONFIG_SMP case enough to justify these #ifdefs?
>
That's a good question, and I think this is still needed. Particularly
on large machines with a large number of CPUs, we can have a situation
where the flusher is flushing CPU 100 while, in parallel, some workload
allocates a lot of pages on, let's say, CPU 0 and frees them on CPU 200.
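To make that concrete (illustrative numbers): say the workload allocates
1000 pages on CPU 0 and frees them on CPU 200. If the flusher folds in
CPU 200's per-cpu delta (-1000) before it reaches CPU 0's (+1000), the
aggregated counter transiently reads -1000, and since the readers return
unsigned long, the clamp is what keeps them from reporting a huge bogus
value.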
> > + return x;
> > +}
> > +
> > /* Subset of vm_event_item to report for memcg event stats */
> > static const unsigned int memcg_vm_event_stat[] = {
> > PGPGIN,
> > @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> > if (!pn)
> > return 1;
> >
> > + pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> > + node);
>
> Why not GFP_KERNEL_ACCOUNT?
>
Previously struct lruvec_stats was part of struct mem_cgroup_per_node,
and we use GFP_KERNEL to allocate struct mem_cgroup_per_node. I kept the
behavior the same, and if we want to switch to GFP_KERNEL_ACCOUNT, I
think it should be a separate patch.
Thanks for the review.
Shakeel
On Sat, Apr 27, 2024 at 10:22:34AM -0400, Johannes Weiner wrote:
> On Fri, Apr 26, 2024 at 06:18:13PM -0700, Shakeel Butt wrote:
> > On Fri, Apr 26, 2024 at 05:58:16PM -0700, Yosry Ahmed wrote:
> > > On Fri, Apr 26, 2024 at 5:38 PM Shakeel Butt <[email protected]> wrote:
[...]
> > >
> > > Can we make these more compact by using WARN_ON_ONCE() instead:
> > >
> > > if (WARN_ON_ONCE(i < 0))
> > > return 0;
> > >
> > > I guess the advantage of using pr_warn_once() is that we get to print
> > > the exact stat index, but the stack trace from WARN_ON_ONCE() should
> > > make it obvious in most cases AFAICT.
>
> if (WARN_ONCE(i < 0, "stat item %d not in memcg_node_stat_items\n", i))
> return 0;
>
> should work?
>
> > > No strong opinions either way.
> >
> > One reason I used pr_warn_once() over WARN_ON_ONCE() is the syzbot
> > trigger. No need to trip the bot over this error condition.
>
> The warn splat is definitely quite verbose. But I think that would
> only be annoying initially, in case a site was missed. Down the line,
> it seems helpful to have this stand out to somebody who is trying to
> add a new cgroup stat and forgets to update the right enums.
Sounds good to me. I will change it to WARN_ONCE().
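E.g. the read side would then look roughly like this (sketch, not the
final diff):

	i = memcg_stats_index(idx);
	if (WARN_ONCE(i < 0, "%s: stat item index: %d\n", __func__, idx))
		return 0;
	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	x = READ_ONCE(pn->lruvec_stats->state[i]);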
On Mon, Apr 29, 2024 at 09:06:23AM -0700, Roman Gushchin wrote:
> On Fri, Apr 26, 2024 at 05:37:31PM -0700, Shakeel Butt wrote:
> > To reduce memory usage by the memcg events and stats, the kernel uses
> > an indirection table and only allocates the stats and events which are
> > actually used by the memcg code. To make this more robust, let's add
> > warnings where unexpected stat and event indexes are used.
> >
> > Signed-off-by: Shakeel Butt <[email protected]>
> > ---
> > mm/memcontrol.c | 43 ++++++++++++++++++++++++++++++++++---------
> > 1 file changed, 34 insertions(+), 9 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 103e0e53e20a..36145089dcf5 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -671,9 +671,11 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
> > return node_page_state(lruvec_pgdat(lruvec), idx);
> >
> > i = memcg_stats_index(idx);
> > - if (i >= 0) {
> > + if (likely(i >= 0)) {
> > pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > x = READ_ONCE(pn->lruvec_stats->state[i]);
> > + } else {
> > + pr_warn_once("%s: stat item index: %d\n", __func__, idx);
> > }
>
> I think it's generally CONFIG_DEBUG_VM material. Do we have some extra
> concerns here?
>
> Having pr_warn_on_once() would be nice here.
No extra concern, I just want this indirection table to stay up to date
in the future.
On Mon, Apr 29, 2024 at 09:00:16AM -0700, Roman Gushchin wrote:
> On Fri, Apr 26, 2024 at 05:37:29PM -0700, Shakeel Butt wrote:
[...]
>
> Hm, I'm slightly worried about the performance penalty due to the increased cache
> footprint. Can't we have some formula to translate idx to memcg_idx instead of
> a translation table?
> If it requires a re-arrangement of items we can add a translation table on the
> read side to save the visible order in procfs/sysfs.
> Or I'm overthinking and the real difference is negligible?
>
No, you are not overthinking it, and the update side is very performance
sensitive. In my simple testing I do not see any regression, but I am
hoping that Intel's performance bot will tell us whether this is really
worth exploring.
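For context, the table in question is tiny and read-mostly, roughly this
shape (a sketch following the patchset's naming; details may differ):

	static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;

	static void init_memcg_stats(void)
	{
		int i, j = 0;

		for (i = 0; i < ARRAY_SIZE(memcg_node_stat_items); ++i)
			mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;

		for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
			mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
	}

	static inline int memcg_stats_index(int idx)
	{
		return mem_cgroup_stats_index[idx] - 1;	/* -1 == not tracked */
	}

So an update pays one extra load from a small __read_mostly array that
should stay cache-hot.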
> Thanks!
On Mon, Apr 29, 2024 at 10:35:38AM -0700, T.J. Mercier wrote:
> On Fri, Apr 26, 2024 at 5:37 PM Shakeel Butt <[email protected]> wrote:
> >
[...]
> > +
> > +static const unsigned int memcg_stat_items[] = {
> > + MEMCG_SWAP,
> > + MEMCG_SOCK,
> > + MEMCG_PERCPU_B,
> > + MEMCG_VMALLOC,
> > + MEMCG_KMEM,
> > + MEMCG_ZSWAP_B,
> > + MEMCG_ZSWAPPED,
> > +};
>
> Unsigned for these? All the values are positive now, but I don't think
> we'll get a build warning if a negative one ever got added, just a
> crash or corruption. BUG_ON in init_memcg_stats() if any
> memcg_stat_items[i] < 0?
We are depending on NR_VM_NODE_STAT_ITEMS to tell us the number of
elements for vmstats, so I think there is an implicit assumption that
there are no negative enums in enum node_stat_item. If we want to verify
that assumption, we should add such warnings/build-bugs in vmstat first.
On Mon, Apr 29, 2024 at 12:46:32PM -0700, Shakeel Butt wrote:
> On Mon, Apr 29, 2024 at 08:50:11AM -0700, Roman Gushchin wrote:
> > On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote:
> [...]
>
> > > + return x;
> > > +}
> > > +
> > > /* Subset of vm_event_item to report for memcg event stats */
> > > static const unsigned int memcg_vm_event_stat[] = {
> > > PGPGIN,
> > > @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> > > if (!pn)
> > > return 1;
> > >
> > > + pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> > > + node);
> >
> > Why not GFP_KERNEL_ACCOUNT?
> >
>
> Previously struct lruvec_stats was part of struct mem_cgroup_per_node,
> and we used GFP_KERNEL to allocate struct mem_cgroup_per_node. I kept the
> behavior the same; if we want to switch to GFP_KERNEL_ACCOUNT, I think it
> should be a separate patch.
Agree. Here is the patch:
--
From fd6854c0b272c5314bce6c9dee7d3c8f8cee3a86 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <[email protected]>
Date: Mon, 29 Apr 2024 13:57:26 -0700
Subject: [PATCH] mm: memcg: account memory used for memcg vmstats and lruvec
stats
The percpu memory used by memcg's memory statistics is already accounted.
For consistency, let's enable accounting for vmstats and lruvec stats
as well.
Signed-off-by: Roman Gushchin <[email protected]>
---
mm/memcontrol.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d11536ef59ef..2fe25d49cfaa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5661,8 +5661,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
- node);
+ pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats),
+ GFP_KERNEL_ACCOUNT, node);
if (!pn->lruvec_stats)
goto fail;
@@ -5733,7 +5733,8 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
goto fail;
}
- memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
+ memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
+ GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats)
goto fail;
--
2.43.2
On Mon, Apr 29, 2024 at 02:02:23PM -0700, Roman Gushchin wrote:
> On Mon, Apr 29, 2024 at 12:46:32PM -0700, Shakeel Butt wrote:
> > On Mon, Apr 29, 2024 at 08:50:11AM -0700, Roman Gushchin wrote:
> > > On Fri, Apr 26, 2024 at 05:37:28PM -0700, Shakeel Butt wrote:
> > [...]
> >
> > > > + return x;
> > > > +}
> > > > +
> > > > /* Subset of vm_event_item to report for memcg event stats */
> > > > static const unsigned int memcg_vm_event_stat[] = {
> > > > PGPGIN,
> > > > @@ -5492,18 +5546,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> > > > if (!pn)
> > > > return 1;
> > > >
> > > > + pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> > > > + node);
> > >
> > > Why not GFP_KERNEL_ACCOUNT?
> > >
> >
> > Previously struct lruvec_stats was part of struct mem_cgroup_per_node,
> > and we used GFP_KERNEL to allocate struct mem_cgroup_per_node. I kept the
> > behavior the same; if we want to switch to GFP_KERNEL_ACCOUNT, I think it
> > should be a separate patch.
>
> Agree. Here is the patch:
Awesome, I will pull this in my v3 post.
>
> --
>
> From fd6854c0b272c5314bce6c9dee7d3c8f8cee3a86 Mon Sep 17 00:00:00 2001
> From: Roman Gushchin <[email protected]>
> Date: Mon, 29 Apr 2024 13:57:26 -0700
> Subject: [PATCH] mm: memcg: account memory used for memcg vmstats and lruvec
> stats
>
> The percpu memory used by memcg's memory statistics is already accounted.
> For consistency, let's enable accounting for vmstats and lruvec stats
> as well.
>
> Signed-off-by: Roman Gushchin <[email protected]>
Acked-by: Shakeel Butt <[email protected]>
> ---
> mm/memcontrol.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d11536ef59ef..2fe25d49cfaa 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5661,8 +5661,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
> if (!pn)
> return 1;
>
> - pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
> - node);
> + pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats),
> + GFP_KERNEL_ACCOUNT, node);
> if (!pn->lruvec_stats)
> goto fail;
>
> @@ -5733,7 +5733,8 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
> goto fail;
> }
>
> - memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
> + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
> + GFP_KERNEL_ACCOUNT);
> if (!memcg->vmstats)
> goto fail;
>
> --
> 2.43.2
>
On Mon, Apr 29, 2024 at 1:13 PM Shakeel Butt <[email protected]> wrote:
>
> On Mon, Apr 29, 2024 at 10:35:38AM -0700, T.J. Mercier wrote:
> > On Fri, Apr 26, 2024 at 5:37 PM Shakeel Butt <[email protected]> wrote:
> > >
> [...]
> > > +
> > > +static const unsigned int memcg_stat_items[] = {
> > > + MEMCG_SWAP,
> > > + MEMCG_SOCK,
> > > + MEMCG_PERCPU_B,
> > > + MEMCG_VMALLOC,
> > > + MEMCG_KMEM,
> > > + MEMCG_ZSWAP_B,
> > > + MEMCG_ZSWAPPED,
> > > +};
> >
> > Unsigned for these? All the values are positive now, but I don't think
> > we'll get a build warning if a negative one ever got added, just a
> > crash or corruption. BUG_ON in init_memcg_stats() if any
> > memcg_stat_items[i] < 0?
>
> We are depending on NR_VM_NODE_STAT_ITEMS to tell us the number of
> elements for vmstats, so I think there is an implicit assumption that
> there are no negative enums in enum node_stat_item. If we want to verify
> that assumption, we should add such warnings/build-bugs in vmstat first.
Ok, fair. I guess we could do this if we get C23:
enum node_stat_item : unsigned {
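	NR_LRU_BASE,	/* first item, implicitly 0 */
	/* ... rest of the items unchanged ... */
	NR_VM_NODE_STAT_ITEMS,
};
/* Sketch only: with a fixed underlying type of unsigned (C23), an
 * enumerator like "FOO = -1" becomes a compile-time constraint
 * violation, i.e. the build-time diagnostic we are after. */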