Hi,
This patch series aims to convert all THP vmstat counters to pages
and some KiB vmstat counters to bytes.

Some vmstat counters are accounted in pages, some in bytes, some in
units of HPAGE_PMD_NR, and some in KiB. When we want to expose these
vmstat counters to userspace, we have to remember which unit each
counter uses. It makes the code complex. Because there are too many
choices, the probability of making a mistake is greater.
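
For illustration, here is a condensed view of the units currently in
play (my own annotation of a few representative entries, not the
actual enum layout):

    enum node_stat_item {
            NR_FILE_PAGES,          /* unit: pages */
            NR_ANON_THPS,           /* unit: HPAGE_PMD_NR */
            NR_SLAB_RECLAIMABLE_B,  /* unit: bytes */
            NR_KERNEL_STACK_KB,     /* unit: KiB */
            /* ... */
    };

After this series, only pages and bytes remain.
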
For example, below are some bug fixes:
- 7de2e9f195b9 ("mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg")
- not committed (it is the first commit in this series) ("mm: memcontrol: fix NR_ANON_THPS account")

This patch series makes the code simpler (161 insertions(+), 187 deletions(-))
and makes the unit of every vmstat counter either pages or bytes. Fewer
choices mean a lower probability of making mistakes :).
This was inspired by Johannes and Roman. Thanks to them.
Changes in v1 -> v2:
- Change the series subject from "Convert all THP vmstat counters to pages"
to "Convert all vmstat counters to pages or bytes".
- Convert NR_KERNEL_SCS_KB account to bytes.
- Convert vmstat slab counters to bytes.
- Remove {global_}node_page_state_pages.
Muchun Song (12):
mm: memcontrol: fix NR_ANON_THPS account
mm: memcontrol: convert NR_ANON_THPS account to pages
mm: memcontrol: convert NR_FILE_THPS account to pages
mm: memcontrol: convert NR_SHMEM_THPS account to pages
mm: memcontrol: convert NR_SHMEM_PMDMAPPED account to pages
mm: memcontrol: convert NR_FILE_PMDMAPPED account to pages
mm: memcontrol: convert kernel stack account to bytes
mm: memcontrol: convert NR_KERNEL_SCS_KB account to bytes
mm: memcontrol: convert vmstat slab counters to bytes
mm: memcontrol: scale stat_threshold for byte-sized vmstat
mm: memcontrol: make the slab calculation consistent
mm: memcontrol: remove {global_}node_page_state_pages
drivers/base/node.c | 25 ++++-----
fs/proc/meminfo.c | 22 ++++----
include/linux/mmzone.h | 21 +++-----
include/linux/vmstat.h | 21 ++------
kernel/fork.c | 8 +--
kernel/power/snapshot.c | 2 +-
kernel/scs.c | 4 +-
mm/filemap.c | 4 +-
mm/huge_memory.c | 9 ++--
mm/khugepaged.c | 4 +-
mm/memcontrol.c | 131 ++++++++++++++++++++++++------------------------
mm/oom_kill.c | 2 +-
mm/page_alloc.c | 17 +++----
mm/rmap.c | 19 ++++---
mm/shmem.c | 3 +-
mm/vmscan.c | 2 +-
mm/vmstat.c | 54 ++++++++------------
17 files changed, 161 insertions(+), 187 deletions(-)
--
2.11.0
Convert the NR_ANON_THPS account to pages: each mapped THP now
contributes HPAGE_PMD_NR to the counter instead of 1.
Signed-off-by: Muchun Song <[email protected]>
---
drivers/base/node.c | 3 +--
fs/proc/meminfo.c | 2 +-
mm/huge_memory.c | 3 ++-
mm/memcontrol.c | 20 ++++++--------------
mm/page_alloc.c | 2 +-
mm/rmap.c | 7 ++++---
6 files changed, 15 insertions(+), 22 deletions(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6ffa470e2984..7ebe4c2f64d1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -461,8 +461,7 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(sunreclaimable)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
,
- nid, K(node_page_state(pgdat, NR_ANON_THPS) *
- HPAGE_PMD_NR),
+ nid, K(node_page_state(pgdat, NR_ANON_THPS)),
nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
HPAGE_PMD_NR),
nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 887a5532e449..1f7e1945c313 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -129,7 +129,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
show_val_kb(m, "AnonHugePages: ",
- global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);
+ global_node_page_state(NR_ANON_THPS));
show_val_kb(m, "ShmemHugePages: ",
global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
show_val_kb(m, "ShmemPmdMapped: ",
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c94dfc0cc1a3..1a13e1dab3ec 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2178,7 +2178,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
lock_page_memcg(page);
if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
/* Last compound_mapcount is gone. */
- __dec_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS,
+ -HPAGE_PMD_NR);
if (TestClearPageDoubleMap(page)) {
/* No need in mapcount reference anymore */
for (i = 0; i < HPAGE_PMD_NR; i++)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 695dedf8687a..2700c1db5a1a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1513,7 +1513,7 @@ static struct memory_stat memory_stats[] = {
* on some architectures, the macro of HPAGE_PMD_SIZE is not
* constant(e.g. powerpc).
*/
- { "anon_thp", 0, NR_ANON_THPS },
+ { "anon_thp", PAGE_SIZE, NR_ANON_THPS },
{ "file_thp", 0, NR_FILE_THPS },
{ "shmem_thp", 0, NR_SHMEM_THPS },
#endif
@@ -1546,8 +1546,7 @@ static int __init memory_stats_init(void)
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memory_stats[i].idx == NR_ANON_THPS ||
- memory_stats[i].idx == NR_FILE_THPS ||
+ if (memory_stats[i].idx == NR_FILE_THPS ||
memory_stats[i].idx == NR_SHMEM_THPS)
memory_stats[i].ratio = HPAGE_PMD_SIZE;
#endif
@@ -4069,10 +4068,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memcg1_stats[i] == NR_ANON_THPS)
- nr *= HPAGE_PMD_NR;
-#endif
seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
}
@@ -4103,10 +4098,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memcg1_stats[i] == NR_ANON_THPS)
- nr *= HPAGE_PMD_NR;
-#endif
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
(u64)nr * PAGE_SIZE);
}
@@ -5634,10 +5625,11 @@ static int mem_cgroup_move_account(struct page *page,
__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
if (PageTransHuge(page)) {
- __dec_lruvec_state(from_vec, NR_ANON_THPS);
- __inc_lruvec_state(to_vec, NR_ANON_THPS);
+ __mod_lruvec_state(from_vec, NR_ANON_THPS,
+ -nr_pages);
+ __mod_lruvec_state(to_vec, NR_ANON_THPS,
+ nr_pages);
}
-
}
} else {
__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 56e603eea1dd..f97ca98d361f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5570,7 +5570,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
* HPAGE_PMD_NR),
- K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_ANON_THPS)),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
node_page_state(pgdat, NR_KERNEL_STACK_KB),
diff --git a/mm/rmap.c b/mm/rmap.c
index 08c56aaf72eb..f59e92e26b61 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1144,7 +1144,8 @@ void do_page_add_anon_rmap(struct page *page,
* disabled.
*/
if (compound)
- __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS,
+ HPAGE_PMD_NR);
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
@@ -1186,7 +1187,7 @@ void page_add_new_anon_rmap(struct page *page,
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
- __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, HPAGE_PMD_NR);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -1292,7 +1293,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __dec_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, -HPAGE_PMD_NR);
if (TestClearPageDoubleMap(page)) {
/*
--
2.11.0
The global and per-node slab counters are stored in pages, whereas the
memcg and lruvec slab counters are stored in bytes. This mixed scheme
looks weird, so convert all vmstat slab counters to bytes.
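
After this change the counters are stored in bytes internally, and a
page-granular value is derived at read time, roughly like this (a
sketch of the accessor behavior, mirroring the diff below):

    /* slab counters are stored in bytes; shift back to pages on read */
    unsigned long pages = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
    /* i.e. atomic_long_read(&pgdat->vm_stat[item]) >> PAGE_SHIFT */
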
Signed-off-by: Muchun Song <[email protected]>
---
include/linux/vmstat.h | 17 ++++++++++-------
mm/vmstat.c | 21 ++++++++++-----------
2 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 322dcbfcc933..fd1a3d5d4926 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -197,18 +197,26 @@ static inline
unsigned long global_node_page_state_pages(enum node_stat_item item)
{
long x = atomic_long_read(&vm_node_stat[item]);
+
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
#endif
+ if (vmstat_item_in_bytes(item))
+ x >>= PAGE_SHIFT;
return x;
}
static inline unsigned long global_node_page_state(enum node_stat_item item)
{
- VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+ long x = atomic_long_read(&vm_node_stat[item]);
- return global_node_page_state_pages(item);
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
}
static inline unsigned long zone_page_state(struct zone *zone,
@@ -312,11 +320,6 @@ static inline void __mod_zone_page_state(struct zone *zone,
static inline void __mod_node_page_state(struct pglist_data *pgdat,
enum node_stat_item item, int delta)
{
- if (vmstat_item_in_bytes(item)) {
- VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
- delta >>= PAGE_SHIFT;
- }
-
node_page_state_add(delta, pgdat, item);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8d77ee426e22..7fb0c7cb9516 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -345,11 +345,6 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long x;
long t;
- if (vmstat_item_in_bytes(item)) {
- VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
- delta >>= PAGE_SHIFT;
- }
-
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -554,11 +549,6 @@ static inline void mod_node_state(struct pglist_data *pgdat,
s8 __percpu *p = pcp->vm_node_stat_diff + item;
long o, n, t, z;
- if (vmstat_item_in_bytes(item)) {
- VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
- delta >>= PAGE_SHIFT;
- }
-
do {
z = 0; /* overflow to node counters */
@@ -1012,19 +1002,28 @@ unsigned long node_page_state_pages(struct pglist_data *pgdat,
enum node_stat_item item)
{
long x = atomic_long_read(&pgdat->vm_stat[item]);
+
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
#endif
+ if (vmstat_item_in_bytes(item))
+ x >>= PAGE_SHIFT;
return x;
}
unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item)
{
+ long x = atomic_long_read(&pgdat->vm_stat[item]);
+
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
- return node_page_state_pages(pgdat, item);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
}
#endif
--
2.11.0
Convert the NR_KERNEL_SCS_KB account to bytes.
Signed-off-by: Muchun Song <[email protected]>
---
drivers/base/node.c | 2 +-
fs/proc/meminfo.c | 2 +-
include/linux/mmzone.h | 2 +-
kernel/scs.c | 4 ++--
mm/page_alloc.c | 2 +-
5 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 92a75bad35c9..bc01ce0b2fcd 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -448,7 +448,7 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(i.sharedram),
nid, node_page_state(pgdat, NR_KERNEL_STACK_B) / SZ_1K,
#ifdef CONFIG_SHADOW_CALL_STACK
- nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
+ nid, node_page_state(pgdat, NR_KERNEL_SCS_B) / SZ_1K,
#endif
nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
nid, 0UL,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 799a537d4218..69895e83d4fc 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -104,7 +104,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_node_page_state(NR_KERNEL_STACK_B) / SZ_1K);
#ifdef CONFIG_SHADOW_CALL_STACK
seq_printf(m, "ShadowCallStack:%8lu kB\n",
- global_node_page_state(NR_KERNEL_SCS_KB));
+ global_node_page_state(NR_KERNEL_SCS_B) / SZ_1K);
#endif
show_val_kb(m, "PageTables: ",
global_zone_page_state(NR_PAGETABLE));
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bd34416293ec..1f9c83778629 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -204,7 +204,7 @@ enum node_stat_item {
NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */
NR_KERNEL_STACK_B, /* measured in byte */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
- NR_KERNEL_SCS_KB, /* measured in KiB */
+ NR_KERNEL_SCS_B, /* measured in byte */
#endif
NR_VM_NODE_STAT_ITEMS
};
diff --git a/kernel/scs.c b/kernel/scs.c
index 4ff4a7ba0094..8db89c932ddc 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -17,8 +17,8 @@ static void __scs_account(void *s, int account)
{
struct page *scs_page = virt_to_page(s);
- mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
- account * (SCS_SIZE / SZ_1K));
+ mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_B,
+ account * SCS_SIZE);
}
static void *scs_alloc(int node)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d2821ba7f682..58916b3afdab 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5574,7 +5574,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
node_page_state(pgdat, NR_KERNEL_STACK_B) / SZ_1K,
#ifdef CONFIG_SHADOW_CALL_STACK
- node_page_state(pgdat, NR_KERNEL_SCS_KB),
+ node_page_state(pgdat, NR_KERNEL_SCS_B) / SZ_1K,
#endif
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
--
2.11.0
Now that the unit of every vmstat counter is either pages or bytes, we
can make node_page_state() always return values in pages and remove
node_page_state_pages().
Signed-off-by: Muchun Song <[email protected]>
---
drivers/base/node.c | 10 +++++-----
fs/proc/meminfo.c | 12 ++++++------
include/linux/vmstat.h | 17 +----------------
kernel/power/snapshot.c | 2 +-
mm/oom_kill.c | 2 +-
mm/page_alloc.c | 10 +++++-----
mm/vmscan.c | 2 +-
mm/vmstat.c | 23 ++++++-----------------
8 files changed, 26 insertions(+), 52 deletions(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index bc01ce0b2fcd..42298e3552e5 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -374,8 +374,8 @@ static ssize_t node_read_meminfo(struct device *dev,
unsigned long sreclaimable, sunreclaimable;
si_meminfo_node(&i, nid);
- sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B);
- sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B);
+ sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE_B);
+ sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE_B);
len = sysfs_emit_at(buf, len,
"Node %d MemTotal: %8lu kB\n"
"Node %d MemFree: %8lu kB\n"
@@ -446,9 +446,9 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
nid, K(i.sharedram),
- nid, node_page_state(pgdat, NR_KERNEL_STACK_B) / SZ_1K,
+ nid, K(node_page_state(pgdat, NR_KERNEL_STACK_B)),
#ifdef CONFIG_SHADOW_CALL_STACK
- nid, node_page_state(pgdat, NR_KERNEL_SCS_B) / SZ_1K,
+ nid, K(node_page_state(pgdat, NR_KERNEL_SCS_B)),
#endif
nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
nid, 0UL,
@@ -517,7 +517,7 @@ static ssize_t node_read_vmstat(struct device *dev,
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
len += sysfs_emit_at(buf, len, "%s %lu\n",
node_stat_name(i),
- node_page_state_pages(pgdat, i));
+ node_page_state(pgdat, i));
return len;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 69895e83d4fc..95ea5f062161 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -52,8 +52,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
- sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
- sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
+ sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE_B);
+ sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE_B);
show_val_kb(m, "MemTotal: ", i.totalram);
show_val_kb(m, "MemFree: ", i.freeram);
@@ -100,11 +100,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Slab: ", sreclaimable + sunreclaim);
show_val_kb(m, "SReclaimable: ", sreclaimable);
show_val_kb(m, "SUnreclaim: ", sunreclaim);
- seq_printf(m, "KernelStack: %8lu kB\n",
- global_node_page_state(NR_KERNEL_STACK_B) / SZ_1K);
+ show_val_kb(m, "KernelStack: ",
+ global_node_page_state(NR_KERNEL_STACK_B));
#ifdef CONFIG_SHADOW_CALL_STACK
- seq_printf(m, "ShadowCallStack:%8lu kB\n",
- global_node_page_state(NR_KERNEL_SCS_B) / SZ_1K);
+ show_val_kb(m, "ShadowCallStack:",
+ global_node_page_state(NR_KERNEL_SCS_B));
#endif
show_val_kb(m, "PageTables: ",
global_zone_page_state(NR_PAGETABLE));
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index afd84dc2398c..ae821e016fdd 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -193,8 +193,7 @@ static inline unsigned long global_zone_page_state(enum zone_stat_item item)
return x;
}
-static inline
-unsigned long global_node_page_state_pages(enum node_stat_item item)
+static inline unsigned long global_node_page_state(enum node_stat_item item)
{
long x = atomic_long_read(&vm_node_stat[item]);
@@ -207,17 +206,6 @@ unsigned long global_node_page_state_pages(enum node_stat_item item)
return x;
}
-static inline unsigned long global_node_page_state(enum node_stat_item item)
-{
- long x = atomic_long_read(&vm_node_stat[item]);
-
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- return x;
-}
-
static inline unsigned long zone_page_state(struct zone *zone,
enum zone_stat_item item)
{
@@ -258,12 +246,9 @@ extern unsigned long sum_zone_node_page_state(int node,
extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item);
-extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
- enum node_stat_item item);
#else
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
-#define node_page_state_pages(node, item) global_node_page_state_pages(item)
#endif /* CONFIG_NUMA */
#ifdef CONFIG_SMP
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d63560e1cf87..664520bdaa20 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1705,7 +1705,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
+ size = global_node_page_state(NR_SLAB_RECLAIMABLE_B)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 04b19b7b5435..73861473c7d4 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -188,7 +188,7 @@ static bool should_dump_unreclaim_slab(void)
global_node_page_state(NR_ISOLATED_FILE) +
global_node_page_state(NR_UNEVICTABLE);
- return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
+ return (global_node_page_state(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 58916b3afdab..d16c9388c0b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5372,7 +5372,7 @@ long si_mem_available(void)
* items that are in use, and cannot be freed. Cap this estimate at the
* low watermark.
*/
- reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+ reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE_B) +
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
available += reclaimable - min(reclaimable / 2, wmark_low);
@@ -5516,8 +5516,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_UNEVICTABLE),
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
- global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
- global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
+ global_node_page_state(NR_SLAB_RECLAIMABLE_B),
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE_B),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_zone_page_state(NR_PAGETABLE),
@@ -5572,9 +5572,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_ANON_THPS)),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
- node_page_state(pgdat, NR_KERNEL_STACK_B) / SZ_1K,
+ K(node_page_state(pgdat, NR_KERNEL_STACK_B)),
#ifdef CONFIG_SHADOW_CALL_STACK
- node_page_state(pgdat, NR_KERNEL_SCS_B) / SZ_1K,
+ K(node_page_state(pgdat, NR_KERNEL_SCS_B)),
#endif
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 469016222cdb..5d3c8fa68979 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4220,7 +4220,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
* unmapped file backed pages.
*/
if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
- node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
+ node_page_state(pgdat, NR_SLAB_RECLAIMABLE_B) <=
pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 25751b1d8e2e..b7cdef585efd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1000,22 +1000,9 @@ unsigned long sum_zone_numa_state(int node,
}
/*
- * Determine the per node value of a stat item.
+ * Determine the per node value of a stat item. This always returns
+ * values in pages.
*/
-unsigned long node_page_state_pages(struct pglist_data *pgdat,
- enum node_stat_item item)
-{
- long x = atomic_long_read(&pgdat->vm_stat[item]);
-
-#ifdef CONFIG_SMP
- if (x < 0)
- x = 0;
-#endif
- if (vmstat_item_in_bytes(item))
- x >>= PAGE_SHIFT;
- return x;
-}
-
unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item)
{
@@ -1025,6 +1012,8 @@ unsigned long node_page_state(struct pglist_data *pgdat,
if (x < 0)
x = 0;
#endif
+ if (vmstat_item_in_bytes(item))
+ x >>= PAGE_SHIFT;
return x;
}
#endif
@@ -1626,7 +1615,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
seq_printf(m, "\n %-12s %lu", node_stat_name(i),
- node_page_state_pages(pgdat, i));
+ node_page_state(pgdat, i));
}
}
seq_printf(m,
@@ -1747,7 +1736,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
#endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- v[i] = global_node_page_state_pages(i);
+ v[i] = global_node_page_state(i);
v += NR_VM_NODE_STAT_ITEMS;
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
--
2.11.0
Convert the NR_FILE_PMDMAPPED account to pages.
Signed-off-by: Muchun Song <[email protected]>
---
drivers/base/node.c | 3 +--
fs/proc/meminfo.c | 2 +-
mm/rmap.c | 6 ++++--
3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index e5abc6144dab..f77652e6339f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -465,8 +465,7 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_SHMEM_THPS)),
nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
nid, K(node_page_state(pgdat, NR_FILE_THPS)),
- nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
- HPAGE_PMD_NR)
+ nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))
#endif
);
len += hugetlb_report_node_meminfo(buf, len, nid);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 84886b2cc2f7..5a83012d8b72 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -137,7 +137,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "FileHugePages: ",
global_node_page_state(NR_FILE_THPS));
show_val_kb(m, "FilePmdMapped: ",
- global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
+ global_node_page_state(NR_FILE_PMDMAPPED));
#endif
#ifdef CONFIG_CMA
diff --git a/mm/rmap.c b/mm/rmap.c
index 3089ad6bf468..e383c5619501 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1222,7 +1222,8 @@ void page_add_file_rmap(struct page *page, bool compound)
__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
HPAGE_PMD_NR);
else
- __inc_node_page_state(page, NR_FILE_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+ HPAGE_PMD_NR);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1264,7 +1265,8 @@ static void page_remove_file_rmap(struct page *page, bool compound)
__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
-HPAGE_PMD_NR);
else
- __dec_node_page_state(page, NR_FILE_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+ -HPAGE_PMD_NR);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
return;
--
2.11.0
Some vmstat counters are accounted in bytes rather than pages, so the
stat_threshold should also be scaled to bytes for those items.

The memcg vmstat counters are already of type long (see struct
lruvec_stat). The global per-node percpu counters could also be widened
to long, but since the maximum vmstat threshold is 125, an s32 is wide
enough.
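
(A quick sanity check on the s32 choice, assuming 4 KiB pages: the
byte-scaled threshold is at most 125 << PAGE_SHIFT = 125 * 4096 =
512000, which fits comfortably in an s32 but is far beyond the 127 an
s8 can hold.)
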
Signed-off-by: Muchun Song <[email protected]>
---
include/linux/mmzone.h | 17 ++++++-----------
include/linux/vmstat.h | 1 -
mm/vmstat.c | 24 +++++++++++++-----------
3 files changed, 19 insertions(+), 23 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1f9c83778629..d53328551225 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -216,17 +216,12 @@ enum node_stat_item {
*/
static __always_inline bool vmstat_item_in_bytes(int idx)
{
- /*
- * Global and per-node slab counters track slab pages.
- * It's expected that changes are multiples of PAGE_SIZE.
- * Internally values are stored in pages.
- *
- * Per-memcg and per-lruvec counters track memory, consumed
- * by individual slab objects. These counters are actually
- * byte-precise.
- */
return (idx == NR_SLAB_RECLAIMABLE_B ||
- idx == NR_SLAB_UNRECLAIMABLE_B);
+ idx == NR_SLAB_UNRECLAIMABLE_B ||
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+ idx == NR_KERNEL_SCS_B ||
+#endif
+ idx == NR_KERNEL_STACK_B);
}
/*
@@ -340,7 +335,7 @@ struct per_cpu_pageset {
struct per_cpu_nodestat {
s8 stat_threshold;
- s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
+ s32 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};
#endif /* !__GENERATING_BOUNDS.H */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index fd1a3d5d4926..afd84dc2398c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -211,7 +211,6 @@ static inline unsigned long global_node_page_state(enum node_stat_item item)
{
long x = atomic_long_read(&vm_node_stat[item]);
- VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7fb0c7cb9516..25751b1d8e2e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -341,13 +341,15 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long delta)
{
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
- s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ s32 __percpu *p = pcp->vm_node_stat_diff + item;
long x;
long t;
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
+ if (vmstat_item_in_bytes(item))
+ t <<= PAGE_SHIFT;
if (unlikely(abs(x) > t)) {
node_page_state_add(x, pgdat, item);
@@ -399,15 +401,15 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
- s8 __percpu *p = pcp->vm_node_stat_diff + item;
- s8 v, t;
+ s32 __percpu *p = pcp->vm_node_stat_diff + item;
+ s32 v, t;
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
- s8 overstep = t >> 1;
+ s32 overstep = t >> 1;
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
@@ -445,8 +447,8 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
- s8 __percpu *p = pcp->vm_node_stat_diff + item;
- s8 v, t;
+ s32 __percpu *p = pcp->vm_node_stat_diff + item;
+ s32 v, t;
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
@@ -546,7 +548,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
enum node_stat_item item, int delta, int overstep_mode)
{
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
- s8 __percpu *p = pcp->vm_node_stat_diff + item;
+ s32 __percpu *p = pcp->vm_node_stat_diff + item;
long o, n, t, z;
do {
@@ -563,6 +565,8 @@ static inline void mod_node_state(struct pglist_data *pgdat,
* for all cpus in a node.
*/
t = this_cpu_read(pcp->stat_threshold);
+ if (vmstat_item_in_bytes(item))
+ t <<= PAGE_SHIFT;
o = this_cpu_read(*p);
n = delta + o;
@@ -829,7 +833,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
- int v;
+ s32 v;
v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
if (v) {
@@ -899,7 +903,7 @@ void cpu_vm_stats_fold(int cpu)
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
if (p->vm_node_stat_diff[i]) {
- int v;
+ s32 v;
v = p->vm_node_stat_diff[i];
p->vm_node_stat_diff[i] = 0;
@@ -1017,8 +1021,6 @@ unsigned long node_page_state(struct pglist_data *pgdat,
{
long x = atomic_long_read(&pgdat->vm_stat[item]);
- VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
-
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
--
2.11.0
Although the ratio of the slab counters is one, we should read the
ratio from the related memory_stats entry instead of hard-coding it.
Also, the local variable size already holds the slab_unreclaimable
value, so there is no need to read it again.

The unit of each vmstat counter is now either pages or bytes, so we can
drop the ratio field from struct memory_stat. This makes the code
cleaner and simpler, and gets rid of the awkward mix of static and
runtime initialization of the memory_stats table.
Signed-off-by: Muchun Song <[email protected]>
---
mm/memcontrol.c | 108 ++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 70 insertions(+), 38 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 48d70c1ad301..49fbcf003bf5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1493,48 +1493,71 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
struct memory_stat {
const char *name;
- unsigned int ratio;
unsigned int idx;
};
static const struct memory_stat memory_stats[] = {
- { "anon", PAGE_SIZE, NR_ANON_MAPPED },
- { "file", PAGE_SIZE, NR_FILE_PAGES },
- { "kernel_stack", 1, NR_KERNEL_STACK_B },
- { "percpu", 1, MEMCG_PERCPU_B },
- { "sock", PAGE_SIZE, MEMCG_SOCK },
- { "shmem", PAGE_SIZE, NR_SHMEM },
- { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
- { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
- { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+ { "anon", NR_ANON_MAPPED },
+ { "file", NR_FILE_PAGES },
+ { "kernel_stack", NR_KERNEL_STACK_B },
+ { "percpu", MEMCG_PERCPU_B },
+ { "sock", MEMCG_SOCK },
+ { "shmem", NR_SHMEM },
+ { "file_mapped", NR_FILE_MAPPED },
+ { "file_dirty", NR_FILE_DIRTY },
+ { "file_writeback", NR_WRITEBACK },
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- { "anon_thp", PAGE_SIZE, NR_ANON_THPS },
- { "file_thp", PAGE_SIZE, NR_FILE_THPS },
- { "shmem_thp", PAGE_SIZE, NR_SHMEM_THPS },
+ { "anon_thp", NR_ANON_THPS },
+ { "file_thp", NR_FILE_THPS },
+ { "shmem_thp", NR_SHMEM_THPS },
#endif
- { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
- { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
- { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
- { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
- { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
-
- /*
- * Note: The slab_reclaimable and slab_unreclaimable must be
- * together and slab_reclaimable must be in front.
- */
- { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
- { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+ { "inactive_anon", NR_INACTIVE_ANON },
+ { "active_anon", NR_ACTIVE_ANON },
+ { "inactive_file", NR_INACTIVE_FILE },
+ { "active_file", NR_ACTIVE_FILE },
+ { "unevictable", NR_UNEVICTABLE },
+ { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
/* The memory events */
- { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
- { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
- { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
- { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
- { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
- { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
- { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+ { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
};
+/* Translate stat items to the correct unit for memory.stat output */
+static int memcg_page_state_unit(int item)
+{
+ int unit;
+
+ switch (item) {
+ case WORKINGSET_REFAULT_ANON:
+ case WORKINGSET_REFAULT_FILE:
+ case WORKINGSET_ACTIVATE_ANON:
+ case WORKINGSET_ACTIVATE_FILE:
+ case WORKINGSET_RESTORE_ANON:
+ case WORKINGSET_RESTORE_FILE:
+ case WORKINGSET_NODERECLAIM:
+ unit = 1;
+ break;
+ default:
+ unit = memcg_stat_item_in_bytes(item) ? 1 : PAGE_SIZE;
+ break;
+ }
+
+ return unit;
+}
+
+static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
+ int item)
+{
+ return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
+}
+
static char *memory_stat_format(struct mem_cgroup *memcg)
{
struct seq_buf s;
@@ -1558,13 +1581,16 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
- size = memcg_page_state(memcg, memory_stats[i].idx);
- size *= memory_stats[i].ratio;
+ size = memcg_page_state_output(memcg, memory_stats[i].idx);
seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
+ /*
+ * We are printing reclaimable, unreclaimable of the slab
+ * and the sum of both.
+ */
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
- size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ size += memcg_page_state_output(memcg,
+ NR_SLAB_RECLAIMABLE_B);
seq_buf_printf(&s, "slab %llu\n", size);
}
}
@@ -6358,6 +6384,12 @@ static int memory_stat_show(struct seq_file *m, void *v)
}
#ifdef CONFIG_NUMA
+static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
+ int item)
+{
+ return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
+}
+
static int memory_numa_stat_show(struct seq_file *m, void *v)
{
int i;
@@ -6375,8 +6407,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- size = lruvec_page_state(lruvec, memory_stats[i].idx);
- size *= memory_stats[i].ratio;
+ size = lruvec_page_state_output(lruvec,
+ memory_stats[i].idx);
seq_printf(m, " N%d=%llu", nid, size);
}
seq_putc(m, '\n');
--
2.11.0
On Sun 06-12-20 18:14:39, Muchun Song wrote:
> Hi,
>
> This patch series aims to convert all THP vmstat counters to pages
> and some KiB vmstat counters to bytes.
>
> Some vmstat counters are accounted in pages, some in bytes, some in
> units of HPAGE_PMD_NR, and some in KiB. When we want to expose these
> vmstat counters to userspace, we have to remember which unit each
> counter uses. It makes the code complex. Because there are too many
> choices, the probability of making a mistake is greater.
>
> For example, below are some bug fixes:
> - 7de2e9f195b9 ("mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg")
> - not committed (it is the first commit in this series) ("mm: memcontrol: fix NR_ANON_THPS account")
>
> This patch series makes the code simpler (161 insertions(+), 187 deletions(-))
> and makes the unit of every vmstat counter either pages or bytes. Fewer
> choices mean a lower probability of making mistakes :).
>
> This was inspired by Johannes and Roman. Thanks to them.
It would be really great if you could summarize the state before and
after the patches so that exceptions are clear and easier to review. The
existing situation is rather convoluted but we have at least units part
of the name so it is not too hard to notice that. Reducing exceptions
sounds nice but I am not really sure it is such an improvement that it is
worth a lot of code churn. Especially when it comes to KB vs B. Counting
THPs as regular pages sounds like a good plan to me because we can
expect that THP will be of a different size in the future - especially
for file THPs.
> Changes in v1 -> v2:
> - Change the series subject from "Convert all THP vmstat counters to pages"
> to "Convert all vmstat counters to pages or bytes".
> - Convert NR_KERNEL_SCS_KB account to bytes.
> - Convert vmstat slab counters to bytes.
> - Remove {global_}node_page_state_pages.
>
> Muchun Song (12):
> mm: memcontrol: fix NR_ANON_THPS account
> mm: memcontrol: convert NR_ANON_THPS account to pages
> mm: memcontrol: convert NR_FILE_THPS account to pages
> mm: memcontrol: convert NR_SHMEM_THPS account to pages
> mm: memcontrol: convert NR_SHMEM_PMDMAPPED account to pages
> mm: memcontrol: convert NR_FILE_PMDMAPPED account to pages
> mm: memcontrol: convert kernel stack account to bytes
> mm: memcontrol: convert NR_KERNEL_SCS_KB account to bytes
> mm: memcontrol: convert vmstat slab counters to bytes
> mm: memcontrol: scale stat_threshold for byte-sized vmstat
> mm: memcontrol: make the slab calculation consistent
> mm: memcontrol: remove {global_}node_page_state_pages
>
> drivers/base/node.c | 25 ++++-----
> fs/proc/meminfo.c | 22 ++++----
> include/linux/mmzone.h | 21 +++-----
> include/linux/vmstat.h | 21 ++------
> kernel/fork.c | 8 +--
> kernel/power/snapshot.c | 2 +-
> kernel/scs.c | 4 +-
> mm/filemap.c | 4 +-
> mm/huge_memory.c | 9 ++--
> mm/khugepaged.c | 4 +-
> mm/memcontrol.c | 131 ++++++++++++++++++++++++------------------------
> mm/oom_kill.c | 2 +-
> mm/page_alloc.c | 17 +++----
> mm/rmap.c | 19 ++++---
> mm/shmem.c | 3 +-
> mm/vmscan.c | 2 +-
> mm/vmstat.c | 54 ++++++++------------
> 17 files changed, 161 insertions(+), 187 deletions(-)
>
> --
> 2.11.0
--
Michal Hocko
SUSE Labs
On Mon, Dec 7, 2020 at 9:00 PM Michal Hocko <[email protected]> wrote:
>
> On Sun 06-12-20 18:14:39, Muchun Song wrote:
> > Hi,
> >
> > This patch series aims to convert all THP vmstat counters to pages
> > and some KiB vmstat counters to bytes.
> >
> > Some vmstat counters are accounted in pages, some in bytes, some in
> > units of HPAGE_PMD_NR, and some in KiB. When we want to expose these
> > vmstat counters to userspace, we have to remember which unit each
> > counter uses. It makes the code complex. Because there are too many
> > choices, the probability of making a mistake is greater.
> >
> > For example, below are some bug fixes:
> > - 7de2e9f195b9 ("mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg")
> > - not committed (it is the first commit in this series) ("mm: memcontrol: fix NR_ANON_THPS account")
> >
> > This patch series makes the code simpler (161 insertions(+), 187 deletions(-))
> > and makes the unit of every vmstat counter either pages or bytes. Fewer
> > choices mean a lower probability of making mistakes :).
> >
> > This was inspired by Johannes and Roman. Thanks to them.
>
> It would be really great if you could summarize the state before and
> after the patches so that exceptions are clear and easier to review. The
Agree. Will do in the next version. Thanks.
> existing situation is rather convoluted but we have at least units part
> of the name so it is not too hard to notice that. Reducing exceptions
> sounds nice but I am not really sure it is such an improvement that it is
> worth a lot of code churn. Especially when it comes to KB vs B. Counting
There are two vmstat counters (NR_KERNEL_STACK_KB and
NR_KERNEL_SCS_KB) whose units are KB. If we do this, all
vmstat counter units are either pages or bytes in the end. When
we expose those counters to userspace, it becomes easy. You can
refer to:
[RESEND PATCH v2 11/12] mm: memcontrol: make the slab calculation consistent
From this point of view, I think that it is worth doing this. Right?
> THPs as regular pages sounds like a good plan to me because we can
> expect that THP will be of a different size in the future - especially
> for file THPs.

It can be easy to convert.
>
> > Changes in v1 -> v2:
> > - Change the series subject from "Convert all THP vmstat counters to pages"
> > to "Convert all vmstat counters to pages or bytes".
> > - Convert NR_KERNEL_SCS_KB account to bytes.
> > - Convert vmstat slab counters to bytes.
> > - Remove {global_}node_page_state_pages.
> >
> > Muchun Song (12):
> > mm: memcontrol: fix NR_ANON_THPS account
> > mm: memcontrol: convert NR_ANON_THPS account to pages
> > mm: memcontrol: convert NR_FILE_THPS account to pages
> > mm: memcontrol: convert NR_SHMEM_THPS account to pages
> > mm: memcontrol: convert NR_SHMEM_PMDMAPPED account to pages
> > mm: memcontrol: convert NR_FILE_PMDMAPPED account to pages
> > mm: memcontrol: convert kernel stack account to bytes
> > mm: memcontrol: convert NR_KERNEL_SCS_KB account to bytes
> > mm: memcontrol: convert vmstat slab counters to bytes
> > mm: memcontrol: scale stat_threshold for byte-sized vmstat
> > mm: memcontrol: make the slab calculation consistent
> > mm: memcontrol: remove {global_}node_page_state_pages
> >
> > drivers/base/node.c | 25 ++++-----
> > fs/proc/meminfo.c | 22 ++++----
> > include/linux/mmzone.h | 21 +++-----
> > include/linux/vmstat.h | 21 ++------
> > kernel/fork.c | 8 +--
> > kernel/power/snapshot.c | 2 +-
> > kernel/scs.c | 4 +-
> > mm/filemap.c | 4 +-
> > mm/huge_memory.c | 9 ++--
> > mm/khugepaged.c | 4 +-
> > mm/memcontrol.c | 131 ++++++++++++++++++++++++------------------------
> > mm/oom_kill.c | 2 +-
> > mm/page_alloc.c | 17 +++----
> > mm/rmap.c | 19 ++++---
> > mm/shmem.c | 3 +-
> > mm/vmscan.c | 2 +-
> > mm/vmstat.c | 54 ++++++++------------
> > 17 files changed, 161 insertions(+), 187 deletions(-)
> >
> > --
> > 2.11.0
>
> --
> Michal Hocko
> SUSE Labs
--
Yours,
Muchun
On Mon 07-12-20 22:52:30, Muchun Song wrote:
> On Mon, Dec 7, 2020 at 9:00 PM Michal Hocko <[email protected]> wrote:
> >
> > On Sun 06-12-20 18:14:39, Muchun Song wrote:
> > > Hi,
> > >
> > > This patch series aims to convert all THP vmstat counters to pages
> > > and some KiB vmstat counters to bytes.
> > >
> > > Some vmstat counters are accounted in pages, some in bytes, some in
> > > units of HPAGE_PMD_NR, and some in KiB. When we want to expose these
> > > vmstat counters to userspace, we have to remember which unit each
> > > counter uses. It makes the code complex. Because there are too many
> > > choices, the probability of making a mistake is greater.
> > >
> > > For example, below are some bug fixes:
> > > - 7de2e9f195b9 ("mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg")
> > > - not committed (it is the first commit in this series) ("mm: memcontrol: fix NR_ANON_THPS account")
> > >
> > > This patch series makes the code simpler (161 insertions(+), 187 deletions(-))
> > > and makes the unit of every vmstat counter either pages or bytes. Fewer
> > > choices mean a lower probability of making mistakes :).
> > >
> > > This was inspired by Johannes and Roman. Thanks to them.
> >
> > It would be really great if you could summarize the state before and
> > after the patches so that exceptions are clear and easier to review. The
>
> Agree. Will do in the next version. Thanks.
>
>
> > existing situation is rather convoluted but we have at least units part
> > of the name so it is not too hard to notice that. Reducing exceptions
> > sounds nice but I am not really sure it is such an improvement that it is
> > worth a lot of code churn. Especially when it comes to KB vs B. Counting
>
> There are two vmstat counters (NR_KERNEL_STACK_KB and
> NR_KERNEL_SCS_KB) whose units are KB. If we do this, all
> vmstat counter units are either pages or bytes in the end. When
> we expose those counters to userspace, it becomes easy. You can
> refer to:
>
> [RESEND PATCH v2 11/12] mm: memcontrol: make the slab calculation consistent
>
> From this point of view, I think that it is worth doing this. Right?
Well, unless I am missing something, we have two counters in bytes, two
in kB, both clearly distinguishable by the B/KB suffix. Changing KB to B
will certainly reduce the different classes of units, no question about
that, but I am not really sure this is worth all the code churn. Maybe
others will think otherwise.
As I've said the THP accounting change makes more sense to me because it
allows future changes which are already undergoing so there is more
merit in those.
--
Michal Hocko
SUSE Labs
On 12/7/20 7:02 AM, Michal Hocko wrote:
> On Mon 07-12-20 22:52:30, Muchun Song wrote:
>> On Mon, Dec 7, 2020 at 9:00 PM Michal Hocko <[email protected]> wrote:
>>>
>>> On Sun 06-12-20 18:14:39, Muchun Song wrote:
>>>> Hi,
>>>>
>>>> This patch series aims to convert all THP vmstat counters to pages
>>>> and some KiB vmstat counters to bytes.
>>>>
>>>> Some vmstat counters are accounted in pages, some in bytes, some in
>>>> units of HPAGE_PMD_NR, and some in KiB. When we want to expose these
>>>> vmstat counters to userspace, we have to remember which unit each
>>>> counter uses. It makes the code complex. Because there are too many
>>>> choices, the probability of making a mistake is greater.
>>>>
>>>> For example, below are some bug fixes:
>>>> - 7de2e9f195b9 ("mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg")
>>>> - not committed (it is the first commit in this series) ("mm: memcontrol: fix NR_ANON_THPS account")
>>>>
>>>> This patch series makes the code simpler (161 insertions(+), 187 deletions(-))
>>>> and makes the unit of every vmstat counter either pages or bytes. Fewer
>>>> choices mean a lower probability of making mistakes :).
>>>>
>>>> This was inspired by Johannes and Roman. Thanks to them.
>>>
>>> It would be really great if you could summarize the state before and
>>> after the patches so that exceptions are clear and easier to review. The
>>
>> Agree. Will do in the next version. Thanks.
>>
>>
>>> existing situation is rather convoluted but we have at least units part
>>> of the name so it is not too hard to notice that. Reducing exceptions
>>> sounds nice but I am not really sure it is such an improvement that it is
>>> worth a lot of code churn. Especially when it comes to KB vs B. Counting
>>
>> There are two vmstat counters (NR_KERNEL_STACK_KB and
>> NR_KERNEL_SCS_KB) whose units are KB. If we do this, all
>> vmstat counter units are either pages or bytes in the end. When
>> we expose those counters to userspace, it becomes easy. You can
>> refer to:
>>
>> [RESEND PATCH v2 11/12] mm: memcontrol: make the slab calculation consistent
>>
>> From this point of view, I think that it is worth doing this. Right?
>
> Well, unless I am missing something, we have two counters in bytes, two
> in kB, both clearly distinguishable by the B/KB suffix. Changing KB to B
> will certainly reduce the different classes of units, no question about
> that, but I am not really sure this is worth all the code churn. Maybe
> others will think otherwise.
>
> As I've said the THP accounting change makes more sense to me because it
> allows future changes which are already undergoing so there is more
> merit in those.
>
Hi,
Are there any documentation changes that go with these patches?
Or are none needed?
If the patches change the output in /proc/* or /sys/* then I expect
there would need to be some doc changes.
And is there any chance of confusing userspace s/w (binary or scripts)
with these changes?
thanks.
--
~Randy
On Sun, Dec 06, 2020 at 06:14:48PM +0800, Muchun Song wrote:
> The global and per-node slab counters are stored in pages, whereas the
> memcg and lruvec slab counters are stored in bytes. This mixed scheme
> looks weird, so convert all vmstat slab counters to bytes.
There is a reason for this weird scheme:
percpu caches (see struct per_cpu_nodestat) are s8, so counting in bytes
will lead to overflows. Switching to s32 can lead to an increase in
the cache thrashing, especially on small machines.
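
To illustrate the overflow with a standalone sketch (this is not the
kernel code; it just assumes 4 KiB pages and the 125 maximum
stat_threshold):

    #include <stdio.h>

    typedef signed char s8;

    int main(void)
    {
            s8 diff = 0;           /* per-cpu cache slot: range [-128, 127] */
            long delta = 4096;     /* one page worth of slab bytes */
            long threshold = 125;  /* maximum vmstat stat_threshold */

            /* A byte-sized delta cannot even be stored in the s8 cache,
             * and it always exceeds a page-based threshold, so every
             * update would spill into the shared atomic counter. */
            printf("fits in s8: %s\n", delta <= 127 ? "yes" : "no");
            printf("under threshold: %s\n",
                   delta + diff <= threshold ? "yes" : "no");
            return 0;
    }
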
>
> Signed-off-by: Muchun Song <[email protected]>
> ---
> include/linux/vmstat.h | 17 ++++++++++-------
> mm/vmstat.c | 21 ++++++++++-----------
> 2 files changed, 20 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> index 322dcbfcc933..fd1a3d5d4926 100644
> --- a/include/linux/vmstat.h
> +++ b/include/linux/vmstat.h
> @@ -197,18 +197,26 @@ static inline
> unsigned long global_node_page_state_pages(enum node_stat_item item)
> {
> long x = atomic_long_read(&vm_node_stat[item]);
> +
> #ifdef CONFIG_SMP
> if (x < 0)
> x = 0;
> #endif
> + if (vmstat_item_in_bytes(item))
> + x >>= PAGE_SHIFT;
> return x;
> }
>
> static inline unsigned long global_node_page_state(enum node_stat_item item)
> {
> - VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
> + long x = atomic_long_read(&vm_node_stat[item]);
>
> - return global_node_page_state_pages(item);
> + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
> +#ifdef CONFIG_SMP
> + if (x < 0)
> + x = 0;
> +#endif
> + return x;
> }
>
> static inline unsigned long zone_page_state(struct zone *zone,
> @@ -312,11 +320,6 @@ static inline void __mod_zone_page_state(struct zone *zone,
> static inline void __mod_node_page_state(struct pglist_data *pgdat,
> enum node_stat_item item, int delta)
> {
> - if (vmstat_item_in_bytes(item)) {
> - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
> - delta >>= PAGE_SHIFT;
> - }
> -
> node_page_state_add(delta, pgdat, item);
> }
>
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 8d77ee426e22..7fb0c7cb9516 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -345,11 +345,6 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
> long x;
> long t;
>
> - if (vmstat_item_in_bytes(item)) {
> - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
> - delta >>= PAGE_SHIFT;
> - }
> -
> x = delta + __this_cpu_read(*p);
>
> t = __this_cpu_read(pcp->stat_threshold);
> @@ -554,11 +549,6 @@ static inline void mod_node_state(struct pglist_data *pgdat,
> s8 __percpu *p = pcp->vm_node_stat_diff + item;
> long o, n, t, z;
>
> - if (vmstat_item_in_bytes(item)) {
> - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
> - delta >>= PAGE_SHIFT;
> - }
> -
> do {
> z = 0; /* overflow to node counters */
>
> @@ -1012,19 +1002,28 @@ unsigned long node_page_state_pages(struct pglist_data *pgdat,
> enum node_stat_item item)
> {
> long x = atomic_long_read(&pgdat->vm_stat[item]);
> +
> #ifdef CONFIG_SMP
> if (x < 0)
> x = 0;
> #endif
> + if (vmstat_item_in_bytes(item))
> + x >>= PAGE_SHIFT;
> return x;
> }
>
> unsigned long node_page_state(struct pglist_data *pgdat,
> enum node_stat_item item)
> {
> + long x = atomic_long_read(&pgdat->vm_stat[item]);
> +
> VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
>
> - return node_page_state_pages(pgdat, item);
> +#ifdef CONFIG_SMP
> + if (x < 0)
> + x = 0;
> +#endif
> + return x;
> }
> #endif
>
> --
> 2.11.0
>
On Mon, Dec 07, 2020 at 04:02:54PM +0100, Michal Hocko wrote:
> On Mon 07-12-20 22:52:30, Muchun Song wrote:
> > On Mon, Dec 7, 2020 at 9:00 PM Michal Hocko <[email protected]> wrote:
> > >
> > > On Sun 06-12-20 18:14:39, Muchun Song wrote:
> > > > Hi,
> > > >
> > > > This patch series aims to convert all THP vmstat counters to pages
> > > > and some KiB vmstat counters to bytes.
> > > >
> > > > Some vmstat counters are accounted in pages, some in bytes, some in
> > > > units of HPAGE_PMD_NR, and some in KiB. When we want to expose these
> > > > vmstat counters to userspace, we have to remember which unit each
> > > > counter uses. It makes the code complex. Because there are too many
> > > > choices, the probability of making a mistake is greater.
> > > >
> > > > For example, below are some bug fixes:
> > > > - 7de2e9f195b9 ("mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg")
> > > > - not committed (it is the first commit in this series) ("mm: memcontrol: fix NR_ANON_THPS account")
> > > >
> > > > This patch series makes the code simpler (161 insertions(+), 187 deletions(-))
> > > > and makes the unit of every vmstat counter either pages or bytes. Fewer
> > > > choices mean a lower probability of making mistakes :).
> > > >
> > > > This was inspired by Johannes and Roman. Thanks to them.
> > >
> > > It would be really great if you could summarize the state before and
> > > after the patches so that exceptions are clear and easier to review. The
> >
> > Agree. Will do in the next version. Thanks.
> >
> >
> > > existing situation is rather convoluted but we have at least units part
> > > of the name so it is not too hard to notice that. Reducing exceptions
> > > sounds nice but I am not really sure it is such an improvement that it is
> > > worth a lot of code churn. Especially when it comes to KB vs B. Counting
> >
> > There are two vmstat counters (NR_KERNEL_STACK_KB and
> > NR_KERNEL_SCS_KB) whose units are KB. If we do this, all
> > vmstat counter units are either pages or bytes in the end. When
> > we expose those counters to userspace, it becomes easy. You can
> > refer to:
> >
> > [RESEND PATCH v2 11/12] mm: memcontrol: make the slab calculation consistent
> >
> > From this point of view, I think that it is worth doing this. Right?
>
> Well, unless I am missing something, we have two counters in bytes, two
> in kB, both clearly distinguishable by the B/KB suffix. Changing KB to B
> will certainly reduce the different classes of units, no question about
> that, but I am not really sure this is worth all the code churn. Maybe
> others will think otherwise.
Even though it was me who suggested it, I do agree. It's nice to have a
smaller number of units, but if it creates a lot of hassle, then it does
not make much sense. I think we need to look at the final version of the
patches and decide whether it is worth it or not.
>
> As I've said the THP accounting change makes more sense to me because it
> allows future changes which are already undergoing so there is more
> merit in those.
+1
And this part is absolutely trivial.
On Mon, Dec 07, 2020 at 11:46:22AM -0800, Roman Gushchin wrote:
> On Sun, Dec 06, 2020 at 06:14:48PM +0800, Muchun Song wrote:
> > The global and per-node slab counters are stored in pages, whereas the
> > memcg and lruvec slab counters are stored in bytes. This mixed scheme
> > looks weird, so convert all vmstat slab counters to bytes.
>
> There is a reason for this weird scheme:
> percpu caches (see struct per_cpu_nodestat) are s8, so counting in bytes
> will lead to overflows. Switching to s32 can lead to an increase in
> the cache thrashing, especially on small machines.
JFYI:
I tried to convert all slab counters to bytes and change those s8 percpu batches to s32
about a year ago. Here is a link to that thread:
https://patchwork.kernel.org/project/linux-mm/patch/[email protected]/
Thanks!
On Mon, 7 Dec 2020, Roman Gushchin wrote:
> On Mon, Dec 07, 2020 at 04:02:54PM +0100, Michal Hocko wrote:
> >
> > As I've said the THP accounting change makes more sense to me because it
> > allows future changes which are already undergoing so there is more
> > merit in those.
>
> +1
> And this part is absolutely trivial.
It does need to be recognized that, with these changes, every THP stat
update overflows the per-cpu counter, resorting to atomic global updates.
And I'd like to see that mentioned in the commit message.
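
(Back-of-the-envelope, assuming x86-64 with 4 KiB pages: HPAGE_PMD_NR =
2 MiB / 4 KiB = 512, while stat_threshold is at most 125, so every
+/-512 delta from mapping or unmapping a THP exceeds the per-cpu
threshold and goes straight to the atomic node counter.)
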
But this change is consistent with 4.7's 8f182270dfec ("mm/swap.c: flush
lru pvecs on compound page arrival"): we accepted greater overhead for
greater accuracy back then, so I think it's okay to do so for THP stats.
Hugh
On Tue, Dec 8, 2020 at 2:51 AM Randy Dunlap <[email protected]> wrote:
>
> On 12/7/20 7:02 AM, Michal Hocko wrote:
> > On Mon 07-12-20 22:52:30, Muchun Song wrote:
> >> On Mon, Dec 7, 2020 at 9:00 PM Michal Hocko <[email protected]> wrote:
> >>>
> >>> On Sun 06-12-20 18:14:39, Muchun Song wrote:
> >>>> Hi,
> >>>>
> >>>> This patch series aims to convert all THP vmstat counters to pages
> >>>> and some KiB vmstat counters to bytes.
> >>>>
> >>>> Some vmstat counters are accounted in pages, some in bytes, some in
> >>>> units of HPAGE_PMD_NR, and some in KiB. When we want to expose these
> >>>> vmstat counters to userspace, we have to remember which unit each
> >>>> counter uses. It makes the code complex. Because there are too many
> >>>> choices, the probability of making a mistake is greater.
> >>>>
> >>>> For example, below are some bug fixes:
> >>>> - 7de2e9f195b9 ("mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg")
> >>>> - not committed (it is the first commit in this series) ("mm: memcontrol: fix NR_ANON_THPS account")
> >>>>
> >>>> This patch series makes the code simpler (161 insertions(+), 187 deletions(-))
> >>>> and makes the unit of every vmstat counter either pages or bytes. Fewer
> >>>> choices mean a lower probability of making mistakes :).
> >>>>
> >>>> This was inspired by Johannes and Roman. Thanks to them.
> >>>
> >>> It would be really great if you could summarize the current state and
> >>> the state after the patch so that exceptions are clear and easier to review. The
> >>
> >> Agree. Will do in the next version. Thanks.
> >>
> >>
> >>> existing situation is rather convoluted but we have at least units part
> >>> of the name so it is not too hard to notice that. Reducing exceptions
> >>> sounds nice but I am not really sure it is such an improvement that it is
> >>> worth a lot of code churn. Especially when it comes to KB vs B. Counting
> >>
> >> There are two vmstat counters (NR_KERNEL_STACK_KB and
> >> NR_KERNEL_SCS_KB) whose units are KB. If we do this, all
> >> vmstat counter units are either pages or bytes in the end. When
> >> we expose those counters to userspace, it becomes easier. You can
> >> refer to:
> >>
> >> [RESEND PATCH v2 11/12] mm: memcontrol: make the slab calculation consistent
> >>
> >> From this point of view, I think that it is worth doing this. Right?
> >
> > Well, unless I am missing something, we have two counters in bytes, two
> > in kB, both clearly distinguishable by the B/KB suffix. Changing KB to B
> > will certainly reduce the different classes of units, no question about
> > that, but I am not really sure this is worth all the code churn. Maybe
> > others will think otherwise.
> >
> > As I've said the THP accounting change makes more sense to me because it
> > allows future changes which are already underway, so there is more
> > merit in those.
> >
>
> Hi,
>
> Are there any documentation changes that go with these patches?
> Or are none needed?
>
> If the patches change the output in /proc/* or /sys/* then I expect
> there would need to be some doc changes.
Oh, we do not change the output. It is transparent to userspace.
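The conversion just moves inside the kernel. For example, for AnonHugePages
in /proc/meminfo (a sketch of the before/after):

  /* before: NR_ANON_THPS counts THPs, multiply at print time */
  show_val_kb(m, "AnonHugePages:  ",
              global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR);

  /* after: NR_ANON_THPS already counts pages */
  show_val_kb(m, "AnonHugePages:  ",
              global_node_page_state(NR_ANON_THPS));

Either way the printed kB value is identical.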
Thanks.
>
> And is there any chance of confusing userspace s/w (binary or scripts)
> with these changes?
>
> thanks.
> --
> ~Randy
>
--
Yours,
Muchun
On Mon, Dec 7, 2020 at 11:02 PM Michal Hocko <[email protected]> wrote:
>
> On Mon 07-12-20 22:52:30, Muchun Song wrote:
> > On Mon, Dec 7, 2020 at 9:00 PM Michal Hocko <[email protected]> wrote:
> > >
> > > On Sun 06-12-20 18:14:39, Muchun Song wrote:
> > > > Hi,
> > > >
> > > > This patch series is aimed to convert all THP vmstat counters to pages
> > > > and some KiB vmstat counters to bytes.
> > > >
> > > > The unit of some vmstat counters are pages, some are bytes, some are
> > > > HPAGE_PMD_NR, and some are KiB. When we want to expose these vmstat
> > > > counters to the userspace, we have to know the unit of the vmstat counters
> > > > is which one. It makes the code complex. Because there are too many choices,
> > > > the probability of making a mistake will be greater.
> > > >
> > > > For example, the below is some bug fix:
> > > > - 7de2e9f195b9 ("mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg")
> > > > - not committed(it is the first commit in this series) ("mm: memcontrol: fix NR_ANON_THPS account")
> > > >
> > > > This patch series can make the code simple (161 insertions(+), 187 deletions(-)).
> > > > And make the unit of the vmstat counters are either pages or bytes. Fewer choices
> > > > means lower probability of making mistakes :).
> > > >
> > > > This was inspired by Johannes and Roman. Thanks to them.
> > >
> > > It would be really great if you could summarize the current state and
> > > the state after the patch so that exceptions are clear and easier to review. The
> >
> > Agree. Will do in the next version. Thanks.
> >
> >
> > > existing situation is rather convoluted but we have at least units part
> > > of the name so it is not too hard to notice that. Reducing exceptions
> > > sounds nice but I am not really sure it is such an improvement that it is
> > > worth a lot of code churn. Especially when it comes to KB vs B. Counting
> >
> > There are two vmstat counters (NR_KERNEL_STACK_KB and
> > NR_KERNEL_SCS_KB) whose units are KB. If we do this, all
> > vmstat counter units are either pages or bytes in the end. When
> > we expose those counters to userspace, it becomes easier. You can
> > refer to:
> >
> > [RESEND PATCH v2 11/12] mm: memcontrol: make the slab calculation consistent
> >
> > From this point of view, I think that it is worth doing this. Right?
>
> Well, unless I am missing something, we have two counters in bytes, two
> in kB, both clearly distinguishable by the B/KB suffix. Changing KB to B
> will certainly reduce the different classes of units, no question about
> that, but I am not really sure this is worth all the code churn. Maybe
> others will think otherwise.
>
> As I've said the THP accounting change makes more sense to me because it
> allows future changes which are already underway, so there is more
> merit in those.
OK, I will drop the KB to B conversion. Thanks.
> --
> Michal Hocko
> SUSE Labs
--
Yours,
Muchun
On Tue, Dec 8, 2020 at 4:33 AM Hugh Dickins <[email protected]> wrote:
>
> On Mon, 7 Dec 2020, Roman Gushchin wrote:
> > On Mon, Dec 07, 2020 at 04:02:54PM +0100, Michal Hocko wrote:
> > >
> > > As I've said the THP accounting change makes more sense to me because it
> > > allows future changes which are already underway, so there is more
> > > merit in those.
> >
> > +1
> > And this part is absolutely trivial.
>
> It does need to be recognized that, with these changes, every THP stats
> update overflows the per-cpu counter, resorting to atomic global updates.
> And I'd like to see that mentioned in the commit message.
Thanks for reminding me. Will add.
>
> But this change is consistent with 4.7's 8f182270dfec ("mm/swap.c: flush
> lru pvecs on compound page arrival"): we accepted greater overhead for
> greater accuracy back then, so I think it's okay to do so for THP stats.
Agree. Thanks.
>
> Hugh
--
Yours,
Muchun
On Tue, Dec 8, 2020 at 3:46 AM Roman Gushchin <[email protected]> wrote:
>
> On Sun, Dec 06, 2020 at 06:14:48PM +0800, Muchun Song wrote:
> > The global and per-node counters are stored in pages, whereas the memcg
> > and lruvec counters are stored in bytes. This scheme looks weird,
> > so convert all vmstat slab counters to bytes.
>
> There is a reason for this weird scheme:
> percpu caches (see struct per_cpu_nodestat) are s8, so counting in bytes
> will lead to overflows. Switching to s32 can increase cache
> thrashing, especially on small machines.
Thanks, Roman. I see now.
>
> >
> > Signed-off-by: Muchun Song <[email protected]>
> > ---
> > include/linux/vmstat.h | 17 ++++++++++-------
> > mm/vmstat.c | 21 ++++++++++-----------
> > 2 files changed, 20 insertions(+), 18 deletions(-)
> >
> > diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> > index 322dcbfcc933..fd1a3d5d4926 100644
> > --- a/include/linux/vmstat.h
> > +++ b/include/linux/vmstat.h
> > @@ -197,18 +197,26 @@ static inline
> > unsigned long global_node_page_state_pages(enum node_stat_item item)
> > {
> > long x = atomic_long_read(&vm_node_stat[item]);
> > +
> > #ifdef CONFIG_SMP
> > if (x < 0)
> > x = 0;
> > #endif
> > + if (vmstat_item_in_bytes(item))
> > + x >>= PAGE_SHIFT;
> > return x;
> > }
> >
> > static inline unsigned long global_node_page_state(enum node_stat_item item)
> > {
> > - VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
> > + long x = atomic_long_read(&vm_node_stat[item]);
> >
> > - return global_node_page_state_pages(item);
> > + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
> > +#ifdef CONFIG_SMP
> > + if (x < 0)
> > + x = 0;
> > +#endif
> > + return x;
> > }
> >
> > static inline unsigned long zone_page_state(struct zone *zone,
> > @@ -312,11 +320,6 @@ static inline void __mod_zone_page_state(struct zone *zone,
> > static inline void __mod_node_page_state(struct pglist_data *pgdat,
> > enum node_stat_item item, int delta)
> > {
> > - if (vmstat_item_in_bytes(item)) {
> > - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
> > - delta >>= PAGE_SHIFT;
> > - }
> > -
> > node_page_state_add(delta, pgdat, item);
> > }
> >
> > diff --git a/mm/vmstat.c b/mm/vmstat.c
> > index 8d77ee426e22..7fb0c7cb9516 100644
> > --- a/mm/vmstat.c
> > +++ b/mm/vmstat.c
> > @@ -345,11 +345,6 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
> > long x;
> > long t;
> >
> > - if (vmstat_item_in_bytes(item)) {
> > - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
> > - delta >>= PAGE_SHIFT;
> > - }
> > -
> > x = delta + __this_cpu_read(*p);
> >
> > t = __this_cpu_read(pcp->stat_threshold);
> > @@ -554,11 +549,6 @@ static inline void mod_node_state(struct pglist_data *pgdat,
> > s8 __percpu *p = pcp->vm_node_stat_diff + item;
> > long o, n, t, z;
> >
> > - if (vmstat_item_in_bytes(item)) {
> > - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
> > - delta >>= PAGE_SHIFT;
> > - }
> > -
> > do {
> > z = 0; /* overflow to node counters */
> >
> > @@ -1012,19 +1002,28 @@ unsigned long node_page_state_pages(struct pglist_data *pgdat,
> > enum node_stat_item item)
> > {
> > long x = atomic_long_read(&pgdat->vm_stat[item]);
> > +
> > #ifdef CONFIG_SMP
> > if (x < 0)
> > x = 0;
> > #endif
> > + if (vmstat_item_in_bytes(item))
> > + x >>= PAGE_SHIFT;
> > return x;
> > }
> >
> > unsigned long node_page_state(struct pglist_data *pgdat,
> > enum node_stat_item item)
> > {
> > + long x = atomic_long_read(&pgdat->vm_stat[item]);
> > +
> > VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
> >
> > - return node_page_state_pages(pgdat, item);
> > +#ifdef CONFIG_SMP
> > + if (x < 0)
> > + x = 0;
> > +#endif
> > + return x;
> > }
> > #endif
> >
> > --
> > 2.11.0
> >
--
Yours,
Muchun
On Sun, Dec 06, 2020 at 06:14:41PM +0800, Muchun Song wrote:
> @@ -1144,7 +1144,8 @@ void do_page_add_anon_rmap(struct page *page,
> * disabled.
> */
> if (compound)
> - __inc_lruvec_page_state(page, NR_ANON_THPS);
> + __mod_lruvec_page_state(page, NR_ANON_THPS,
> + HPAGE_PMD_NR);
> __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
What I mistakenly wrote about the previous patch applies to this and
the following patches, though:
/proc/vmstat currently prints the number of anon, file and shmem THPs; you
are changing it to print the number of 4k pages in those THPs.
That's an ABI change we cannot really do.
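Concretely (illustrative numbers): a machine holding two anon THPs reports

  nr_anon_transparent_hugepages 2

today, but would report 1024 with this patch, silently breaking any script
that does arithmetic on that field.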
On Fri, Dec 11, 2020 at 12:12 AM Johannes Weiner <[email protected]> wrote:
>
> On Sun, Dec 06, 2020 at 06:14:41PM +0800, Muchun Song wrote:
> > @@ -1144,7 +1144,8 @@ void do_page_add_anon_rmap(struct page *page,
> > * disabled.
> > */
> > if (compound)
> > - __inc_lruvec_page_state(page, NR_ANON_THPS);
> > + __mod_lruvec_page_state(page, NR_ANON_THPS,
> > + HPAGE_PMD_NR);
> > __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
>
> What I mistakenly wrote about the previous patch applies to this and
> the following patches, though:
>
> /proc/vmstat currently prints the number of anon, file and shmem THPs; you
> are changing it to print the number of 4k pages in those THPs.
>
> That's an ABI change we cannot really do.
Agree. Sorry, I forgot to fix the /proc/vmstat output. Thanks for the
reminder.
--
Yours,
Muchun