2023-06-21 18:23:54

by Yuanchu Xie

[permalink] [raw]
Subject: [RFC PATCH v2 1/6] mm: aggregate working set information into histograms

Hierarchically aggregate all memcgs' MGLRU generations and their
page counts into working set histograms.
The histograms break down the system's working set per-node,
per-anon/file.

Signed-off-by: T.J. Alumbaugh <[email protected]>
Signed-off-by: Yuanchu Xie <[email protected]>
---
drivers/base/node.c | 3 +
include/linux/mmzone.h | 4 +
include/linux/wsr.h | 73 +++++++++++
mm/Kconfig | 7 +
mm/Makefile | 1 +
mm/internal.h | 1 +
mm/mmzone.c | 3 +
mm/vmscan.c | 3 +
mm/wsr.c | 288 +++++++++++++++++++++++++++++++++++++++++
9 files changed, 383 insertions(+)
create mode 100644 include/linux/wsr.h
create mode 100644 mm/wsr.c

diff --git a/drivers/base/node.c b/drivers/base/node.c
index faf3597a96da9..e326debe22d8f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -21,6 +21,7 @@
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
+#include <linux/wsr.h>

static struct bus_type node_subsys = {
.name = "node",
@@ -616,6 +617,7 @@ static int register_node(struct node *node, int num)
} else {
hugetlb_register_node(node);
compaction_register_node(node);
+ wsr_register_node(node);
}

return error;
@@ -632,6 +634,7 @@ void unregister_node(struct node *node)
{
hugetlb_unregister_node(node);
compaction_unregister_node(node);
+ wsr_unregister_node(node);
node_remove_accesses(node);
node_remove_caches(node);
device_unregister(&node->dev);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd28a100d9e4f..96f0d8f3584e4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -21,6 +21,7 @@
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
+#include <linux/wsr.h>
#include <asm/page.h>

/* Free memory management - zoned buddy allocator. */
@@ -527,7 +528,10 @@ struct lruvec {
struct lru_gen_struct lrugen;
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
+#ifdef CONFIG_WSR
+ struct wsr __wsr;
#endif
+#endif /* CONFIG_LRU_GEN */
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
diff --git a/include/linux/wsr.h b/include/linux/wsr.h
new file mode 100644
index 0000000000000..fa46b4d61177d
--- /dev/null
+++ b/include/linux/wsr.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_WSR_H
+#define _LINUX_WSR_H
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+
+struct node;
+struct lruvec;
+struct mem_cgroup;
+struct pglist_data;
+struct scan_control;
+struct lru_gen_mm_walk;
+
+#ifdef CONFIG_WSR
+#define ANON_AND_FILE 2
+
+#define MIN_NR_BINS 4
+#define MAX_NR_BINS 16
+
+struct ws_bin {
+ unsigned long idle_age;
+ unsigned long nr_pages[ANON_AND_FILE];
+};
+
+struct wsr {
+ /* protects bins */
+ struct mutex bins_lock;
+ struct ws_bin bins[MAX_NR_BINS];
+};
+
+void wsr_register_node(struct node *node);
+void wsr_unregister_node(struct node *node);
+
+void wsr_init(struct lruvec *lruvec);
+void wsr_destroy(struct lruvec *lruvec);
+struct wsr *lruvec_wsr(struct lruvec *lruvec);
+
+ssize_t wsr_intervals_ms_parse(char *src, struct ws_bin *bins);
+
+/*
+ * wsr->bins needs to be locked
+ */
+void wsr_refresh(struct wsr *wsr, struct mem_cgroup *root,
+ struct pglist_data *pgdat);
+#else
+struct ws_bin;
+struct wsr;
+
+static inline void wsr_register_node(struct node *node)
+{
+}
+static inline void wsr_unregister_node(struct node *node)
+{
+}
+static inline void wsr_init(struct lruvec *lruvec)
+{
+}
+static inline void wsr_destroy(struct lruvec *lruvec)
+{
+}
+/* lruvec_wsr is intentially omitted */
+static inline ssize_t wsr_intervals_ms_parse(char *src, struct ws_bin *bins)
+{
+ return -EINVAL;
+}
+static inline void wsr_refresh(struct wsr *wsr, struct mem_cgroup *root,
+ struct pglist_data *pgdat)
+{
+}
+#endif /* CONFIG_WSR */
+
+#endif /* _LINUX_WSR_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209dec055..8a84c1402159a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1183,6 +1183,13 @@ config LRU_GEN_STATS
This option has a per-memcg and per-node memory overhead.
# }

+config WSR
+ bool "Working set reporting"
+ depends on LRU_GEN
+ help
+ This option enables working set reporting, separate backends
+ WIP. Currently only supports MGLRU.
+
source "mm/damon/Kconfig"

endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e293..12e2da5ba2d04 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_WSR) += wsr.o
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8b032de..88dba0b11f663 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -180,6 +180,7 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
/*
* in mm/vmscan.c:
*/
+struct scan_control;
int isolate_lru_page(struct page *page);
int folio_isolate_lru(struct folio *folio);
void putback_lru_page(struct page *page);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 68e1511be12de..22a8282f67150 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -8,6 +8,7 @@

#include <linux/stddef.h>
#include <linux/mm.h>
+#include <linux/wsr.h>
#include <linux/mmzone.h>

struct pglist_data *first_online_pgdat(void)
@@ -89,6 +90,8 @@ void lruvec_init(struct lruvec *lruvec)
*/
list_del(&lruvec->lists[LRU_UNEVICTABLE]);

+ wsr_init(lruvec);
+
lru_gen_init_lruvec(lruvec);
}

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5b7b8d4f5297f..150e3cd70c65e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,6 +55,7 @@
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/khugepaged.h>
+#include <linux/wsr.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -5890,6 +5891,8 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");

+ wsr_register_node(NULL);
+
debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);

diff --git a/mm/wsr.c b/mm/wsr.c
new file mode 100644
index 0000000000000..1e4c0ce69caf7
--- /dev/null
+++ b/mm/wsr.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+#include <linux/wsr.h>
+
+#include <linux/node.h>
+#include <linux/mmzone.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+
+#include "internal.h"
+
+/* For now just embed wsr in the lruvec.
+ * Consider only allocating struct wsr when it's used
+ * since sizeof(struct wsr) is ~864 bytes.
+ */
+struct wsr *lruvec_wsr(struct lruvec *lruvec)
+{
+ return &lruvec->__wsr;
+}
+
+void wsr_init(struct lruvec *lruvec)
+{
+ struct wsr *wsr = lruvec_wsr(lruvec);
+
+ mutex_init(&wsr->bins_lock);
+ wsr->bins[0].idle_age = -1;
+}
+
+void wsr_destroy(struct lruvec *lruvec)
+{
+ struct wsr *wsr = lruvec_wsr(lruvec);
+
+ mutex_destroy(&wsr->bins_lock);
+ memset(wsr, 0, sizeof(*wsr));
+}
+
+ssize_t wsr_intervals_ms_parse(char *src, struct ws_bin *bins)
+{
+ int err, i = 0;
+ char *cur, *next = strim(src);
+
+ while ((cur = strsep(&next, ","))) {
+ unsigned int msecs;
+
+ err = kstrtouint(cur, 0, &msecs);
+ if (err)
+ return err;
+
+ bins[i].idle_age = msecs_to_jiffies(msecs);
+ if (i > 0 && bins[i].idle_age <= bins[i - 1].idle_age)
+ return -EINVAL;
+
+ if (++i == MAX_NR_BINS)
+ return -ERANGE;
+ }
+
+ if (i && i < MIN_NR_BINS - 1)
+ return -ERANGE;
+
+ bins[i].idle_age = -1;
+ return 0;
+}
+
+static void collect_wsr(struct wsr *wsr, const struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ const struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ unsigned long curr_timestamp = jiffies;
+ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq);
+ unsigned long min_seq[ANON_AND_FILE] = {
+ READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_ANON]),
+ READ_ONCE(lruvec->lrugen.min_seq[LRU_GEN_FILE]),
+ };
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+ // TODO update bins hierarchically
+ struct ws_bin *bin = wsr->bins;
+
+ lockdep_assert_held(&wsr->bins_lock);
+ for (seq = max_seq; seq + 1 > min_seq[type]; seq--) {
+ unsigned long birth, gen_start = curr_timestamp, error, size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(
+ READ_ONCE(lrugen->nr_pages[gen][type]
+ [zone]),
+ 0L);
+
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+ if (seq != max_seq) {
+ int next_gen = lru_gen_from_seq(seq + 1);
+
+ gen_start = READ_ONCE(
+ lruvec->lrugen.timestamps[next_gen]);
+ }
+
+ error = size;
+ /* gen exceeds the idle_age of bin */
+ while (bin->idle_age != -1 &&
+ time_before(birth + bin->idle_age,
+ curr_timestamp)) {
+ unsigned long proportion =
+ gen_start -
+ (curr_timestamp - bin->idle_age);
+ unsigned long gen_len = gen_start - birth;
+
+ if (!gen_len)
+ break;
+ if (proportion) {
+ unsigned long split_bin =
+ size / gen_len *
+ proportion;
+ bin->nr_pages[type] += split_bin;
+ error -= split_bin;
+ }
+ gen_start = curr_timestamp - bin->idle_age;
+ bin++;
+
+ }
+ bin->nr_pages[type] += error;
+ }
+ }
+}
+
+static void refresh_wsr(struct wsr *wsr, struct mem_cgroup *root,
+ struct pglist_data *pgdat)
+{
+ struct ws_bin *bin;
+ struct mem_cgroup *memcg;
+
+ lockdep_assert_held(&wsr->bins_lock);
+ VM_WARN_ON_ONCE(wsr->bins->idle_age == -1);
+
+ for (bin = wsr->bins; bin->idle_age != -1; bin++) {
+ bin->nr_pages[0] = 0;
+ bin->nr_pages[1] = 0;
+ }
+ // the last used bin has idle_age == -1.
+ bin->nr_pages[0] = 0;
+ bin->nr_pages[1] = 0;
+
+ memcg = mem_cgroup_iter(root, NULL, NULL);
+ do {
+ struct lruvec *lruvec =
+ mem_cgroup_lruvec(memcg, pgdat);
+
+ collect_wsr(wsr, lruvec);
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+}
+static struct pglist_data *kobj_to_pgdat(struct kobject *kobj)
+{
+ int nid = IS_ENABLED(CONFIG_NUMA) ? kobj_to_dev(kobj)->id :
+ first_memory_node;
+
+ return NODE_DATA(nid);
+}
+
+static struct wsr *kobj_to_wsr(struct kobject *kobj)
+{
+ return lruvec_wsr(mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj)));
+}
+
+static ssize_t intervals_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct ws_bin *bin;
+ int len = 0;
+ struct wsr *wsr = kobj_to_wsr(kobj);
+
+ mutex_lock(&wsr->bins_lock);
+
+ for (bin = wsr->bins; bin->idle_age != -1; bin++)
+ len += sysfs_emit_at(buf, len, "%u,", jiffies_to_msecs(bin->idle_age));
+
+ len += sysfs_emit_at(buf, len, "%lld\n", LLONG_MAX);
+
+ mutex_unlock(&wsr->bins_lock);
+
+ return len;
+}
+
+static ssize_t intervals_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *src, size_t len)
+{
+ char *buf;
+ struct ws_bin *bins;
+ int err = 0;
+ struct wsr *wsr = kobj_to_wsr(kobj);
+
+ bins = kzalloc(sizeof(wsr->bins), GFP_KERNEL);
+ if (!bins)
+ return -ENOMEM;
+
+ buf = kstrdup(src, GFP_KERNEL);
+ if (!buf) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ err = wsr_intervals_ms_parse(buf, bins);
+ if (err)
+ goto failed;
+
+ mutex_lock(&wsr->bins_lock);
+ memcpy(wsr->bins, bins, sizeof(wsr->bins));
+ mutex_unlock(&wsr->bins_lock);
+failed:
+ kfree(buf);
+ kfree(bins);
+
+ return err ?: len;
+}
+
+static struct kobj_attribute intervals_ms_attr = __ATTR_RW(intervals_ms);
+
+static ssize_t histogram_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct ws_bin *bin;
+ int len = 0;
+ struct wsr *wsr = kobj_to_wsr(kobj);
+
+ mutex_lock(&wsr->bins_lock);
+
+ refresh_wsr(wsr, NULL, kobj_to_pgdat(kobj));
+
+ for (bin = wsr->bins; bin->idle_age != -1; bin++)
+ len += sysfs_emit_at(buf, len, "%u anon=%lu file=%lu\n",
+ jiffies_to_msecs(bin->idle_age), bin->nr_pages[0],
+ bin->nr_pages[1]);
+
+ len += sysfs_emit_at(buf, len, "%lld anon=%lu file=%lu\n", LLONG_MAX,
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ mutex_unlock(&wsr->bins_lock);
+
+ return len;
+}
+
+static struct kobj_attribute histogram_attr = __ATTR_RO(histogram);
+
+static struct attribute *wsr_attrs[] = {
+ &intervals_ms_attr.attr,
+ &histogram_attr.attr,
+ NULL
+};
+
+static const struct attribute_group wsr_attr_group = {
+ .name = "wsr",
+ .attrs = wsr_attrs,
+};
+
+void wsr_register_node(struct node *node)
+{
+ struct kobject *kobj = node ? &node->dev.kobj : mm_kobj;
+ struct wsr *wsr;
+
+ if (IS_ENABLED(CONFIG_NUMA) && !node)
+ return;
+
+ wsr = kobj_to_wsr(kobj);
+
+ /* wsr should be initialized when pgdat was initialized
+ * or when the root memcg was initialized
+ */
+ if (sysfs_create_group(kobj, &wsr_attr_group)) {
+ pr_warn("WSR failed to created group");
+ return;
+ }
+}
+
+void wsr_unregister_node(struct node *node)
+{
+ struct kobject *kobj = &node->dev.kobj;
+ struct wsr *wsr;
+
+ if (IS_ENABLED(CONFIG_NUMA) && !node)
+ return;
+
+ wsr = kobj_to_wsr(kobj);
+ sysfs_remove_group(kobj, &wsr_attr_group);
+ wsr_destroy(mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj)));
+}
--
2.41.0.162.gfafddb0af9-goog