2023-05-09 19:09:52

by Yuanchu Xie

Subject: [RFC PATCH 1/2] mm: multigen-LRU: working set reporting

From: talumbau <[email protected]>

A single patch to be broken up into multiple patches.
- Add working set reporting structure.
- Add per-node and per-memcg interfaces for working set reporting.
- Implement working set backend for MGLRU.
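
An illustrative userspace consumer of the per-memcg interface (a
sketch only: the cgroup path "/sys/fs/cgroup/test", the node id, and
the interval values are hypothetical, and error handling is omitted):

  #include <fcntl.h>
  #include <poll.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          struct pollfd pfd;
          char buf[4096];
          ssize_t len;
          int fd;

          /* three boundaries give four bins: <1s, 1-2s, 2-4s, >=4s idle */
          fd = open("/sys/fs/cgroup/test/memory.wss.intervals_ms", O_WRONLY);
          write(fd, "N0=1000,2000,4000", strlen("N0=1000,2000,4000"));
          close(fd);

          /* report on node 0 at most once every 5 seconds */
          fd = open("/sys/fs/cgroup/test/memory.wss.report_ms", O_WRONLY);
          write(fd, "N0=5000", strlen("N0=5000"));
          close(fd);

          pfd.fd = open("/sys/fs/cgroup/test/memory.wss.histogram", O_RDONLY);
          pfd.events = POLLPRI;

          /* reports wake pollers with EPOLLPRI; re-read after each one */
          while (poll(&pfd, 1, -1) > 0) {
                  lseek(pfd.fd, 0, SEEK_SET);
                  len = read(pfd.fd, buf, sizeof(buf) - 1);
                  if (len <= 0)
                          break;
                  buf[len] = '\0';
                  fputs(buf, stdout);
          }
          return 0;
  }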

Signed-off-by: T.J. Alumbaugh <[email protected]>
Signed-off-by: Yuanchu Xie <[email protected]>
---
 drivers/base/node.c        |   2 +
 include/linux/memcontrol.h |   6 +
 include/linux/mmzone.h     |  14 +-
 include/linux/wss.h        |  57 +++++
 mm/Kconfig                 |   7 +
 mm/Makefile                |   1 +
 mm/memcontrol.c            | 349 ++++++++++++++++++++++++++-
 mm/mmzone.c                |   2 +
 mm/vmscan.c                | 479 ++++++++++++++++++++++++++++++++++++-
 mm/wss.c                   |  56 +++++
10 files changed, 970 insertions(+), 3 deletions(-)
create mode 100644 include/linux/wss.h
create mode 100644 mm/wss.c

diff --git a/drivers/base/node.c b/drivers/base/node.c
index faf3597a96da..047908978088 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -616,6 +616,7 @@ static int register_node(struct node *node, int num)
} else {
hugetlb_register_node(node);
compaction_register_node(node);
+ wss_register_node(node);
}

return error;
@@ -632,6 +633,7 @@ void unregister_node(struct node *node)
{
hugetlb_unregister_node(node);
compaction_unregister_node(node);
+ wss_unregister_node(node);
node_remove_accesses(node);
node_remove_caches(node);
device_unregister(&node->dev);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 85dc9b88ea37..95d4a0bc89e7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -10,6 +10,7 @@

#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
+#include <linux/wait.h>
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
@@ -325,6 +326,11 @@ struct mem_cgroup {
struct lru_gen_mm_list mm_list;
#endif

+#ifdef CONFIG_WSS
+ int wss_event;
+ wait_queue_head_t wss_wait_queue;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[];
};

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd28a100d9e4..506c29aaf124 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -21,6 +21,7 @@
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
+#include <linux/wss.h>
#include <asm/page.h>

/* Free memory management - zoned buddy allocator. */
@@ -361,6 +362,7 @@ enum lruvec_flags {

#ifndef __GENERATING_BOUNDS_H

+struct node;
struct lruvec;
struct page_vma_mapped_walk;

@@ -481,7 +483,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
-#endif
+#endif /* CONFIG_MEMCG */

#else /* !CONFIG_LRU_GEN */

@@ -503,6 +505,14 @@ static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
}
#endif

+static inline void wss_register_node(struct node *node)
+{
+}
+
+static inline void wss_unregister_node(struct node *node)
+{
+}
+
#endif /* CONFIG_LRU_GEN */

struct lruvec {
@@ -527,6 +537,8 @@ struct lruvec {
struct lru_gen_struct lrugen;
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
+ /* only accessed through lruvec_wss */
+ struct wss __wss;
#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
diff --git a/include/linux/wss.h b/include/linux/wss.h
new file mode 100644
index 000000000000..942efce0f9c2
--- /dev/null
+++ b/include/linux/wss.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_WSS_H
+#define _LINUX_WSS_H
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+
+struct node;
+struct lruvec;
+struct mem_cgroup;
+struct pglist_data;
+struct scan_control;
+struct lru_gen_mm_walk;
+
+#define ANON_AND_FILE 2
+
+#define MIN_NR_BINS 4
+#define MAX_NR_BINS 16
+
+struct wss_bin {
+ /* upper idle-age boundary of the bin, in jiffies; -1 terminates */
+ unsigned long idle_age;
+ /* pages in the bin, indexed by anon (0) and file (1) */
+ unsigned long nr_pages[ANON_AND_FILE];
+};
+
+struct wss {
+ /* protects bins */
+ struct mutex bins_lock;
+ /* protects reaccess_bins */
+ struct mutex reaccess_bins_lock;
+ /* sysfs "histogram" file to notify when a report fires */
+ struct kernfs_node *notifier;
+ /* when the bins were last refreshed, in jiffies */
+ unsigned long timestamp;
+ /* minimum interval between report notifications, in jiffies */
+ unsigned long report_threshold;
+ /* refresh bins on read once they are staler than this, in jiffies */
+ unsigned long refresh_threshold;
+ struct wss_bin bins[MAX_NR_BINS];
+ struct wss_bin reaccess_bins[MAX_NR_BINS];
+};
+
+void wss_register_node(struct node *node);
+void wss_unregister_node(struct node *node);
+
+void wss_init(struct wss *wss);
+void wss_destroy(struct wss *wss);
+struct wss *lruvec_wss(struct lruvec *lruvec);
+
+ssize_t wss_intervals_ms_parse(char *src, struct wss_bin *bins);
+
+/*
+ * Refresh the bins if they are staler than the refresh threshold.
+ * The caller must hold wss->bins_lock.
+ */
+void wss_refresh(struct wss *wss, struct mem_cgroup *root,
+ struct pglist_data *pgdat);
+void report_reaccess(struct lruvec *lruvec, struct lru_gen_mm_walk *walk);
+void report_wss(struct pglist_data *pgdat, struct scan_control *sc);
+
+#endif /* _LINUX_WSS_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209dec05..b3a32c2b360f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1183,6 +1183,13 @@ config LRU_GEN_STATS
This option has a per-memcg and per-node memory overhead.
# }

+config WSS
+ bool "Working set reporting"
+ depends on LRU_GEN
+ help
+ This option enables working set reporting. Support for separate
+ backends is a work in progress; currently only MGLRU is supported.
+
source "mm/damon/Kconfig"

endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..409b4fc97485 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_WSS) += wss.o
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2eee092f8f11..08e574c86b18 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,6 +25,7 @@
* Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/

+#include <linux/wait.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
@@ -65,6 +66,7 @@
#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
+#include <linux/wss.h>
#include <net/ip.h>
#include "slab.h"
#include "swap.h"
@@ -5233,6 +5235,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return;

+ wss_destroy(lruvec_wss(&pn->lruvec));
free_percpu(pn->lruvec_stats_percpu);
kfree(pn);
}
@@ -5311,6 +5314,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
memcg->deferred_split_queue.split_queue_len = 0;
+#endif
+#ifdef CONFIG_WSS
+ memcg->wss_event = 0;
+ init_waitqueue_head(&memcg->wss_wait_queue);
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
lru_gen_init_memcg(memcg);
@@ -5411,6 +5418,9 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock_irq(&memcg->event_list_lock);

+ wake_up_pollfree(&memcg->wss_wait_queue);
+ synchronize_rcu();
+
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);

@@ -6642,6 +6652,306 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
return nbytes;
}

+#ifdef CONFIG_WSS
+static int memory_wss_intervals_ms_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wss *wss;
+ struct wss_bin *bin;
+
+ wss = lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wss->bins_lock);
+ seq_printf(m, "N%d=", nid);
+ for (bin = wss->bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u,", jiffies_to_msecs(bin->idle_age));
+ mutex_unlock(&wss->bins_lock);
+
+ seq_printf(m, "%lld ", LLONG_MAX);
+ }
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t memory_wss_intervals_ms_parse(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ unsigned int *nid_out,
+ struct wss_bin *bins)
+{
+ char *node, *intervals;
+ unsigned int nid;
+ int err;
+
+ buf = strstrip(buf);
+ intervals = buf;
+ node = strsep(&intervals, "=");
+
+ if (*node != 'N')
+ return -EINVAL;
+
+ err = kstrtouint(node + 1, 0, &nid);
+ if (err)
+ return err;
+
+ if (nid >= nr_node_ids || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ err = wss_intervals_ms_parse(intervals, bins);
+ if (err)
+ return err;
+
+ *nid_out = nid;
+ return 0;
+}
+
+static ssize_t memory_wss_intervals_ms_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ unsigned int nid;
+ int err;
+ struct wss *wss;
+ struct wss_bin *bins;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ bins = kzalloc(sizeof(wss->bins), GFP_KERNEL);
+ if (!bins)
+ return -ENOMEM;
+
+ err = memory_wss_intervals_ms_parse(of, buf, nbytes, &nid, bins);
+ if (err)
+ goto failed;
+
+ wss = lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wss->bins_lock);
+ memcpy(wss->bins, bins, sizeof(wss->bins));
+ mutex_unlock(&wss->bins_lock);
+failed:
+ kfree(bins);
+ return err ?: nbytes;
+}
+
+static int memory_reaccess_intervals_ms_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wss *wss;
+ struct wss_bin *bin;
+
+ wss = lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wss->reaccess_bins_lock);
+ seq_printf(m, "N%d=", nid);
+ for (bin = wss->reaccess_bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u,", jiffies_to_msecs(bin->idle_age));
+ mutex_unlock(&wss->reaccess_bins_lock);
+
+ seq_printf(m, "%lld ", LLONG_MAX);
+ }
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t memory_reaccess_intervals_ms_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ unsigned int nid;
+ int err;
+ struct wss *wss;
+ struct wss_bin *bins;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ bins = kzalloc(sizeof(wss->reaccess_bins), GFP_KERNEL);
+ if (!bins)
+ return -ENOMEM;
+
+ err = memory_wss_intervals_ms_parse(of, buf, nbytes, &nid, bins);
+ if (err)
+ goto failed;
+
+ wss = lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ mutex_lock(&wss->reaccess_bins_lock);
+ memcpy(wss->reaccess_bins, bins, sizeof(wss->reaccess_bins));
+ mutex_unlock(&wss->reaccess_bins_lock);
+failed:
+ kfree(bins);
+ return err ?: nbytes;
+}
+
+static int memory_wss_refresh_ms_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wss *wss =
+ lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+
+ seq_printf(m, "N%d=%u ", nid,
+ jiffies_to_msecs(READ_ONCE(wss->refresh_threshold)));
+ }
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t memory_wss_threshold_parse(char *buf, size_t nbytes,
+ unsigned int *nid_out,
+ unsigned int *msecs)
+{
+ char *node, *threshold;
+ unsigned int nid;
+ int err;
+
+ buf = strstrip(buf);
+ threshold = buf;
+ node = strsep(&threshold, "=");
+
+ if (*node != 'N')
+ return -EINVAL;
+
+ err = kstrtouint(node + 1, 0, &nid);
+ if (err)
+ return err;
+
+ if (nid >= nr_node_ids || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ err = kstrtouint(threshold, 0, msecs);
+ if (err)
+ return err;
+
+ *nid_out = nid;
+
+ return nbytes;
+}
+
+static ssize_t memory_wss_refresh_ms_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ unsigned int nid, msecs;
+ struct wss *wss;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ ssize_t ret = memory_wss_threshold_parse(buf, nbytes, &nid, &msecs);
+
+ if (ret < 0)
+ return ret;
+
+ wss = lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ WRITE_ONCE(wss->refresh_threshold, msecs_to_jiffies(msecs));
+ return ret;
+}
+
+static int memory_wss_report_ms_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wss *wss =
+ lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+
+ seq_printf(m, "N%d=%u ", nid,
+ jiffies_to_msecs(READ_ONCE(wss->report_threshold)));
+ }
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static ssize_t memory_wss_report_ms_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ unsigned int nid, msecs;
+ struct wss *wss;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ ssize_t ret = memory_wss_threshold_parse(buf, nbytes, &nid, &msecs);
+
+ if (ret < 0)
+ return ret;
+
+ wss = lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ WRITE_ONCE(wss->report_threshold, msecs_to_jiffies(msecs));
+ return ret;
+}
+
+static int memory_wss_histogram_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wss *wss =
+ lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ struct wss_bin *bin;
+
+ seq_printf(m, "N%d\n", nid);
+
+ mutex_lock(&wss->bins_lock);
+ wss_refresh(wss, memcg, NODE_DATA(nid));
+ for (bin = wss->bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u anon=%lu file=%lu\n",
+ jiffies_to_msecs(bin->idle_age),
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ seq_printf(m, "%lld anon=%lu file=%lu\n", LLONG_MAX,
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ mutex_unlock(&wss->bins_lock);
+ }
+
+ return 0;
+}
+
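+/*
+ * report_wss() sets wss_event and wakes the queue; consume the event
+ * with cmpxchg so each report wakes pollers with EPOLLPRI exactly once.
+ */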
+__poll_t memory_wss_histogram_poll(struct kernfs_open_file *of,
+ struct poll_table_struct *pt)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ if (memcg->css.flags & CSS_DYING)
+ return DEFAULT_POLLMASK;
+
+ poll_wait(of->file, &memcg->wss_wait_queue, pt);
+ if (cmpxchg(&memcg->wss_event, 1, 0) == 1)
+ return DEFAULT_POLLMASK | EPOLLPRI;
+ return DEFAULT_POLLMASK;
+}
+
+static int memory_reaccess_histogram_show(struct seq_file *m, void *v)
+{
+ int nid;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct wss *wss =
+ lruvec_wss(mem_cgroup_lruvec(memcg, NODE_DATA(nid)));
+ struct wss_bin *bin;
+
+ seq_printf(m, "N%d\n", nid);
+
+ mutex_lock(&wss->reaccess_bins_lock);
+ wss_refresh(wss, memcg, NODE_DATA(nid));
+ for (bin = wss->reaccess_bins; bin->idle_age != -1; bin++)
+ seq_printf(m, "%u anon=%lu file=%lu\n",
+ jiffies_to_msecs(bin->idle_age),
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ seq_printf(m, "%lld anon=%lu file=%lu\n", LLONG_MAX,
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ mutex_unlock(&wss->reaccess_bins_lock);
+ }
+
+ return 0;
+}
+#endif
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -6710,7 +7020,44 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NS_DELEGATABLE,
.write = memory_reclaim,
},
- { } /* terminate */
+#ifdef CONFIG_WSS
+ {
+ .name = "wss.intervals_ms",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_wss_intervals_ms_show,
+ .write = memory_wss_intervals_ms_write,
+ },
+ {
+ .name = "wss.refresh_ms",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_wss_refresh_ms_show,
+ .write = memory_wss_refresh_ms_write,
+ },
+ {
+ .name = "wss.report_ms",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_wss_report_ms_show,
+ .write = memory_wss_report_ms_write,
+ },
+ {
+ .name = "wss.histogram",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_wss_histogram_show,
+ .poll = memory_wss_histogram_poll,
+ },
+ {
+ .name = "reaccess.intervals_ms",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_reaccess_intervals_ms_show,
+ .write = memory_reaccess_intervals_ms_write,
+ },
+ {
+ .name = "reaccess.histogram",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_reaccess_histogram_show,
+ },
+#endif
+ {} /* terminate */
};

struct cgroup_subsys memory_cgrp_subsys = {
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 68e1511be12d..6e70c44897cc 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -89,6 +89,8 @@ void lruvec_init(struct lruvec *lruvec)
*/
list_del(&lruvec->lists[LRU_UNEVICTABLE]);

+ wss_init(&lruvec->__wss);
+
lru_gen_init_lruvec(lruvec);
}

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5b7b8d4f5297..b3adf924691c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -10,6 +10,7 @@
* Multiqueue VM started 5.8.00, Rik van Riel.
*/

+#include <linux/jiffies.h>
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
@@ -55,6 +56,7 @@
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/khugepaged.h>
+#include <linux/wss.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -4225,6 +4227,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
mem_cgroup_unlock_pages();

if (walk->batched) {
+ report_reaccess(lruvec, walk);
spin_lock_irq(&lruvec->lru_lock);
reset_batch_size(lruvec, walk);
spin_unlock_irq(&lruvec->lru_lock);
@@ -4465,6 +4468,470 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
return true;
}

+/******************************************************************************
+ * working set monitoring
+ ******************************************************************************/
+
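+/*
+ * Distribute the pages of each generation into the idle-age bins.
+ * Pages are assumed to be spread evenly over a generation's lifetime
+ * [birth, gen_start), so a generation that straddles a bin boundary
+ * is split linearly between the bins it overlaps.
+ */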
+static void collect_wss(struct wss *wss, const struct lruvec *lruvec,
+ bool can_swap)
+{
+ int gen, type, zone;
+ const struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ unsigned long curr_timestamp = jiffies;
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+ /* TODO: update bins hierarchically */
+ struct wss_bin *bin = wss->bins;
+
+ lockdep_assert_held(&wss->bins_lock);
+ for (seq = max_seq; seq + 1 > min_seq[type]; seq--) {
+ unsigned long birth, gen_start = curr_timestamp, error, size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(
+ READ_ONCE(lrugen->nr_pages[gen][type]
+ [zone]),
+ 0L);
+
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+ if (seq != max_seq) {
+ int next_gen = lru_gen_from_seq(seq + 1);
+
+ gen_start = READ_ONCE(
+ lruvec->lrugen.timestamps[next_gen]);
+ }
+
+ error = size;
+ /* gen exceeds the idle_age of bin */
+ while (bin->idle_age != -1 &&
+ time_before(birth + bin->idle_age,
+ curr_timestamp)) {
+ unsigned long proportion =
+ gen_start -
+ (curr_timestamp - bin->idle_age);
+ unsigned long gen_len = gen_start - birth;
+
+ if (!gen_len)
+ break;
+ if (proportion) {
+ unsigned long split_bin =
+ size / gen_len *
+ proportion;
+ bin->nr_pages[type] += split_bin;
+ error -= split_bin;
+ }
+ gen_start = curr_timestamp - bin->idle_age;
+ bin++;
+ }
+ bin->nr_pages[type] += error;
+ }
+ }
+}
+
+static void refresh_wss(struct wss *wss, struct mem_cgroup *root,
+ struct pglist_data *pgdat, struct scan_control *sc,
+ unsigned long refresh_threshold)
+{
+ struct wss_bin *bin;
+ struct mem_cgroup *memcg;
+
+ lockdep_assert_held(&wss->bins_lock);
+ VM_WARN_ON_ONCE(wss->bins->idle_age == -1);
+
+ for (bin = wss->bins; bin->idle_age != -1; bin++) {
+ bin->nr_pages[0] = 0;
+ bin->nr_pages[1] = 0;
+ }
+ /* also clear the catch-all bin, which has idle_age == -1 */
+ bin->nr_pages[0] = 0;
+ bin->nr_pages[1] = 0;
+
+ memcg = mem_cgroup_iter(root, NULL, NULL);
+ do {
+ struct lruvec *lruvec =
+ mem_cgroup_lruvec(memcg, pgdat);
+ bool can_swap = get_swappiness(lruvec, sc);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ mem_cgroup_calculate_protection(root, memcg);
+ if (!mem_cgroup_below_min(root, memcg) && refresh_threshold &&
+ min_seq[!can_swap] + MAX_NR_GENS - 1 > max_seq) {
+ int gen = lru_gen_from_seq(max_seq);
+ unsigned long birth =
+ READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_before_jiffies(birth + refresh_threshold))
+ try_to_inc_max_seq(lruvec, max_seq, sc,
+ can_swap, false);
+ }
+
+ collect_wss(wss, lruvec, can_swap);
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+}
+
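+/*
+ * Refresh the histogram and notify listeners, gated by report_threshold
+ * and rate-limited to three reports per second. Called from the reclaim
+ * paths, so only a trylock on the bins is attempted.
+ */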
+void report_wss(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ static DEFINE_RATELIMIT_STATE(rate, HZ, 3);
+
+ struct mem_cgroup *memcg = sc->target_mem_cgroup;
+ struct wss *wss = lruvec_wss(mem_cgroup_lruvec(memcg, pgdat));
+ unsigned long threshold;
+
+ threshold = READ_ONCE(wss->report_threshold);
+
+ if (sc->priority == DEF_PRIORITY)
+ return;
+
+ if (READ_ONCE(wss->bins->idle_age) == -1)
+ return;
+
+ if (!threshold || time_is_after_jiffies(wss->timestamp + threshold))
+ return;
+
+ if (!__ratelimit(&rate))
+ return;
+
+ if (!mutex_trylock(&wss->bins_lock))
+ return;
+
+ refresh_wss(wss, memcg, pgdat, sc, 0);
+ WRITE_ONCE(wss->timestamp, jiffies);
+
+ mutex_unlock(&wss->bins_lock);
+
+ if (wss->notifier)
+ kernfs_notify(wss->notifier);
+ if (memcg && cmpxchg(&memcg->wss_event, 0, 1) == 0)
+ wake_up_interruptible(&memcg->wss_wait_queue);
+}
+
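+/*
+ * Bin reaccesses by the age of the generation the pages left: the page
+ * table walk promotes reaccessed pages to max_seq, so negative batched
+ * deltas against older generations approximate reaccess counts.
+ */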
+static void collect_reaccess_locked(struct wss *wss,
+ struct lru_gen_struct *lrugen,
+ struct lru_gen_mm_walk *walk)
+{
+ int gen, type, zone;
+ unsigned long curr_timestamp = jiffies;
+ unsigned long max_seq = READ_ONCE(walk->max_seq);
+ unsigned long min_seq[ANON_AND_FILE] = {
+ READ_ONCE(lrugen->min_seq[LRU_GEN_ANON]),
+ READ_ONCE(lrugen->min_seq[LRU_GEN_FILE]),
+ };
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+ struct wss_bin *bin = wss->reaccess_bins;
+
+ lockdep_assert_held(&wss->reaccess_bins_lock);
+ /*
+ * Skip max_seq because a reaccess moves a page from another seq
+ * to max_seq. We use the negative change in page count from other
+ * seqs to track the number of reaccesses.
+ */
+ for (seq = max_seq - 1; seq + 1 > min_seq[type]; seq--) {
+ long error;
+ int next_gen;
+ unsigned long birth, gen_start;
+ long delta = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ long nr_pages = walk->nr_pages[gen][type][zone];
+
+ if (nr_pages < 0)
+ delta += -nr_pages;
+ }
+
+ birth = READ_ONCE(lrugen->timestamps[gen]);
+ next_gen = lru_gen_from_seq(seq + 1);
+ gen_start = READ_ONCE(lrugen->timestamps[next_gen]);
+
+ /* ensure gen_start is within idle_age of bin */
+ while (bin->idle_age != -1 &&
+ time_before(gen_start + bin->idle_age,
+ curr_timestamp))
+ bin++;
+
+ error = delta;
+ /* gen exceeds the idle_age of bin */
+ while (bin->idle_age != -1 &&
+ time_before(birth + bin->idle_age,
+ curr_timestamp)) {
+ unsigned long proportion =
+ gen_start -
+ (curr_timestamp - bin->idle_age);
+ unsigned long gen_len = gen_start - birth;
+
+ if (!gen_len)
+ break;
+ if (proportion) {
+ unsigned long split_bin =
+ delta / gen_len * proportion;
+ bin->nr_pages[type] += split_bin;
+ error -= split_bin;
+ }
+ gen_start = curr_timestamp - bin->idle_age;
+ bin++;
+ }
+ bin->nr_pages[type] += error;
+ }
+ }
+}
+
+static void collect_reaccess(struct wss *wss,
+ struct lru_gen_struct *lrugen,
+ struct lru_gen_mm_walk *walk)
+{
+ if (READ_ONCE(wss->reaccess_bins->idle_age) == -1)
+ return;
+
+ mutex_lock(&wss->reaccess_bins_lock);
+ collect_reaccess_locked(wss, lrugen, walk);
+ mutex_unlock(&wss->reaccess_bins_lock);
+}
+
+void report_reaccess(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ while (memcg) {
+ collect_reaccess(lruvec_wss(mem_cgroup_lruvec(
+ memcg, lruvec_pgdat(lruvec))),
+ lrugen, walk);
+ memcg = parent_mem_cgroup(memcg);
+ }
+}
+
+static struct pglist_data *kobj_to_pgdat(struct kobject *kobj)
+{
+ int nid = IS_ENABLED(CONFIG_NUMA) ? kobj_to_dev(kobj)->id :
+ first_memory_node;
+
+ return NODE_DATA(nid);
+}
+
+static struct wss *kobj_to_wss(struct kobject *kobj)
+{
+ return lruvec_wss(mem_cgroup_lruvec(NULL, kobj_to_pgdat(kobj)));
+}
+
+static ssize_t report_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct wss *wss = kobj_to_wss(kobj);
+ unsigned long threshold = READ_ONCE(wss->report_threshold);
+
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(threshold));
+}
+
+static ssize_t report_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+ struct wss *wss = kobj_to_wss(kobj);
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(wss->report_threshold, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute report_ms_attr = __ATTR_RW(report_ms);
+
+static ssize_t refresh_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct wss *wss = kobj_to_wss(kobj);
+ unsigned long threshold = READ_ONCE(wss->refresh_threshold);
+
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(threshold));
+}
+
+static ssize_t refresh_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+ struct wss *wss = kobj_to_wss(kobj);
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(wss->refresh_threshold, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute refresh_ms_attr = __ATTR_RW(refresh_ms);
+
+static ssize_t intervals_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct wss_bin *bin;
+ int len = 0;
+ struct wss *wss = kobj_to_wss(kobj);
+
+ mutex_lock(&wss->bins_lock);
+
+ for (bin = wss->bins; bin->idle_age != -1; bin++)
+ len += sysfs_emit_at(buf, len, "%u,", jiffies_to_msecs(bin->idle_age));
+
+ len += sysfs_emit_at(buf, len, "%lld\n", LLONG_MAX);
+
+ mutex_unlock(&wss->bins_lock);
+
+ return len;
+}
+
+static ssize_t intervals_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *src, size_t len)
+{
+ char *buf;
+ struct wss_bin *bins;
+ int err = 0;
+ struct wss *wss = kobj_to_wss(kobj);
+
+ bins = kzalloc(sizeof(wss->bins), GFP_KERNEL);
+ if (!bins)
+ return -ENOMEM;
+
+ buf = kstrdup(src, GFP_KERNEL);
+ if (!buf) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ err = wss_intervals_ms_parse(buf, bins);
+ if (err)
+ goto failed;
+
+ mutex_lock(&wss->bins_lock);
+ memcpy(wss->bins, bins, sizeof(wss->bins));
+ mutex_unlock(&wss->bins_lock);
+failed:
+ kfree(buf);
+ kfree(bins);
+
+ return err ?: len;
+}
+
+static struct kobj_attribute intervals_ms_attr = __ATTR_RW(intervals_ms);
+
+void wss_refresh(struct wss *wss, struct mem_cgroup *root,
+ struct pglist_data *pgdat)
+{
+ unsigned int flags;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ lockdep_assert_held(&wss->bins_lock);
+
+ if (wss->bins->idle_age != -1) {
+ unsigned long timestamp = READ_ONCE(wss->timestamp);
+ unsigned long threshold = READ_ONCE(wss->refresh_threshold);
+
+ if (time_is_before_jiffies(timestamp + threshold)) {
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ refresh_wss(wss, root, pgdat, &sc, threshold);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+ }
+ }
+}
+
+static ssize_t histogram_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct wss_bin *bin;
+ int len = 0;
+ struct wss *wss = kobj_to_wss(kobj);
+
+ mutex_lock(&wss->bins_lock);
+
+ wss_refresh(wss, NULL, kobj_to_pgdat(kobj));
+
+ for (bin = wss->bins; bin->idle_age != -1; bin++)
+ len += sysfs_emit_at(buf, len, "%u anon=%lu file=%lu\n",
+ jiffies_to_msecs(bin->idle_age), bin->nr_pages[0],
+ bin->nr_pages[1]);
+
+ len += sysfs_emit_at(buf, len, "%lld anon=%lu file=%lu\n", LLONG_MAX,
+ bin->nr_pages[0], bin->nr_pages[1]);
+
+ mutex_unlock(&wss->bins_lock);
+
+ return len;
+}
+
+static struct kobj_attribute histogram_attr = __ATTR_RO(histogram);
+
+static struct attribute *wss_attrs[] = {
+ &report_ms_attr.attr,
+ &refresh_ms_attr.attr,
+ &intervals_ms_attr.attr,
+ &histogram_attr.attr,
+ NULL
+};
+
+static const struct attribute_group wss_attr_group = {
+ .name = "wss",
+ .attrs = wss_attrs,
+};
+
+void wss_register_node(struct node *node)
+{
+ struct kobject *kobj = node ? &node->dev.kobj : mm_kobj;
+ struct wss *wss;
+
+ if (IS_ENABLED(CONFIG_NUMA) && !node)
+ return;
+
+ wss = kobj_to_wss(kobj);
+
+ /*
+ * wss itself is initialized when the pgdat or the root memcg is
+ * initialized; only the sysfs interface is created here.
+ */
+ if (sysfs_create_group(kobj, &wss_attr_group)) {
+ pr_warn("wss: failed to create sysfs group\n");
+ return;
+ }
+
+ wss->notifier = kernfs_walk_and_get(kobj->sd, "wss/histogram");
+}
+
+void wss_unregister_node(struct node *node)
+{
+ struct kobject *kobj = &node->dev.kobj;
+ struct wss *wss;
+
+ if (IS_ENABLED(CONFIG_NUMA) && !node)
+ return;
+
+ wss = kobj_to_wss(kobj);
+ kernfs_put(wss->notifier);
+ sysfs_remove_group(kobj, &wss_attr_group);
+ wss_destroy(wss);
+}
+
+/*******************************************************************************/
+
+
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
{
@@ -4569,6 +5036,8 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)

VM_WARN_ON_ONCE(!current_is_kswapd());

+ report_wss(pgdat, sc);
+
sc->last_reclaimed = sc->nr_reclaimed;

/*
@@ -5076,11 +5545,14 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
sc->nr_scanned -= folio_nr_pages(folio);
}

+ walk = current->reclaim_state->mm_walk;
+ if (walk && walk->batched)
+ report_reaccess(lruvec, walk);
+
spin_lock_irq(&lruvec->lru_lock);

move_folios_to_lru(lruvec, &list);

- walk = current->reclaim_state->mm_walk;
if (walk && walk->batched)
reset_batch_size(lruvec, walk);

@@ -5890,6 +6362,8 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");

+ wss_register_node(NULL);
+
debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);

@@ -6411,6 +6885,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (zone->zone_pgdat == last_pgdat)
continue;
last_pgdat = zone->zone_pgdat;
+
+ if (!sc->proactive)
+ report_wss(zone->zone_pgdat, sc);
shrink_node(zone->zone_pgdat, sc);
}

diff --git a/mm/wss.c b/mm/wss.c
new file mode 100644
index 000000000000..f7cbe59db079
--- /dev/null
+++ b/mm/wss.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/wss.h>
+#include <linux/mmzone.h>
+
+/*
+ * For now, embed struct wss in the lruvec. Consider allocating it only
+ * when used, since sizeof(struct wss) is ~864 bytes.
+ */
+struct wss *lruvec_wss(struct lruvec *lruvec)
+{
+ return &lruvec->__wss;
+}
+
+void wss_init(struct wss *wss)
+{
+ mutex_init(&wss->bins_lock);
+ mutex_init(&wss->reaccess_bins_lock);
+ wss->bins[0].idle_age = -1;
+ wss->notifier = NULL;
+ wss->reaccess_bins[0].idle_age = -1;
+}
+
+void wss_destroy(struct wss *wss)
+{
+ mutex_destroy(&wss->bins_lock);
+ mutex_destroy(&wss->reaccess_bins_lock);
+ memset(wss, 0, sizeof(*wss));
+}
+
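+/*
+ * Parse a comma-separated, strictly increasing list of intervals in
+ * milliseconds, e.g. "1000,2000,4000". Each interval is the upper
+ * idle-age boundary of a bin; the final catch-all bin is terminated
+ * with idle_age == -1.
+ */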
+ssize_t wss_intervals_ms_parse(char *src, struct wss_bin *bins)
+{
+ int err, i = 0;
+ char *cur, *next = strim(src);
+
+ while ((cur = strsep(&next, ","))) {
+ unsigned int msecs;
+
+ err = kstrtouint(cur, 0, &msecs);
+ if (err)
+ return err;
+
+ bins[i].idle_age = msecs_to_jiffies(msecs);
+ if (i > 0 && bins[i].idle_age <= bins[i - 1].idle_age)
+ return -EINVAL;
+
+ if (++i == MAX_NR_BINS)
+ return -ERANGE;
+ }
+
+ if (i && i < MIN_NR_BINS - 1)
+ return -ERANGE;
+
+ bins[i].idle_age = -1;
+ return 0;
+}
--
2.40.1.521.gf1e218fcd8-goog


2023-05-10 08:31:14

by Greg Kroah-Hartman

Subject: Re: [RFC PATCH 1/2] mm: multigen-LRU: working set reporting

On Wed, May 10, 2023 at 02:54:18AM +0800, Yuanchu Xie wrote:
> From: talumbau <[email protected]>

Please fix the name here.

>
> A single patch to be broken up into multiple patches.

What does this mean?

> - Add working set reporting structure.
> - Add per-node and per-memcg interfaces for working set reporting.
> - Implement working set backend for MGLRU.

Please break it up to be reviewable, otherwise no one will review it.

thanks,

greg k-h